A few fixes for TAC evaluation harness (#6586)

2025-12-26 05:48:36 +08:00 · 2025-02-14 21:01:57 -08:00 · 2025-02-14 21:01:57 -08:00 · 4443417c75
commit 4443417c75
parent efbff2e655
3 changed files with 17 additions and 5 deletions
--- a/evaluation/benchmarks/the_agent_company/browsing.py
+++ b/evaluation/benchmarks/the_agent_company/browsing.py
@@ -267,7 +267,9 @@ def pre_login(
            obs: BrowserOutputObservation = runtime.run_action(browser_action)
            logger.debug(obs, extra={'msg_type': 'OBSERVATION'})
            if save_screenshots:
-                image_data = base64.b64decode(obs.screenshot)
+                image_data = base64.b64decode(
+                    obs.screenshot.replace('data:image/png;base64,', '')
+                )
                with open(os.path.join(directory, f'{image_id}.png'), 'wb') as file:
                    file.write(image_data)
                    image_id += 1
--- a/evaluation/benchmarks/the_agent_company/run_infer.py
+++ b/evaluation/benchmarks/the_agent_company/run_infer.py
@@ -36,7 +36,7 @@ def get_config(
    task_short_name: str,
    mount_path_on_host: str,
    llm_config: LLMConfig,
-    agent_config: AgentConfig,
+    agent_config: AgentConfig | None,
 ) -> AppConfig:
    config = AppConfig(
        run_as_openhands=False,
@@ -159,11 +159,21 @@ def run_solver(
        os.makedirs(screenshots_dir, exist_ok=True)
        for image_id, obs in enumerate(state.history):
            if isinstance(obs, BrowserOutputObservation):
-                image_data = base64.b64decode(obs.screenshot)
+                image_data = base64.b64decode(
+                    obs.screenshot.replace('data:image/png;base64,', '')
+                )
                with open(
                    os.path.join(screenshots_dir, f'{image_id}.png'), 'wb'
                ) as file:
                    file.write(image_data)
+                if obs.set_of_marks:
+                    som_image_data = base64.b64decode(
+                        obs.set_of_marks.replace('data:image/png;base64,', '')
+                    )
+                    with open(
+                        os.path.join(screenshots_dir, f'{image_id}_som.png'), 'wb'
+                    ) as file:
+                        file.write(som_image_data)

    if save_final_state:
        os.makedirs(state_dir, exist_ok=True)
--- a/evaluation/benchmarks/the_agent_company/scripts/run_infer.sh
+++ b/evaluation/benchmarks/the_agent_company/scripts/run_infer.sh
@@ -129,8 +129,6 @@ temp_file="tasks_${START_PERCENTILE}_${END_PERCENTILE}.md"
 sed -n "${start_line},${end_line}p" tasks.md > "$temp_file"

 while IFS= read -r task_image; do
-    docker pull $task_image
-
    # Remove prefix using ## to remove longest matching pattern from start
    task_name=${task_image##ghcr.io/theagentcompany/}

@@ -144,6 +142,8 @@ while IFS= read -r task_image; do
        continue
    fi

+    docker pull $task_image
+
    # Build the Python command
    COMMAND="poetry run python run_infer.py \
            --agent-llm-config \"$AGENT_LLM_CONFIG\" \