A few fixes for TAC evaluation harness (#6586)

This commit is contained in:
Boxuan Li 2025-02-14 21:01:57 -08:00 committed by GitHub
parent efbff2e655
commit 4443417c75
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 17 additions and 5 deletions

View File

@@ -267,7 +267,9 @@ def pre_login(
obs: BrowserOutputObservation = runtime.run_action(browser_action)
logger.debug(obs, extra={'msg_type': 'OBSERVATION'})
if save_screenshots:
image_data = base64.b64decode(obs.screenshot)
image_data = base64.b64decode(
obs.screenshot.replace('data:image/png;base64,', '')
)
with open(os.path.join(directory, f'{image_id}.png'), 'wb') as file:
file.write(image_data)
image_id += 1

View File

@@ -36,7 +36,7 @@ def get_config(
task_short_name: str,
mount_path_on_host: str,
llm_config: LLMConfig,
agent_config: AgentConfig,
agent_config: AgentConfig | None,
) -> AppConfig:
config = AppConfig(
run_as_openhands=False,
@@ -159,11 +159,21 @@ def run_solver(
os.makedirs(screenshots_dir, exist_ok=True)
for image_id, obs in enumerate(state.history):
if isinstance(obs, BrowserOutputObservation):
image_data = base64.b64decode(obs.screenshot)
image_data = base64.b64decode(
obs.screenshot.replace('data:image/png;base64,', '')
)
with open(
os.path.join(screenshots_dir, f'{image_id}.png'), 'wb'
) as file:
file.write(image_data)
if obs.set_of_marks:
som_image_data = base64.b64decode(
obs.set_of_marks.replace('data:image/png;base64,', '')
)
with open(
os.path.join(screenshots_dir, f'{image_id}_som.png'), 'wb'
) as file:
file.write(som_image_data)
if save_final_state:
os.makedirs(state_dir, exist_ok=True)

View File

@@ -129,8 +129,6 @@ temp_file="tasks_${START_PERCENTILE}_${END_PERCENTILE}.md"
sed -n "${start_line},${end_line}p" tasks.md > "$temp_file"
while IFS= read -r task_image; do
docker pull $task_image
# Remove prefix using ## to remove longest matching pattern from start
task_name=${task_image##ghcr.io/theagentcompany/}
@@ -144,6 +142,8 @@ while IFS= read -r task_image; do
continue
fi
docker pull $task_image
# Build the Python command
COMMAND="poetry run python run_infer.py \
--agent-llm-config \"$AGENT_LLM_CONFIG\" \