[fix eval] Fix issues with miniwob remote runtime evaluation (#5001)

Author: Ketan Ramaneti
Date: 2024-11-14 13:00:48 -05:00
Committed by: GitHub
Parent: 42b49e6c43
Commit: 852c90f64a
5 changed files with 18 additions and 2 deletions


@@ -16,6 +16,20 @@ Access with browser the above MiniWoB URLs and see if they load correctly.
./evaluation/miniwob/scripts/run_infer.sh llm.claude-35-sonnet-eval
```
### Run Inference on `RemoteRuntime` (experimental)
This is in limited beta. Contact Xingyao on Slack if you want to try it out!
```bash
./evaluation/miniwob/scripts/run_infer.sh [model_config] [git-version] [agent] [note] [eval_limit] [num_workers]
# Example - This runs evaluation on BrowsingAgent for 125 instances on MiniWoB, with 2 workers running in parallel
export ALLHANDS_API_KEY="YOUR-API-KEY"
export RUNTIME=remote
export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
./evaluation/miniwob/scripts/run_infer.sh llm.eval HEAD BrowsingAgent "" 125 2
```
Results will be in `evaluation/evaluation_outputs/outputs/miniwob/`.
To calculate the average reward, run:
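The hunk is truncated before the command that sentence introduces; the averaging script itself is diffed below. Since the remote-runtime path depends on several environment variables, a hypothetical preflight check (not part of this repo) can catch a missing one before a long run is kicked off:

```python
import os

# Variables the README's remote-runtime example exports (assumed required).
REQUIRED = ('ALLHANDS_API_KEY', 'RUNTIME', 'SANDBOX_REMOTE_RUNTIME_API_URL')

def check_remote_runtime_env() -> None:
    """Hypothetical helper: fail fast if the remote-runtime env is incomplete."""
    missing = [name for name in REQUIRED if not os.environ.get(name)]
    if missing:
        raise SystemExit(f'Set these before running run_infer.sh: {missing}')

if __name__ == '__main__':
    check_remote_runtime_env()
```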


@@ -23,7 +23,7 @@ if __name__ == '__main__':
data = json.loads(line)
actual_num += 1
total_cost += data['metrics']['accumulated_cost']
-total_reward += data['test_result']
+total_reward += data['test_result']['reward']
avg_reward = total_reward / total_num
print('Avg Reward: ', avg_reward)
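The fix reflects a schema change: `test_result` in `output.jsonl` is now a dict carrying the scalar reward rather than a bare number. A self-contained sketch of the aggregation, with the schema inferred from the hunk above (the real script divides by an expected `total_num` computed earlier in the file):

```python
import json

total_reward = 0.0
total_cost = 0.0
num = 0
with open('output.jsonl') as f:  # path assumed; see the README section above
    for line in f:
        data = json.loads(line)
        total_cost += data['metrics']['accumulated_cost']
        # After this commit, test_result is a dict, not a bare float.
        total_reward += data['test_result']['reward']
        num += 1

print('Avg Reward:', total_reward / num if num else 0.0)
```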


@@ -47,6 +47,7 @@ SUPPORTED_AGENT_CLS = {'BrowsingAgent', 'CodeActAgent'}
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response,
+'BrowsingAgent': 'Continue the task. IMPORTANT: do not talk to the user until you have finished the task',
}
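The mapping now holds two shapes of value: a callable for `CodeActAgent` and a canned string for `BrowsingAgent`. A minimal sketch of how a consumer might dispatch on both (the helper below is hypothetical, not code from this repo):

```python
from typing import Callable, Union

FakeUserResponse = Union[str, Callable[..., str]]

def resolve_fake_user_response(entry: FakeUserResponse, *args, **kwargs) -> str:
    # Strings are returned verbatim as the simulated user's reply;
    # callables (like codeact_user_response) are invoked to produce one.
    if callable(entry):
        return entry(*args, **kwargs)
    return entry
```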
@@ -66,6 +67,7 @@ def get_config(
browsergym_eval_env=env_id,
api_key=os.environ.get('ALLHANDS_API_KEY', None),
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
+remote_runtime_init_timeout=1800,
keep_runtime_alive=False,
timeout=120,
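The new `remote_runtime_init_timeout=1800` gives the remote sandbox up to 30 minutes to start before the run fails. A minimal sketch of how `get_config` presumably threads the README's environment variables into these fields (the dataclass is a stand-in, not OpenHands' actual sandbox config class; field names are copied from the hunk):

```python
import os
from dataclasses import dataclass
from typing import Optional

@dataclass
class SandboxSettings:  # stand-in for the real sandbox config class
    browsergym_eval_env: str
    api_key: Optional[str]
    remote_runtime_api_url: Optional[str]
    remote_runtime_init_timeout: int = 1800  # seconds; 30 min for remote startup
    keep_runtime_alive: bool = False         # tear the runtime down after each instance
    timeout: int = 120                       # per-action timeout, in seconds

def make_sandbox_settings(env_id: str) -> SandboxSettings:
    # Mirrors the env-var plumbing visible in the hunk above.
    return SandboxSettings(
        browsergym_eval_env=env_id,
        api_key=os.environ.get('ALLHANDS_API_KEY', None),
        remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
    )
```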


@@ -33,7 +33,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG"
EVAL_NOTE="${AGENT_VERSION}_${NOTE}"
COMMAND="poetry run python evaluation/miniwob/run_infer.py \
COMMAND="export PYTHONPATH=evaluation/miniwob:\$PYTHONPATH && poetry run python evaluation/miniwob/run_infer.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations 10 \
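The hunk above is truncated mid-command; the substantive change is prepending `export PYTHONPATH=evaluation/miniwob:$PYTHONPATH`. A sketch of the assumed rationale (the commit itself does not state it): running `python evaluation/miniwob/run_infer.py` puts the script's own directory on `sys.path`, but code importing the benchmark's sibling modules from elsewhere may not get that entry, so the export makes it explicit:

```python
import os
import sys

# Python-side equivalent of `export PYTHONPATH=evaluation/miniwob:$PYTHONPATH`.
pkg_dir = os.path.abspath('evaluation/miniwob')
if pkg_dir not in sys.path:
    sys.path.insert(0, pkg_dir)  # sibling modules of run_infer.py now resolve
```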