mirror of
https://github.com/OpenHands/OpenHands.git
synced 2026-03-22 13:47:19 +08:00
[fix eval] Fix issues with miniwob remote runtime evaluation (#5001)
This commit is contained in:
@@ -16,6 +16,20 @@ Access with browser the above MiniWoB URLs and see if they load correctly.
|
||||
./evaluation/miniwob/scripts/run_infer.sh llm.claude-35-sonnet-eval
|
||||
```
|
||||
|
||||
### Run Inference on `RemoteRuntime` (experimental)
|
||||
|
||||
This is in limited beta. Contact Xingyao over slack if you want to try this out!
|
||||
|
||||
```bash
|
||||
./evaluation/miniwob/scripts/run_infer.sh [model_config] [git-version] [agent] [note] [eval_limit] [num_workers]
|
||||
|
||||
# Example - This runs evaluation on BrowsingAgent for 125 instances on miniwob, with 2 workers running in parallel
|
||||
export ALLHANDS_API_KEY="YOUR-API-KEY"
|
||||
export RUNTIME=remote
|
||||
export SANDBOX_REMOTE_RUNTIME_API_URL="https://runtime.eval.all-hands.dev"
|
||||
./evaluation/miniwob/scripts/run_infer.sh llm.eval HEAD BrowsingAgent "" 125 2
|
||||
```
|
||||
|
||||
Results will be in `evaluation/evaluation_outputs/outputs/miniwob/`
|
||||
|
||||
To calculate the average reward, run:
|
||||
|
||||
@@ -23,7 +23,7 @@ if __name__ == '__main__':
|
||||
data = json.loads(line)
|
||||
actual_num += 1
|
||||
total_cost += data['metrics']['accumulated_cost']
|
||||
total_reward += data['test_result']
|
||||
total_reward += data['test_result']['reward']
|
||||
|
||||
avg_reward = total_reward / total_num
|
||||
print('Avg Reward: ', avg_reward)
|
||||
|
||||
@@ -47,6 +47,7 @@ SUPPORTED_AGENT_CLS = {'BrowsingAgent', 'CodeActAgent'}
|
||||
|
||||
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
|
||||
'CodeActAgent': codeact_user_response,
|
||||
'BrowsingAgent': 'Continue the task. IMPORTANT: do not talk to the user until you have finished the task',
|
||||
}
|
||||
|
||||
|
||||
@@ -66,6 +67,7 @@ def get_config(
|
||||
browsergym_eval_env=env_id,
|
||||
api_key=os.environ.get('ALLHANDS_API_KEY', None),
|
||||
remote_runtime_api_url=os.environ.get('SANDBOX_REMOTE_RUNTIME_API_URL'),
|
||||
remote_runtime_init_timeout=1800,
|
||||
keep_runtime_alive=False,
|
||||
timeout=120,
|
||||
remote_runtime_init_timeout=1800,
|
||||
|
||||
@@ -33,7 +33,7 @@ echo "MODEL_CONFIG: $MODEL_CONFIG"
|
||||
|
||||
EVAL_NOTE="${AGENT_VERSION}_${NOTE}"
|
||||
|
||||
COMMAND="poetry run python evaluation/miniwob/run_infer.py \
|
||||
COMMAND="export PYTHONPATH=evaluation/miniwob:\$PYTHONPATH && poetry run python evaluation/miniwob/run_infer.py \
|
||||
--agent-cls $AGENT \
|
||||
--llm-config $MODEL_CONFIG \
|
||||
--max-iterations 10 \
|
||||
|
||||
Reference in New Issue
Block a user