Restore previous browsing agent behavior when evaluating on WebArena and miniwob++ only (#2341)

* restore eval mode * fix
2025-12-26 05:48:36 +08:00 · 2024-06-09 04:10:02 -04:00 · 2024-06-09 04:10:02 -04:00 · bd00f0f049
commit bd00f0f049
parent fab8c9003b
1 changed files with 14 additions and 0 deletions
--- a/agenthub/browsing_agent/browsing_agent.py
+++ b/agenthub/browsing_agent/browsing_agent.py
@@ -28,6 +28,11 @@ USE_CONCISE_ANSWER = (
    os.environ.get('USE_CONCISE_ANSWER', 'false') == 'true'
 )  # only return concise answer when running webarena and miniwob benchmarks

+if not USE_NAV and USE_CONCISE_ANSWER:
+    EVAL_MODE = True  # disables NAV actions and only returns a concise answer, for webarena and miniwob benchmarks
+else:
+    EVAL_MODE = False
+

 class BrowsingAgent(Agent):
    VERSION = '1.0'
@@ -118,6 +123,12 @@ class BrowsingAgent(Agent):
        last_obs = None
        last_action = None

+        if EVAL_MODE and len(state.history) == 1:
+            # For webarena and miniwob++ eval, the initial observation is already present in the browser env.
+            # Retrieve that first observation by issuing a noop action.
+            # For non-benchmark browsing, the browser env starts with a blank page, and the agent is expected to first navigate to the desired website.
+            return BrowseInteractiveAction(browser_actions='noop()')
+
        for prev_action, obs in state.history:
            if isinstance(prev_action, BrowseInteractiveAction):
                prev_actions.append(prev_action.browser_actions)
@@ -130,6 +141,9 @@ class BrowsingAgent(Agent):
                # agent has responded, task finish.
                return AgentFinishAction(outputs={'content': prev_action.content})

+        if EVAL_MODE:
+            prev_actions = prev_actions[1:]  # remove the first noop action
+
        prev_action_str = '\n'.join(prev_actions)
        # if the final BrowserInteractiveAction exec BrowserGym's send_msg_to_user,
        # we should also send a message back to the user in OpenDevin and call it a day