From 1a983d29781149187fb46a437da8e26dc02ac983 Mon Sep 17 00:00:00 2001 From: Tim O'Farrell Date: Sat, 6 Dec 2025 14:04:01 -0700 Subject: [PATCH] APP-190 Add browser screenshot support for V1 conversations (#11919) Co-authored-by: openhands Co-authored-by: sp.wack <83104063+amanape@users.noreply.github.com> --- .../conversation-websocket-handler.test.tsx | 174 +++++++++++++++--- .../__tests__/get-observation-content.test.ts | 92 +++++++++ .../get-observation-content.ts | 10 +- .../src/components/v1/chat/event-message.tsx | 2 +- .../conversation-websocket-context.tsx | 19 ++ frontend/src/mocks/mock-ws-helpers.ts | 52 ++++++ frontend/src/types/v1/type-guards.ts | 18 ++ 7 files changed, 338 insertions(+), 29 deletions(-) create mode 100644 frontend/src/components/v1/chat/event-content-helpers/__tests__/get-observation-content.test.ts diff --git a/frontend/__tests__/conversation-websocket-handler.test.tsx b/frontend/__tests__/conversation-websocket-handler.test.tsx index f7d67d82b5..f922a8876c 100644 --- a/frontend/__tests__/conversation-websocket-handler.test.tsx +++ b/frontend/__tests__/conversation-websocket-handler.test.tsx @@ -1,12 +1,26 @@ -import { describe, it, expect, beforeAll, afterAll, afterEach } from "vitest"; +import { + describe, + it, + expect, + beforeAll, + beforeEach, + afterAll, + afterEach, +} from "vitest"; import { screen, waitFor, render, cleanup } from "@testing-library/react"; import { QueryClient, QueryClientProvider } from "@tanstack/react-query"; import { http, HttpResponse } from "msw"; import { useOptimisticUserMessageStore } from "#/stores/optimistic-user-message-store"; +import { useBrowserStore } from "#/stores/browser-store"; +import { useCommandStore } from "#/state/command-store"; import { createMockMessageEvent, createMockUserMessageEvent, createMockAgentErrorEvent, + createMockBrowserObservationEvent, + createMockBrowserNavigateActionEvent, + createMockExecuteBashActionEvent, + createMockExecuteBashObservationEvent, } from "#/mocks/mock-ws-helpers"; import { ConnectionStatusComponent, @@ -461,7 +475,7 @@ describe("Conversation WebSocket Handler", () => { ); // Create a test component that displays loading state - const HistoryLoadingComponent = () => { + function HistoryLoadingComponent() { const context = useConversationWebSocket(); const { events } = useEventStore(); @@ -474,7 +488,7 @@ describe("Conversation WebSocket Handler", () => {
{expectedEventCount}
); - }; + } // Render with WebSocket context renderWithWebSocketContext( @@ -484,7 +498,9 @@ describe("Conversation WebSocket Handler", () => { ); // Initially should be loading history - expect(screen.getByTestId("is-loading-history")).toHaveTextContent("true"); + expect(screen.getByTestId("is-loading-history")).toHaveTextContent( + "true", + ); // Wait for all events to be received await waitFor(() => { @@ -523,7 +539,7 @@ describe("Conversation WebSocket Handler", () => { ); // Create a test component that displays loading state - const HistoryLoadingComponent = () => { + function HistoryLoadingComponent() { const context = useConversationWebSocket(); return ( @@ -533,7 +549,7 @@ describe("Conversation WebSocket Handler", () => { ); - }; + } // Render with WebSocket context renderWithWebSocketContext( @@ -583,7 +599,7 @@ describe("Conversation WebSocket Handler", () => { ); // Create a test component that displays loading state - const HistoryLoadingComponent = () => { + function HistoryLoadingComponent() { const context = useConversationWebSocket(); const { events } = useEventStore(); @@ -595,7 +611,7 @@ describe("Conversation WebSocket Handler", () => {
{events.length}
); - }; + } // Render with WebSocket context renderWithWebSocketContext( @@ -605,7 +621,9 @@ describe("Conversation WebSocket Handler", () => { ); // Initially should be loading history - expect(screen.getByTestId("is-loading-history")).toHaveTextContent("true"); + expect(screen.getByTestId("is-loading-history")).toHaveTextContent( + "true", + ); // Wait for all events to be received await waitFor(() => { @@ -621,17 +639,133 @@ describe("Conversation WebSocket Handler", () => { }); }); - // 9. Terminal I/O Tests (ExecuteBashAction and ExecuteBashObservation) - describe("Terminal I/O Integration", () => { - it("should append command to store when ExecuteBashAction event is received", async () => { - const { createMockExecuteBashActionEvent } = await import( - "#/mocks/mock-ws-helpers" + // 9. Browser State Tests (BrowserObservation) + describe("Browser State Integration", () => { + beforeEach(() => { + useBrowserStore.getState().reset(); + }); + + it("should update browser store with screenshot when BrowserObservation event is received", async () => { + // Create a mock BrowserObservation event with screenshot data + const mockBrowserObsEvent = createMockBrowserObservationEvent( + "base64-screenshot-data", + "Page loaded successfully", ); - const { useCommandStore } = await import("#/state/command-store"); - // Clear the command store before test + // Set up MSW to send the event when connection is established + mswServer.use( + wsLink.addEventListener("connection", ({ client, server }) => { + server.connect(); + // Send the mock event after connection + client.send(JSON.stringify(mockBrowserObsEvent)); + }), + ); + + // Render with WebSocket context + renderWithWebSocketContext(); + + // Wait for connection + await waitFor(() => { + expect(screen.getByTestId("connection-state")).toHaveTextContent( + "OPEN", + ); + }); + + // Wait for the browser store to be updated with screenshot + await waitFor(() => { + const { screenshotSrc } = useBrowserStore.getState(); + expect(screenshotSrc).toBe( + "data:image/png;base64,base64-screenshot-data", + ); + }); + }); + + it("should update browser store with URL when BrowserNavigateAction followed by BrowserObservation", async () => { + // Create mock events - action first, then observation + const mockBrowserActionEvent = createMockBrowserNavigateActionEvent( + "https://example.com/test-page", + ); + const mockBrowserObsEvent = createMockBrowserObservationEvent( + "base64-screenshot-data", + "Page loaded successfully", + ); + + // Set up MSW to send both events when connection is established + mswServer.use( + wsLink.addEventListener("connection", ({ client, server }) => { + server.connect(); + // Send action first, then observation + client.send(JSON.stringify(mockBrowserActionEvent)); + client.send(JSON.stringify(mockBrowserObsEvent)); + }), + ); + + // Render with WebSocket context + renderWithWebSocketContext(); + + // Wait for connection + await waitFor(() => { + expect(screen.getByTestId("connection-state")).toHaveTextContent( + "OPEN", + ); + }); + + // Wait for the browser store to be updated with both screenshot and URL + await waitFor(() => { + const { screenshotSrc, url } = useBrowserStore.getState(); + expect(screenshotSrc).toBe( + "data:image/png;base64,base64-screenshot-data", + ); + expect(url).toBe("https://example.com/test-page"); + }); + }); + + it("should not update browser store when BrowserObservation has no screenshot data", async () => { + const initialScreenshot = useBrowserStore.getState().screenshotSrc; + + // Create a mock BrowserObservation event WITHOUT screenshot data + const mockBrowserObsEvent = createMockBrowserObservationEvent( + null, // no screenshot + "Browser action completed", + ); + + // Set up MSW to send the event when connection is established + mswServer.use( + wsLink.addEventListener("connection", ({ client, server }) => { + server.connect(); + // Send the mock event after connection + client.send(JSON.stringify(mockBrowserObsEvent)); + }), + ); + + // Render with WebSocket context + renderWithWebSocketContext(); + + // Wait for connection + await waitFor(() => { + expect(screen.getByTestId("connection-state")).toHaveTextContent( + "OPEN", + ); + }); + + // Give some time for any potential updates + await new Promise((resolve) => { + setTimeout(resolve, 100); + }); + + // Screenshot should remain unchanged (empty/initial value) + const { screenshotSrc } = useBrowserStore.getState(); + expect(screenshotSrc).toBe(initialScreenshot); + }); + }); + + // 10. Terminal I/O Tests (ExecuteBashAction and ExecuteBashObservation) + describe("Terminal I/O Integration", () => { + beforeEach(() => { useCommandStore.getState().clearTerminal(); + }); + it("should append command to store when ExecuteBashAction event is received", async () => { // Create a mock ExecuteBashAction event const mockBashActionEvent = createMockExecuteBashActionEvent("npm test"); @@ -667,14 +801,6 @@ describe("Conversation WebSocket Handler", () => { }); it("should append output to store when ExecuteBashObservation event is received", async () => { - const { createMockExecuteBashObservationEvent } = await import( - "#/mocks/mock-ws-helpers" - ); - const { useCommandStore } = await import("#/state/command-store"); - - // Clear the command store before test - useCommandStore.getState().clearTerminal(); - // Create a mock ExecuteBashObservation event const mockBashObservationEvent = createMockExecuteBashObservationEvent( "PASS tests/example.test.js\n ✓ should work (2 ms)", diff --git a/frontend/src/components/v1/chat/event-content-helpers/__tests__/get-observation-content.test.ts b/frontend/src/components/v1/chat/event-content-helpers/__tests__/get-observation-content.test.ts new file mode 100644 index 0000000000..d35dc97925 --- /dev/null +++ b/frontend/src/components/v1/chat/event-content-helpers/__tests__/get-observation-content.test.ts @@ -0,0 +1,92 @@ +import { describe, it, expect } from "vitest"; +import { getObservationContent } from "../get-observation-content"; +import { ObservationEvent } from "#/types/v1/core"; +import { BrowserObservation } from "#/types/v1/core/base/observation"; + +describe("getObservationContent - BrowserObservation", () => { + it("should return output content when available", () => { + const mockEvent: ObservationEvent = { + id: "test-id", + timestamp: "2024-01-01T00:00:00Z", + source: "environment", + tool_name: "browser_navigate", + tool_call_id: "call-id", + action_id: "action-id", + observation: { + kind: "BrowserObservation", + output: "Browser action completed", + error: null, + screenshot_data: "base64data", + }, + }; + + const result = getObservationContent(mockEvent); + + expect(result).toContain("**Output:**"); + expect(result).toContain("Browser action completed"); + }); + + it("should handle error cases properly", () => { + const mockEvent: ObservationEvent = { + id: "test-id", + timestamp: "2024-01-01T00:00:00Z", + source: "environment", + tool_name: "browser_navigate", + tool_call_id: "call-id", + action_id: "action-id", + observation: { + kind: "BrowserObservation", + output: "", + error: "Browser action failed", + screenshot_data: null, + }, + }; + + const result = getObservationContent(mockEvent); + + expect(result).toContain("**Error:**"); + expect(result).toContain("Browser action failed"); + }); + + it("should provide default message when no output or error", () => { + const mockEvent: ObservationEvent = { + id: "test-id", + timestamp: "2024-01-01T00:00:00Z", + source: "environment", + tool_name: "browser_navigate", + tool_call_id: "call-id", + action_id: "action-id", + observation: { + kind: "BrowserObservation", + output: "", + error: null, + screenshot_data: "base64data", + }, + }; + + const result = getObservationContent(mockEvent); + + expect(result).toBe("Browser action completed successfully."); + }); + + it("should return output when screenshot_data is null", () => { + const mockEvent: ObservationEvent = { + id: "test-id", + timestamp: "2024-01-01T00:00:00Z", + source: "environment", + tool_name: "browser_navigate", + tool_call_id: "call-id", + action_id: "action-id", + observation: { + kind: "BrowserObservation", + output: "Page loaded successfully", + error: null, + screenshot_data: null, + }, + }; + + const result = getObservationContent(mockEvent); + + expect(result).toBe("**Output:**\nPage loaded successfully"); + }); +}); diff --git a/frontend/src/components/v1/chat/event-content-helpers/get-observation-content.ts b/frontend/src/components/v1/chat/event-content-helpers/get-observation-content.ts index da2558af42..bf443ea71c 100644 --- a/frontend/src/components/v1/chat/event-content-helpers/get-observation-content.ts +++ b/frontend/src/components/v1/chat/event-content-helpers/get-observation-content.ts @@ -98,14 +98,16 @@ const getBrowserObservationContent = ( .filter((c) => c.type === "text") .map((c) => c.text) .join("\n") - : ""; + : observation.output || ""; let contentDetails = ""; - if ("is_error" in observation && observation.is_error) { - contentDetails += `**Error:**\n${textContent}`; - } else { + if (observation.error) { + contentDetails += `**Error:**\n${observation.error}`; + } else if (textContent) { contentDetails += `**Output:**\n${textContent}`; + } else { + contentDetails += "Browser action completed successfully."; } if (contentDetails.length > MAX_CONTENT_LENGTH) { diff --git a/frontend/src/components/v1/chat/event-message.tsx b/frontend/src/components/v1/chat/event-message.tsx index ce0380c9c8..95690d8984 100644 --- a/frontend/src/components/v1/chat/event-message.tsx +++ b/frontend/src/components/v1/chat/event-message.tsx @@ -118,7 +118,7 @@ const renderUserMessageWithSkillReady = ( ); } catch (error) { // If skill ready event creation fails, just render the user message - console.error("Failed to create skill ready event:", error); + // Failed to create skill ready event, fallback to user message return ( ({ + id: "browser-obs-123", + timestamp: new Date().toISOString(), + source: "environment", + tool_name: "browser_navigate", + tool_call_id: "browser-call-456", + observation: { + kind: "BrowserObservation", + output, + error, + screenshot_data: screenshotData, + }, + action_id: "browser-action-123", +}); + +/** + * Creates a mock BrowserNavigateAction event for testing browser URL extraction + */ +export const createMockBrowserNavigateActionEvent = ( + url: string = "https://example.com", +) => ({ + id: "browser-action-123", + timestamp: new Date().toISOString(), + source: "agent", + thought: [{ type: "text", text: "Navigating to URL" }], + thinking_blocks: [], + action: { + kind: "BrowserNavigateAction", + url, + new_tab: false, + }, + tool_name: "browser_navigate", + tool_call_id: "browser-call-456", + tool_call: { + id: "browser-call-456", + type: "function", + function: { + name: "browser_navigate", + arguments: JSON.stringify({ url, new_tab: false }), + }, + }, + llm_response_id: "llm-response-789", + security_risk: { level: "low" }, +}); diff --git a/frontend/src/types/v1/type-guards.ts b/frontend/src/types/v1/type-guards.ts index 306661e854..ee831ea489 100644 --- a/frontend/src/types/v1/type-guards.ts +++ b/frontend/src/types/v1/type-guards.ts @@ -7,6 +7,8 @@ import { ExecuteBashObservation, PlanningFileEditorObservation, TerminalObservation, + BrowserObservation, + BrowserNavigateAction, } from "./core"; import { AgentErrorEvent } from "./core/events/observation-event"; import { MessageEvent } from "./core/events/message-event"; @@ -126,6 +128,22 @@ export const isPlanningFileEditorObservationEvent = ( isObservationEvent(event) && event.observation.kind === "PlanningFileEditorObservation"; +/** + * Type guard function to check if an observation event is a BrowserObservation + */ +export const isBrowserObservationEvent = ( + event: OpenHandsEvent, +): event is ObservationEvent => + isObservationEvent(event) && event.observation.kind === "BrowserObservation"; + +/** + * Type guard function to check if an action event is a BrowserNavigateAction + */ +export const isBrowserNavigateActionEvent = ( + event: OpenHandsEvent, +): event is ActionEvent => + isActionEvent(event) && event.action.kind === "BrowserNavigateAction"; + /** * Type guard function to check if an event is a system prompt event */