APP-190 Add browser screenshot support for V1 conversations (#11919)

Co-authored-by: openhands <openhands@all-hands.dev>
Co-authored-by: sp.wack <83104063+amanape@users.noreply.github.com>
This commit is contained in:
Tim O'Farrell 2025-12-06 14:04:01 -07:00 committed by GitHub
parent d7b36c9579
commit 1a983d2978
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 338 additions and 29 deletions

View File

@ -1,12 +1,26 @@
import { describe, it, expect, beforeAll, afterAll, afterEach } from "vitest";
import {
describe,
it,
expect,
beforeAll,
beforeEach,
afterAll,
afterEach,
} from "vitest";
import { screen, waitFor, render, cleanup } from "@testing-library/react";
import { QueryClient, QueryClientProvider } from "@tanstack/react-query";
import { http, HttpResponse } from "msw";
import { useOptimisticUserMessageStore } from "#/stores/optimistic-user-message-store";
import { useBrowserStore } from "#/stores/browser-store";
import { useCommandStore } from "#/state/command-store";
import {
createMockMessageEvent,
createMockUserMessageEvent,
createMockAgentErrorEvent,
createMockBrowserObservationEvent,
createMockBrowserNavigateActionEvent,
createMockExecuteBashActionEvent,
createMockExecuteBashObservationEvent,
} from "#/mocks/mock-ws-helpers";
import {
ConnectionStatusComponent,
@ -461,7 +475,7 @@ describe("Conversation WebSocket Handler", () => {
);
// Create a test component that displays loading state
const HistoryLoadingComponent = () => {
function HistoryLoadingComponent() {
const context = useConversationWebSocket();
const { events } = useEventStore();
@ -474,7 +488,7 @@ describe("Conversation WebSocket Handler", () => {
<div data-testid="expected-event-count">{expectedEventCount}</div>
</div>
);
};
}
// Render with WebSocket context
renderWithWebSocketContext(
@ -484,7 +498,9 @@ describe("Conversation WebSocket Handler", () => {
);
// Initially should be loading history
expect(screen.getByTestId("is-loading-history")).toHaveTextContent("true");
expect(screen.getByTestId("is-loading-history")).toHaveTextContent(
"true",
);
// Wait for all events to be received
await waitFor(() => {
@ -523,7 +539,7 @@ describe("Conversation WebSocket Handler", () => {
);
// Create a test component that displays loading state
const HistoryLoadingComponent = () => {
function HistoryLoadingComponent() {
const context = useConversationWebSocket();
return (
@ -533,7 +549,7 @@ describe("Conversation WebSocket Handler", () => {
</div>
</div>
);
};
}
// Render with WebSocket context
renderWithWebSocketContext(
@ -583,7 +599,7 @@ describe("Conversation WebSocket Handler", () => {
);
// Create a test component that displays loading state
const HistoryLoadingComponent = () => {
function HistoryLoadingComponent() {
const context = useConversationWebSocket();
const { events } = useEventStore();
@ -595,7 +611,7 @@ describe("Conversation WebSocket Handler", () => {
<div data-testid="events-received">{events.length}</div>
</div>
);
};
}
// Render with WebSocket context
renderWithWebSocketContext(
@ -605,7 +621,9 @@ describe("Conversation WebSocket Handler", () => {
);
// Initially should be loading history
expect(screen.getByTestId("is-loading-history")).toHaveTextContent("true");
expect(screen.getByTestId("is-loading-history")).toHaveTextContent(
"true",
);
// Wait for all events to be received
await waitFor(() => {
@ -621,17 +639,133 @@ describe("Conversation WebSocket Handler", () => {
});
});
// 9. Terminal I/O Tests (ExecuteBashAction and ExecuteBashObservation)
describe("Terminal I/O Integration", () => {
it("should append command to store when ExecuteBashAction event is received", async () => {
const { createMockExecuteBashActionEvent } = await import(
"#/mocks/mock-ws-helpers"
// 9. Browser State Tests (BrowserObservation)
describe("Browser State Integration", () => {
  // Reset the browser store before every test so a screenshot or URL written
  // by one test cannot satisfy (or break) the assertions of the next one.
  beforeEach(() => {
    useBrowserStore.getState().reset();
  });

  it("should update browser store with screenshot when BrowserObservation event is received", async () => {
    // Create a mock BrowserObservation event with screenshot data
    const mockBrowserObsEvent = createMockBrowserObservationEvent(
      "base64-screenshot-data",
      "Page loaded successfully",
    );

    // Set up MSW to send the event when connection is established
    mswServer.use(
      wsLink.addEventListener("connection", ({ client, server }) => {
        server.connect();
        // Send the mock event after connection
        client.send(JSON.stringify(mockBrowserObsEvent));
      }),
    );

    // Render with WebSocket context
    renderWithWebSocketContext(<ConnectionStatusComponent />);

    // Wait for connection
    await waitFor(() => {
      expect(screen.getByTestId("connection-state")).toHaveTextContent(
        "OPEN",
      );
    });

    // Wait for the browser store to be updated with screenshot.
    // The raw payload carries no "data:" prefix, so the expected value is the
    // payload wrapped as a PNG data URL.
    await waitFor(() => {
      const { screenshotSrc } = useBrowserStore.getState();
      expect(screenshotSrc).toBe(
        "data:image/png;base64,base64-screenshot-data",
      );
    });
  });

  it("should update browser store with URL when BrowserNavigateAction followed by BrowserObservation", async () => {
    // Create mock events - action first, then observation
    const mockBrowserActionEvent = createMockBrowserNavigateActionEvent(
      "https://example.com/test-page",
    );
    const mockBrowserObsEvent = createMockBrowserObservationEvent(
      "base64-screenshot-data",
      "Page loaded successfully",
    );

    // Set up MSW to send both events when connection is established
    mswServer.use(
      wsLink.addEventListener("connection", ({ client, server }) => {
        server.connect();
        // Send action first, then observation
        client.send(JSON.stringify(mockBrowserActionEvent));
        client.send(JSON.stringify(mockBrowserObsEvent));
      }),
    );

    // Render with WebSocket context
    renderWithWebSocketContext(<ConnectionStatusComponent />);

    // Wait for connection
    await waitFor(() => {
      expect(screen.getByTestId("connection-state")).toHaveTextContent(
        "OPEN",
      );
    });

    // Wait for the browser store to be updated with both screenshot and URL
    await waitFor(() => {
      const { screenshotSrc, url } = useBrowserStore.getState();
      expect(screenshotSrc).toBe(
        "data:image/png;base64,base64-screenshot-data",
      );
      expect(url).toBe("https://example.com/test-page");
    });
  });

  it("should not update browser store when BrowserObservation has no screenshot data", async () => {
    // Capture the value left by the beforeEach reset so the final assertion
    // does not hard-code what the store's initial screenshot looks like.
    const initialScreenshot = useBrowserStore.getState().screenshotSrc;

    // Create a mock BrowserObservation event WITHOUT screenshot data
    const mockBrowserObsEvent = createMockBrowserObservationEvent(
      null, // no screenshot
      "Browser action completed",
    );

    // Set up MSW to send the event when connection is established
    mswServer.use(
      wsLink.addEventListener("connection", ({ client, server }) => {
        server.connect();
        // Send the mock event after connection
        client.send(JSON.stringify(mockBrowserObsEvent));
      }),
    );

    // Render with WebSocket context
    renderWithWebSocketContext(<ConnectionStatusComponent />);

    // Wait for connection
    await waitFor(() => {
      expect(screen.getByTestId("connection-state")).toHaveTextContent(
        "OPEN",
      );
    });

    // Give some time for any potential updates. A negative assertion has
    // nothing to wait *for*, so a short fixed delay is used instead of waitFor.
    await new Promise((resolve) => {
      setTimeout(resolve, 100);
    });

    // Screenshot should remain unchanged (empty/initial value)
    const { screenshotSrc } = useBrowserStore.getState();
    expect(screenshotSrc).toBe(initialScreenshot);
  });
});
// 10. Terminal I/O Tests (ExecuteBashAction and ExecuteBashObservation)
describe("Terminal I/O Integration", () => {
beforeEach(() => {
useCommandStore.getState().clearTerminal();
});
it("should append command to store when ExecuteBashAction event is received", async () => {
// Create a mock ExecuteBashAction event
const mockBashActionEvent = createMockExecuteBashActionEvent("npm test");
@ -667,14 +801,6 @@ describe("Conversation WebSocket Handler", () => {
});
it("should append output to store when ExecuteBashObservation event is received", async () => {
const { createMockExecuteBashObservationEvent } = await import(
"#/mocks/mock-ws-helpers"
);
const { useCommandStore } = await import("#/state/command-store");
// Clear the command store before test
useCommandStore.getState().clearTerminal();
// Create a mock ExecuteBashObservation event
const mockBashObservationEvent = createMockExecuteBashObservationEvent(
"PASS tests/example.test.js\n ✓ should work (2 ms)",

View File

@ -0,0 +1,92 @@
import { describe, it, expect } from "vitest";
import { getObservationContent } from "../get-observation-content";
import { ObservationEvent } from "#/types/v1/core";
import { BrowserObservation } from "#/types/v1/core/base/observation";
describe("getObservationContent - BrowserObservation", () => {
  /**
   * Wraps the given observation payload in an otherwise fixed
   * `browser_navigate` observation event, so each test only spells out the
   * fields it actually varies.
   */
  const buildBrowserEvent = (
    payload: Pick<BrowserObservation, "output" | "error" | "screenshot_data">,
  ): ObservationEvent<BrowserObservation> => ({
    id: "test-id",
    timestamp: "2024-01-01T00:00:00Z",
    source: "environment",
    tool_name: "browser_navigate",
    tool_call_id: "call-id",
    action_id: "action-id",
    observation: {
      kind: "BrowserObservation",
      ...payload,
    },
  });

  it("should return output content when available", () => {
    const rendered = getObservationContent(
      buildBrowserEvent({
        output: "Browser action completed",
        error: null,
        screenshot_data: "base64data",
      }),
    );

    expect(rendered).toContain("**Output:**");
    expect(rendered).toContain("Browser action completed");
  });

  it("should handle error cases properly", () => {
    const rendered = getObservationContent(
      buildBrowserEvent({
        output: "",
        error: "Browser action failed",
        screenshot_data: null,
      }),
    );

    expect(rendered).toContain("**Error:**");
    expect(rendered).toContain("Browser action failed");
  });

  it("should provide default message when no output or error", () => {
    const rendered = getObservationContent(
      buildBrowserEvent({
        output: "",
        error: null,
        screenshot_data: "base64data",
      }),
    );

    expect(rendered).toBe("Browser action completed successfully.");
  });

  it("should return output when screenshot_data is null", () => {
    const rendered = getObservationContent(
      buildBrowserEvent({
        output: "Page loaded successfully",
        error: null,
        screenshot_data: null,
      }),
    );

    expect(rendered).toBe("**Output:**\nPage loaded successfully");
  });
});

View File

@ -98,14 +98,16 @@ const getBrowserObservationContent = (
.filter((c) => c.type === "text")
.map((c) => c.text)
.join("\n")
: "";
: observation.output || "";
let contentDetails = "";
if ("is_error" in observation && observation.is_error) {
contentDetails += `**Error:**\n${textContent}`;
} else {
if (observation.error) {
contentDetails += `**Error:**\n${observation.error}`;
} else if (textContent) {
contentDetails += `**Output:**\n${textContent}`;
} else {
contentDetails += "Browser action completed successfully.";
}
if (contentDetails.length > MAX_CONTENT_LENGTH) {

View File

@ -118,7 +118,7 @@ const renderUserMessageWithSkillReady = (
);
} catch (error) {
// If skill ready event creation fails, just render the user message
console.error("Failed to create skill ready event:", error);
// Failed to create skill ready event, fallback to user message
return (
<UserAssistantEventMessage
event={messageEvent}

View File

@ -14,6 +14,7 @@ import { useErrorMessageStore } from "#/stores/error-message-store";
import { useOptimisticUserMessageStore } from "#/stores/optimistic-user-message-store";
import { useV1ConversationStateStore } from "#/stores/v1-conversation-state-store";
import { useCommandStore } from "#/state/command-store";
import { useBrowserStore } from "#/stores/browser-store";
import {
isV1Event,
isAgentErrorEvent,
@ -27,6 +28,8 @@ import {
isExecuteBashObservationEvent,
isConversationErrorEvent,
isPlanningFileEditorObservationEvent,
isBrowserObservationEvent,
isBrowserNavigateActionEvent,
} from "#/types/v1/type-guards";
import { ConversationStateUpdateEventStats } from "#/types/v1/core/events/conversation-state-event";
import { handleActionEventCacheInvalidation } from "#/utils/cache-utils";
@ -383,6 +386,22 @@ export function ConversationWebSocketProvider({
.join("\n");
appendOutput(textContent);
}
// Handle BrowserObservation events - update browser store with screenshot
if (isBrowserObservationEvent(event)) {
const { screenshot_data: screenshotData } = event.observation;
if (screenshotData) {
const screenshotSrc = screenshotData.startsWith("data:")
? screenshotData
: `data:image/png;base64,${screenshotData}`;
useBrowserStore.getState().setScreenshotSrc(screenshotSrc);
}
}
// Handle BrowserNavigateAction events - update browser store with URL
if (isBrowserNavigateActionEvent(event)) {
useBrowserStore.getState().setUrl(event.action.url);
}
}
} catch (error) {
// eslint-disable-next-line no-console

View File

@ -184,3 +184,55 @@ export const createMockExecuteBashObservationEvent = (
},
action_id: "bash-action-123",
});
/**
 * Builds a fake BrowserObservation event for tests that exercise browser
 * state handling.
 *
 * @param screenshotData - Stored as `observation.screenshot_data`; pass `null`
 *   to model an observation that carries no screenshot.
 * @param output - Stored as `observation.output`.
 * @param error - Stored as `observation.error`.
 * @returns A plain event object; `timestamp` is the ISO-8601 string of the
 *   moment the helper is called, every other envelope field is fixed.
 */
export const createMockBrowserObservationEvent = (
  screenshotData: string | null = "base64-screenshot-data",
  output: string = "Browser action completed",
  error: string | null = null,
) => {
  const observation = {
    kind: "BrowserObservation",
    output,
    error,
    screenshot_data: screenshotData,
  };

  return {
    id: "browser-obs-123",
    timestamp: new Date().toISOString(),
    source: "environment",
    tool_name: "browser_navigate",
    tool_call_id: "browser-call-456",
    observation,
    action_id: "browser-action-123",
  };
};
/**
 * Builds a fake BrowserNavigateAction event for tests that exercise browser
 * URL extraction.
 *
 * @param url - Target URL; it is written both to `action.url` and into the
 *   JSON-encoded `tool_call.function.arguments` string.
 * @returns A plain event object; `timestamp` is the ISO-8601 string of the
 *   moment the helper is called, every other field besides the URL is fixed.
 */
export const createMockBrowserNavigateActionEvent = (
  url: string = "https://example.com",
) => {
  const toolName = "browser_navigate";
  const toolCallId = "browser-call-456";
  const newTab = false;

  const action = {
    kind: "BrowserNavigateAction",
    url,
    new_tab: newTab,
  };

  // The tool call repeats the action parameters as a serialized argument string.
  const toolCall = {
    id: toolCallId,
    type: "function",
    function: {
      name: toolName,
      arguments: JSON.stringify({ url, new_tab: newTab }),
    },
  };

  return {
    id: "browser-action-123",
    timestamp: new Date().toISOString(),
    source: "agent",
    thought: [{ type: "text", text: "Navigating to URL" }],
    thinking_blocks: [],
    action,
    tool_name: toolName,
    tool_call_id: toolCallId,
    tool_call: toolCall,
    llm_response_id: "llm-response-789",
    security_risk: { level: "low" },
  };
};

View File

@ -7,6 +7,8 @@ import {
ExecuteBashObservation,
PlanningFileEditorObservation,
TerminalObservation,
BrowserObservation,
BrowserNavigateAction,
} from "./core";
import { AgentErrorEvent } from "./core/events/observation-event";
import { MessageEvent } from "./core/events/message-event";
@ -126,6 +128,22 @@ export const isPlanningFileEditorObservationEvent = (
isObservationEvent(event) &&
event.observation.kind === "PlanningFileEditorObservation";
/**
 * Type guard: narrows an event to an observation event whose payload is a
 * BrowserObservation.
 *
 * @param event - Any OpenHands event.
 * @returns `true` when the event passes `isObservationEvent` and its
 *   `observation.kind` is exactly "BrowserObservation".
 */
export const isBrowserObservationEvent = (
  event: OpenHandsEvent,
): event is ObservationEvent<BrowserObservation> => {
  if (!isObservationEvent(event)) {
    return false;
  }
  return event.observation.kind === "BrowserObservation";
};
/**
 * Type guard: narrows an event to an action event whose payload is a
 * BrowserNavigateAction.
 *
 * @param event - Any OpenHands event.
 * @returns `true` when the event passes `isActionEvent` and its `action.kind`
 *   is exactly "BrowserNavigateAction".
 */
export const isBrowserNavigateActionEvent = (
  event: OpenHandsEvent,
): event is ActionEvent<BrowserNavigateAction> => {
  if (!isActionEvent(event)) {
    return false;
  }
  return event.action.kind === "BrowserNavigateAction";
};
/**
* Type guard function to check if an event is a system prompt event
*/