Files
autoMate/gradio_ui/loop.py
2025-03-11 19:09:40 +08:00

80 lines
2.8 KiB
Python

"""
Agentic sampling loop that calls the Anthropic API and local implenmentation of anthropic-defined computer use tools.
"""
import base64
from collections.abc import Callable
from gradio_ui.agent.vision_agent import VisionAgent
from gradio_ui.tools.screen_capture import get_screenshot
from anthropic import APIResponse
from anthropic.types.beta import (
BetaContentBlock,
BetaMessage,
BetaMessageParam
)
from gradio_ui.agent.task_plan_agent import TaskPlanAgent
from gradio_ui.agent.task_run_agent import TaskRunAgent
from gradio_ui.tools import ToolResult
from gradio_ui.agent.llm_utils.utils import encode_image
from gradio_ui.agent.llm_utils.omniparserclient import OmniParserClient
from gradio_ui.agent.vlm_agent import VLMAgent
from gradio_ui.executor.anthropic_executor import AnthropicExecutor
from pathlib import Path
OUTPUT_DIR = "./tmp/outputs"
def sampling_loop_sync(
*,
model: str,
messages: list[BetaMessageParam],
output_callback: Callable[[BetaContentBlock], None],
tool_output_callback: Callable[[ToolResult, str], None],
api_response_callback: Callable[[APIResponse[BetaMessage]], None],
api_key: str,
only_n_most_recent_images: int | None = 0,
max_tokens: int = 4096,
omniparser_url: str,
base_url: str,
vision_agent: VisionAgent
):
"""
Synchronous agentic sampling loop for the assistant/tool interaction of computer use.
"""
print('in sampling_loop_sync, model:', model)
# omniparser_client = OmniParserClient(url=f"http://{omniparser_url}/parse/")
task_plan_agent = TaskPlanAgent()
# actor = VLMAgent(
# model=model,
# api_key=api_key,
# base_url=base_url,
# api_response_callback=api_response_callback,
# output_callback=output_callback,
# max_tokens=max_tokens,
# only_n_most_recent_images=only_n_most_recent_images
# )
executor = AnthropicExecutor(
output_callback=output_callback,
tool_output_callback=tool_output_callback,
)
tool_result_content = None
print(f"Start the message loop. User messages: {messages}")
plan = task_plan_agent(user_task = messages[-1]["content"][0].text)
task_run_agent = TaskRunAgent()
while True:
parsed_screen = parse_screen(vision_agent)
# tools_use_needed, vlm_response_json = actor(messages=messages, parsed_screen=parsed_screen)
tools_use_needed = task_run_agent(plan, parsed_screen)
for message, tool_result_content in executor(tools_use_needed, messages):
yield message
if not tool_result_content:
return messages
def parse_screen(vision_agent: VisionAgent):
_, screenshot_path = get_screenshot()
screenshot_path = str(screenshot_path)
image_base64 = encode_image(screenshot_path)
parsed_screen = vision_agent(image_base64)
return parsed_screen