优化代码逻辑

This commit is contained in:
yuruo 2025-03-06 18:17:22 +08:00
parent 7d500a3b0e
commit dbea699336
3 changed files with 28 additions and 97 deletions

View File

@ -198,10 +198,10 @@ class VLMAgent:
You are using a Windows device.
You are able to use a mouse and keyboard to interact with the computer based on the given task and screenshot.
You can only interact with the desktop GUI (no terminal or application menu access).
You may be given some history plan and actions; this is the response from the previous loop.
You should carefully consider your plan based on the task, screenshot, and history actions.
Here is the list of all detected bounding boxes by IDs on the screen and their description:{screen_info}
Your available "Next Action" only include:

View File

@ -159,6 +159,7 @@ def chatbot_output_callback(message, chatbot_state, hide_images=False, sender="b
def process_input(user_input, state):
# Reset the stop flag
if state["stop"]:
state["stop"] = False

View File

@ -9,6 +9,7 @@ from .base import BaseAnthropicTool, ToolError, ToolResult
from .screen_capture import get_screenshot
import requests
import re
import pyautogui
OUTPUT_DIR = "./tmp/outputs"
TYPING_DELAY_MS = 12
@ -40,11 +41,6 @@ MAX_SCALING_TARGETS: dict[str, Resolution] = {
"FWXGA": Resolution(width=1366, height=768), # ~16:9
}
class ScalingSource(StrEnum):
COMPUTER = "computer"
API = "api"
class ComputerToolOptions(TypedDict):
display_height_px: int
display_width_px: int
@ -65,17 +61,13 @@ class ComputerTool(BaseAnthropicTool):
height: int
display_num: int | None
_screenshot_delay = 2.0
_scaling_enabled = True
@property
def options(self) -> ComputerToolOptions:
width, height = self.scale_coordinates(
ScalingSource.COMPUTER, self.width, self.height
)
# 直接使用原始尺寸,不进行缩放
return {
"display_width_px": width,
"display_height_px": height,
"display_width_px": self.width,
"display_height_px": self.height,
"display_number": self.display_num,
}
@ -89,7 +81,6 @@ class ComputerTool(BaseAnthropicTool):
self.display_num = None
self.offset_x = 0
self.offset_y = 0
self.is_scaling = is_scaling
self.width, self.height = self.get_screen_size()
print(f"screen size: {self.width}, {self.height}")
self.key_conversion = {"Page_Down": "pagedown",
@ -104,7 +95,7 @@ class ComputerTool(BaseAnthropicTool):
coordinate: tuple[int, int] | None = None,
**kwargs,
):
print(f"action: {action}, text: {text}, coordinate: {coordinate}, is_scaling: {self.is_scaling}")
print(f"action: {action}, text: {text}, coordinate: {coordinate},")
if action in ("mouse_move", "left_click_drag"):
if coordinate is None:
raise ToolError(f"coordinate is required for {action}")
@ -115,23 +106,15 @@ class ComputerTool(BaseAnthropicTool):
# if not all(isinstance(i, int) and i >= 0 for i in coordinate):
if not all(isinstance(i, int) for i in coordinate):
raise ToolError(f"{coordinate} must be a tuple of non-negative ints")
if self.is_scaling:
x, y = self.scale_coordinates(
ScalingSource.API, coordinate[0], coordinate[1]
)
else:
x, y = coordinate
# print(f"scaled_coordinates: {x}, {y}")
# print(f"offset: {self.offset_x}, {self.offset_y}")
# x += self.offset_x # TODO - check if this is needed
# y += self.offset_y
x, y = coordinate
print(f"mouse move to {x}, {y}")
if action == "mouse_move":
self.run_command(f"pyautogui.moveTo({x}, {y})")
pyautogui.moveTo(x, y)
return ToolResult(output=f"Moved mouse to ({x}, {y})")
elif action == "left_click_drag":
current_x, current_y = self.run_command("pyautogui.position()")
self.run_command(f"pyautogui.dragTo({x}, {y}, duration=0.5)")
current_x, current_y = pyautogui.position()
pyautogui.dragTo(x, y, duration=0.5)
return ToolResult(output=f"Dragged mouse from ({current_x}, {current_y}) to ({x}, {y})")
if action in ("key", "type"):
if text is None:
@ -146,17 +129,17 @@ class ComputerTool(BaseAnthropicTool):
for key in keys:
key = self.key_conversion.get(key.strip(), key.strip())
key = key.lower()
self.run_command(f"pyautogui.keyDown('{key}')") # Press down each key
pyautogui.keyDown(key)
for key in reversed(keys):
key = self.key_conversion.get(key.strip(), key.strip())
key = key.lower()
self.run_command(f"pyautogui.keyUp('{key}')") # Release each key in reverse order
pyautogui.keyUp(key)
return ToolResult(output=f"Pressed keys: {text}")
elif action == "type":
# default click before type TODO: check if this is needed
self.run_command("pyautogui.click()")
self.run_command(f"pyautogui.typewrite('{text}', interval={TYPING_DELAY_MS / 1000})")
self.run_command("pyautogui.press('enter')")
pyautogui.click()
pyautogui.typewrite(text, interval=TYPING_DELAY_MS / 1000)
pyautogui.press('enter')
screenshot_base64 = (await self.screenshot()).base64_image
return ToolResult(output=text, base64_image=screenshot_base64)
if action in (
@ -175,28 +158,28 @@ class ComputerTool(BaseAnthropicTool):
if action == "screenshot":
return await self.screenshot()
elif action == "cursor_position":
x, y = self.run_command("pyautogui.position()")
x, y = self.scale_coordinates(ScalingSource.COMPUTER, x, y)
x, y = pyautogui.position()
# 直接返回原始坐标,不进行缩放
return ToolResult(output=f"X={x},Y={y}")
else:
if action == "left_click":
self.run_command("pyautogui.click()")
pyautogui.click()
elif action == "right_click":
self.run_command("pyautogui.rightClick()")
pyautogui.rightClick()
elif action == "middle_click":
self.run_command("pyautogui.middleClick()")
pyautogui.middleClick()
elif action == "double_click":
self.run_command("pyautogui.doubleClick()")
pyautogui.doubleClick()
elif action == "left_press":
self.run_command("pyautogui.mouseDown()")
pyautogui.mouseDown()
time.sleep(1)
self.run_command("pyautogui.mouseUp()")
pyautogui.mouseUp()
return ToolResult(output=f"Performed {action}")
if action in ("scroll_up", "scroll_down"):
if action == "scroll_up":
self.run_command("pyautogui.scroll(100)")
pyautogui.scroll(100)
elif action == "scroll_down":
self.run_command("pyautogui.scroll(-100)")
pyautogui.scroll(-100)
return ToolResult(output=f"Performed {action}")
if action == "hover":
return ToolResult(output=f"Performed {action}")
@ -204,31 +187,6 @@ class ComputerTool(BaseAnthropicTool):
time.sleep(1)
return ToolResult(output=f"Performed {action}")
raise ToolError(f"Invalid action: {action}")
def run_command(self, action: str):
"""
Executes a python command on the server. Only return tuple of x,y when action is "pyautogui.position()"
"""
prefix = "import pyautogui; pyautogui.FAILSAFE = False;"
command_list = ["python", "-c", f"{prefix} {action}"]
parse = action == "pyautogui.position()"
if parse:
command_list[-1] = f"{prefix} print({action})"
try:
print(f"run command: {command_list}")
# 使用 tool.execute_command 替代 requests.post
response = tool.execute_command(command_list)
time.sleep(0.7) # avoid async error as actions take time to complete
print(f"action executed")
if parse:
output = response['output'].strip()
match = re.search(r'Point\(x=(\d+),\s*y=(\d+)\)', output)
if not match:
raise ToolError(f"Could not parse coordinates from output: {output}")
x, y = map(int, match.groups())
return x, y
except requests.exceptions.RequestException as e:
raise ToolError(f"An error occurred while trying to execute the command: {str(e)}")
async def screenshot(self):
if not hasattr(self, 'target_dimension'):
@ -249,35 +207,7 @@ class ComputerTool(BaseAnthropicTool):
padding_image.paste(screenshot, (0, 0))
return padding_image
def scale_coordinates(self, source: ScalingSource, x: int, y: int):
"""Scale coordinates to a target maximum resolution."""
if not self._scaling_enabled:
return x, y
ratio = self.width / self.height
target_dimension = None
for target_name, dimension in MAX_SCALING_TARGETS.items():
# allow some error in the aspect ratio - not all ratios are exactly 16:9
if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
if dimension["width"] < self.width:
target_dimension = dimension
self.target_dimension = target_dimension
# print(f"target_dimension: {target_dimension}")
break
if target_dimension is None:
# TODO: currently we force the target to be WXGA (16:10), when it cannot find a match
target_dimension = MAX_SCALING_TARGETS["WXGA"]
self.target_dimension = MAX_SCALING_TARGETS["WXGA"]
# should be less than 1
x_scaling_factor = target_dimension["width"] / self.width
y_scaling_factor = target_dimension["height"] / self.height
if source == ScalingSource.API:
if x > self.width or y > self.height:
raise ToolError(f"Coordinates {x}, {y} are out of bounds")
# scale up
return round(x / x_scaling_factor), round(y / y_scaling_factor)
# scale down
return round(x * x_scaling_factor), round(y * y_scaling_factor)
def get_screen_size(self):
"""Return width and height of the screen"""
try: