enhance: support model platforms not limited to openai

This commit is contained in:
Wendong 2025-03-07 19:06:40 +08:00
parent bcb3002e59
commit 08f6dfb550
8 changed files with 155 additions and 182 deletions

View File

@ -35,7 +35,7 @@ class DocumentProcessingToolkit(BaseToolkit):
"""
def __init__(self, cache_dir: Optional[str] = None):
self.image_tool = ImageAnalysisToolkit()
self.audio_tool = AudioAnalysisToolkit()
# self.audio_tool = AudioAnalysisToolkit()
self.excel_tool = ExcelToolkit()
self.cache_dir = "tmp/"
@ -59,9 +59,9 @@ class DocumentProcessingToolkit(BaseToolkit):
res = self.image_tool.ask_question_about_image(document_path, "Please make a detailed caption about the image.")
return True, res
if any(document_path.endswith(ext) for ext in ['.mp3', '.wav']):
res = self.audio_tool.ask_question_about_audio(document_path, "Please transcribe the audio content to text.")
return True, res
# if any(document_path.endswith(ext) for ext in ['.mp3', '.wav']):
# res = self.audio_tool.ask_question_about_audio(document_path, "Please transcribe the audio content to text.")
# return True, res
if any(document_path.endswith(ext) for ext in ['xls', 'xlsx']):
res = self.excel_tool.extract_excel_content(document_path)

View File

@ -15,7 +15,7 @@ import base64
import logging
import json
from PIL import Image
from typing import List, Literal, Tuple
from typing import List, Literal, Tuple, Optional
from urllib.parse import urlparse
from camel.agents import ChatAgent
@ -23,7 +23,7 @@ from camel.configs import ChatGPTConfig
from camel.toolkits.base import BaseToolkit
from camel.toolkits import FunctionTool, CodeExecutionToolkit
from camel.types import ModelType, ModelPlatformType
from camel.models import ModelFactory, OpenAIModel
from camel.models import ModelFactory, OpenAIModel, BaseModelBackend
from camel.messages import BaseMessage
logger = logging.getLogger(__name__)
@ -35,14 +35,8 @@ class ImageAnalysisToolkit(BaseToolkit):
This class provides methods for understanding images, such as identifying
objects, text in images.
"""
def __init__(self, model: Literal['gpt-4o', 'gpt-4o-mini'] = 'gpt-4o'):
self.model_type = ModelType.GPT_4O
if model == 'gpt-4o':
self.model_type = ModelType.GPT_4O
elif model == 'gpt-4o-mini':
self.model_type = ModelType.GPT_4O_MINI
else:
raise ValueError(f"Invalid model type: {model}")
def __init__(self, model: Optional[BaseModelBackend] = None):
self.model = model
def _construct_image_url(self, image_path: str) -> str:
parsed_url = urlparse(image_path)
@ -66,78 +60,78 @@ class ImageAnalysisToolkit(BaseToolkit):
return base64.b64encode(image_file.read()).decode("utf-8")
def _judge_if_write_code(self, question: str, image_path: str) -> Tuple[bool, str]:
# def _judge_if_write_code(self, question: str, image_path: str) -> Tuple[bool, str]:
_image_url = self._construct_image_url(image_path)
# _image_url = self._construct_image_url(image_path)
prompt = f"""
Given the question <question>{question}</question>, do you think it is suitable to write python code (using libraries like cv2) to process the image to get the answer?
Your output should be in json format (```json ```) including the following fields:
- `image_caption`: str, A detailed caption about the image. If it is suitable for writing code, it should contains helpful instructions and necessary informations for how to writing code.
- `if_write_code`: bool, True if it is suitable to write code to process the image, False otherwise.
"""
# prompt = f"""
# Given the question <question>{question}</question>, do you think it is suitable to write python code (using libraries like cv2) to process the image to get the answer?
# Your output should be in json format (```json ```) including the following fields:
# - `image_caption`: str, A detailed caption about the image. If it is suitable for writing code, it should contains helpful instructions and necessary informations for how to writing code.
# - `if_write_code`: bool, True if it is suitable to write code to process the image, False otherwise.
# """
messages = [
{
"role": "system",
"content": "You are a helpful assistant for image relevant tasks, and can judge whether \
the given image is suitable for writing code to process or not. "
},
{
"role": "user",
"content": [
{'type': 'text', 'text': prompt},
{
'type': 'image_url',
'image_url': {
'url': _image_url,
},
},
],
},
]
# messages = [
# {
# "role": "system",
# "content": "You are a helpful assistant for image relevant tasks, and can judge whether \
# the given image is suitable for writing code to process or not. "
# },
# {
# "role": "user",
# "content": [
# {'type': 'text', 'text': prompt},
# {
# 'type': 'image_url',
# 'image_url': {
# 'url': _image_url,
# },
# },
# ],
# },
# ]
LLM = OpenAIModel(model_type=self.model_type)
resp = LLM.run(messages)
# LLM = OpenAIModel(model_type=self.model_type)
# resp = LLM.run(messages)
result_str = resp.choices[0].message.content.lower()
result_str = result_str.replace("```json", "").replace("```", "").strip()
# result_str = resp.choices[0].message.content.lower()
# result_str = result_str.replace("```json", "").replace("```", "").strip()
result_dict = json.loads(result_str)
# result_dict = json.loads(result_str)
if_write_code = result_dict.get("if_write_code", False)
image_caption = result_dict.get("image_caption", "")
# if_write_code = result_dict.get("if_write_code", False)
# image_caption = result_dict.get("image_caption", "")
return if_write_code, image_caption
# return if_write_code, image_caption
def _get_image_caption(self, image_path: str) -> str:
# def _get_image_caption(self, image_path: str) -> str:
_image_url = self._construct_image_url(image_path)
# _image_url = self._construct_image_url(image_path)
prompt = f"""
Please make a detailed description about the image.
"""
# prompt = f"""
# Please make a detailed description about the image.
# """
messages = [
{
"role": "user",
"content": [
{'type': 'text', 'text': prompt},
{
'type': 'image_url',
'image_url': {
'url': _image_url,
},
},
],
},
]
# messages = [
# {
# "role": "user",
# "content": [
# {'type': 'text', 'text': prompt},
# {
# 'type': 'image_url',
# 'image_url': {
# 'url': _image_url,
# },
# },
# ],
# },
# ]
LLM = OpenAIModel(model_type=self.model_type)
resp = LLM.run(messages)
# LLM = OpenAIModel(model_type=self.model_type)
# resp = LLM.run(messages)
return resp.choices[0].message.content
# return resp.choices[0].message.content
def ask_question_about_image(self, image_path: str, question: str) -> str:
@ -175,28 +169,24 @@ class ImageAnalysisToolkit(BaseToolkit):
# f"data:image/jpeg;base64,{self._encode_image(image_path)}"
# )
model = ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=self.model_type,
)
code_model = ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=ModelType.O3_MINI,
)
# code_model = ModelFactory.create(
# model_platform=ModelPlatformType.OPENAI,
# model_type=ModelType.O3_MINI,
# )
code_execution_toolkit = CodeExecutionToolkit(require_confirm=False, sandbox="subprocess", verbose=True)
# code_execution_toolkit = CodeExecutionToolkit(require_confirm=False, sandbox="subprocess", verbose=True)
image_agent = ChatAgent(
"You are a helpful assistant for image relevant tasks. Given a question related to the image, you can carefully check the image in detail and answer the question.",
model,
self.model,
)
code_agent = ChatAgent(
"You are an expert of writing code to process special images leveraging libraries like cv2.",
code_model,
tools=code_execution_toolkit.get_tools(),
)
# code_agent = ChatAgent(
# "You are an expert of writing code to process special images leveraging libraries like cv2.",
# code_model,
# tools=code_execution_toolkit.get_tools(),
# )
if not is_url:
image_object = Image.open(image_path)

View File

@ -26,6 +26,7 @@ from retry import retry
from camel.toolkits.base import BaseToolkit
from camel.toolkits import FunctionTool
from camel.messages import BaseMessage
from camel.models import BaseModelBackend
from camel.agents import ChatAgent
from camel.models import ModelFactory
from camel.types import ModelType, ModelPlatformType
@ -37,6 +38,9 @@ class SearchToolkit(BaseToolkit):
search engines like Google, DuckDuckGo, Wikipedia and Wolfram Alpha, Brave.
"""
def __init__(self, model: Optional[BaseModelBackend] = None):
self.model = model
@dependencies_required("wikipedia")
@retry(ConnectionError, delay=3)
def search_wiki(self, entity: str) -> str:
@ -698,15 +702,9 @@ class SearchToolkit(BaseToolkit):
The search result containing url and necessary information.
"""
model = ModelFactory.create(
model_type=ModelType.GPT_4O_MINI,
model_platform=ModelPlatformType.OPENAI,
model_config_dict={"temperature": 0, "top_p": 1}
)
search_agent = ChatAgent(
"You are a helpful search agent.",
model=model,
model=self.model,
tools=[FunctionTool(self.search_wiki), FunctionTool(self.search_google), FunctionTool(self.search_archived_webpage)]
)

View File

@ -14,7 +14,7 @@ from camel.toolkits.base import BaseToolkit
from camel.toolkits import FunctionTool, VideoAnalysisToolkit
from camel.messages import BaseMessage
from camel.agents import ChatAgent
from camel.models import ModelFactory
from camel.models import ModelFactory, BaseModelBackend
from camel.types import ModelType, ModelPlatformType
import io
@ -717,8 +717,9 @@ class WebToolkit(BaseToolkit):
headless=True,
cache_dir: Optional[str] = None,
page_script_path: Optional[str] = None,
model: Literal['gpt-4o', 'gpt-4o-mini'] = 'gpt-4o',
history_window: int = 5
history_window: int = 5,
web_agent_model: Optional[BaseModelBackend] = None,
planning_agent_model: Optional[BaseModelBackend] = None,
):
self.browser = BaseBrowser(
@ -728,10 +729,12 @@ class WebToolkit(BaseToolkit):
)
self.history_window = history_window
self.web_agent_model = web_agent_model
self.planning_agent_model = planning_agent_model
self.history = []
# self.search_toolkit = SearchToolkit()
self.web_agent, self.planning_agent = self._initialize_agent(model)
self.web_agent, self.planning_agent = self._initialize_agent()
def _reset(self):
@ -741,28 +744,24 @@ class WebToolkit(BaseToolkit):
os.makedirs(self.browser.cache_dir, exist_ok=True)
def _initialize_agent(self, model: Literal['gpt-4o', 'gpt-4o-mini']) -> Tuple[ChatAgent, ChatAgent]:
def _initialize_agent(self) -> Tuple[ChatAgent, ChatAgent]:
r"""Initialize the agent."""
if model == 'gpt-4o':
if self.web_agent_model is None:
web_agent_model = ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=ModelType.GPT_4O,
model_config_dict={"temperature": 0, "top_p": 1}
)
elif model == 'gpt-4o-mini':
web_agent_model = ModelFactory.create(
else:
web_agent_model = self.web_agent_model
if self.planning_agent_model is None:
planning_model = ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=ModelType.GPT_4O_MINI,
model_config_dict={"temperature": 0, "top_p": 1}
model_type=ModelType.O3_MINI,
)
else:
raise ValueError("Invalid model type.")
planning_model = ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=ModelType.O3_MINI,
)
planning_model = self.planning_agent_model
system_prompt = """
You are a helpful web agent that can assist users in browsing the web.

View File

@ -149,6 +149,7 @@ class ModelType(UnifiedModelType, Enum):
QWEN_2_5_32B = "qwen2.5-32b-instruct"
QWEN_2_5_14B = "qwen2.5-14b-instruct"
QWEN_QWQ_32B = "qwq-32b-preview"
QWEN_OMNI_TURBO = "qwen-omni-turbo"
# Yi models (01-ai)
YI_LIGHTNING = "yi-lightning"
@ -404,6 +405,7 @@ class ModelType(UnifiedModelType, Enum):
ModelType.QWEN_2_5_32B,
ModelType.QWEN_2_5_14B,
ModelType.QWEN_QWQ_32B,
ModelType.QWEN_OMNI_TURBO,
}
@property
@ -502,6 +504,7 @@ class ModelType(UnifiedModelType, Enum):
ModelType.INTERNLM2_PRO_CHAT,
ModelType.TOGETHER_MIXTRAL_8_7B,
ModelType.SGLANG_MISTRAL_7B,
ModelType.QWEN_OMNI_TURBO,
}:
return 32_768
elif self in {

View File

@ -8,7 +8,7 @@ from dotenv import load_dotenv
from retry import retry
from loguru import logger
from utils import OwlRolePlaying, process_tools, run_society
from utils import OwlRolePlaying, run_society
import os
@ -32,30 +32,21 @@ def construct_society(question: str) -> OwlRolePlaying:
model_type=ModelType.GPT_4O,
model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), # [Optional] the config for model
)
user_tools = []
assistant_tools = [
"WebToolkit",
'DocumentProcessingToolkit',
'VideoAnalysisToolkit',
'CodeExecutionToolkit',
'ImageAnalysisToolkit',
'AudioAnalysisToolkit',
"SearchToolkit",
"ExcelToolkit",
]
tools_list = [*WebToolkit(web_agent_model=assistant_model, planning_agent_model=assistant_model).get_tools(),
*DocumentProcessingToolkit().get_tools(),
*VideoAnalysisToolkit().get_tools(), # This requires OpenAI and Qwen Key
*CodeExecutionToolkit().get_tools(),
*ImageAnalysisToolkit(model=assistant_model).get_tools(),
*AudioAnalysisToolkit().get_tools(), # This requires OpenAI Key
*SearchToolkit(model=assistant_model).get_tools(),
*ExcelToolkit().get_tools()]
user_role_name = 'user'
user_agent_kwargs = {
'model': user_model,
'tools': process_tools(user_tools),
}
user_agent_kwargs = dict(model=user_model)
assistant_role_name = 'assistant'
assistant_agent_kwargs = {
'model': assistant_model,
'tools': process_tools(assistant_tools),
}
assistant_agent_kwargs = dict(model=assistant_model,
tools=tools_list)
task_kwargs = {
'task_prompt': question,

View File

@ -2,7 +2,7 @@ from camel.models import ModelFactory
from camel.toolkits import *
from camel.types import ModelPlatformType, ModelType
from camel.configs import ChatGPTConfig
from utils import GAIABenchmark, process_tools
from utils import GAIABenchmark
from dotenv import load_dotenv
from retry import retry
@ -36,28 +36,20 @@ def main():
model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), # [Optional] the config for model
)
user_tools = []
assistant_tools = [
"WebToolkit",
'DocumentProcessingToolkit',
'VideoAnalysisToolkit',
'CodeExecutionToolkit',
'ImageAnalysisToolkit',
'AudioAnalysisToolkit',
"SearchToolkit",
"ExcelToolkit",
]
tools_list = [*WebToolkit(web_agent_model=assistant_model, planning_agent_model=assistant_model).get_tools(),
*DocumentProcessingToolkit().get_tools(),
*VideoAnalysisToolkit().get_tools(), # This requires OpenAI and Qwen Key
*CodeExecutionToolkit().get_tools(),
*ImageAnalysisToolkit(model=assistant_model).get_tools(),
*AudioAnalysisToolkit().get_tools(), # This requires OpenAI Key
*SearchToolkit(model=assistant_model).get_tools(),
*ExcelToolkit().get_tools()]
user_role_name = 'user'
user_agent_kwargs = {
'model': user_model,
'tools': process_tools(user_tools),
}
user_agent_kwargs = dict(model=user_model)
assistant_role_name = 'assistant'
assistant_agent_kwargs = {
'model': assistant_model,
'tools': process_tools(assistant_tools),
}
assistant_agent_kwargs = dict(model=assistant_model,
tools=tools_list)
benchmark = GAIABenchmark(
data_dir="data/gaia",

View File

@ -47,12 +47,12 @@ class OwlRolePlaying(RolePlaying):
self.assistant_sys_msg: Optional[BaseMessage]
self.user_sys_msg: Optional[BaseMessage]
self.is_reasoning_task = self._judge_if_reasoning_task(self.task_prompt)
# self.is_reasoning_task = self._judge_if_reasoning_task(self.task_prompt)
if self.is_reasoning_task:
logger.info("The task is judged as a reasoning or coding task. The assistant agent will use the reasoning model O3-MINI.")
else:
logger.info("The assistant agent will use the default model.")
# if self.is_reasoning_task:
# logger.info("The task is judged as a reasoning or coding task. The assistant agent will use the reasoning model O3-MINI.")
# else:
# logger.info("The assistant agent will use the default model.")
self._init_agents(
init_assistant_sys_msg,
@ -60,7 +60,7 @@ class OwlRolePlaying(RolePlaying):
assistant_agent_kwargs=self.assistant_agent_kwargs,
user_agent_kwargs=self.user_agent_kwargs,
output_language=self.output_language,
is_reasoning_task=self.is_reasoning_task
# is_reasoning_task=self.is_reasoning_task
)
@ -97,12 +97,12 @@ class OwlRolePlaying(RolePlaying):
elif 'model' not in user_agent_kwargs:
user_agent_kwargs.update(dict(model=self.model))
# If the task is a reasoning task, the assistant agent should use the reasoning model O3-MINI
if is_reasoning_task:
assistant_agent_kwargs['model'] = ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=ModelType.O3_MINI,
)
# # If the task is a reasoning task, the assistant agent should use the reasoning model O3-MINI
# if is_reasoning_task:
# assistant_agent_kwargs['model'] = ModelFactory.create(
# model_platform=ModelPlatformType.OPENAI,
# model_type=ModelType.O3_MINI,
# )
self.assistant_agent = ChatAgent(
init_assistant_sys_msg,
@ -119,25 +119,25 @@ class OwlRolePlaying(RolePlaying):
self.user_sys_msg = self.user_agent.system_message
def _judge_if_reasoning_task(self, question: str) -> bool:
r"""Judge if the question is a reasoning task."""
# def _judge_if_reasoning_task(self, question: str) -> bool:
# r"""Judge if the question is a reasoning task."""
LLM = OpenAIModel(model_type=ModelType.O3_MINI)
prompt = f"""
Please judge whether the following question is a reasoning or coding task, which can be solved by reasoning without leveraging external resources, or is suitable for writing code to solve the task.
If it is a reasoning or coding task, please return only "yes".
If it is not a reasoning or coding task, please return only "no".
Note:
- If the question required some world knowledge to answer the question, please carefully judge it, because the model's own knowledge is often unreliable.
- If it is suitable for writing codes (e.g. process excel files, write simulation codes, etc.), in most cases, it can be considered as a coding task.
Question: <question>{question}</question>
"""
messages = [{"role": "user", "content": prompt}]
resp = LLM.run(messages)
if 'yes' in resp.choices[0].message.content.lower():
return True
else:
return False
# LLM = OpenAIModel(model_type=ModelType.O3_MINI)
# prompt = f"""
# Please judge whether the following question is a reasoning or coding task, which can be solved by reasoning without leveraging external resources, or is suitable for writing code to solve the task.
# If it is a reasoning or coding task, please return only "yes".
# If it is not a reasoning or coding task, please return only "no".
# Note:
# - If the question required some world knowledge to answer the question, please carefully judge it, because the model's own knowledge is often unreliable.
# - If it is suitable for writing codes (e.g. process excel files, write simulation codes, etc.), in most cases, it can be considered as a coding task.
# Question: <question>{question}</question>
# """
# messages = [{"role": "user", "content": prompt}]
# resp = LLM.run(messages)
# if 'yes' in resp.choices[0].message.content.lower():
# return True
# else:
# return False
def _construct_gaia_sys_msgs(self):