diff --git a/README.md b/README.md
index 523616e..898c4fc 100644
--- a/README.md
+++ b/README.md
@@ -121,6 +121,8 @@ In the `owl/.env_example` file, you will find all the necessary API keys along w
1. *Copy and Rename*: Duplicate the `.env_example` file and rename the copy to `.env`.
2. *Fill in Your Keys*: Open the `.env` file and insert your API keys in the corresponding fields.
+> **Note**: For optimal performance, we strongly recommend using OpenAI models. Our experiments show that using other models may result in significantly lower performance on complex tasks and benchmarks.
+
# 🚀 Quick Start
Run the following minimal example:
diff --git a/owl/camel/toolkits/document_processing_toolkit.py b/owl/camel/toolkits/document_processing_toolkit.py
index 9d89129..5e5e72f 100644
--- a/owl/camel/toolkits/document_processing_toolkit.py
+++ b/owl/camel/toolkits/document_processing_toolkit.py
@@ -35,7 +35,7 @@ class DocumentProcessingToolkit(BaseToolkit):
"""
def __init__(self, cache_dir: Optional[str] = None):
self.image_tool = ImageAnalysisToolkit()
- self.audio_tool = AudioAnalysisToolkit()
+ # self.audio_tool = AudioAnalysisToolkit()
self.excel_tool = ExcelToolkit()
self.cache_dir = "tmp/"
@@ -59,9 +59,9 @@ class DocumentProcessingToolkit(BaseToolkit):
res = self.image_tool.ask_question_about_image(document_path, "Please make a detailed caption about the image.")
return True, res
- if any(document_path.endswith(ext) for ext in ['.mp3', '.wav']):
- res = self.audio_tool.ask_question_about_audio(document_path, "Please transcribe the audio content to text.")
- return True, res
+ # if any(document_path.endswith(ext) for ext in ['.mp3', '.wav']):
+ # res = self.audio_tool.ask_question_about_audio(document_path, "Please transcribe the audio content to text.")
+ # return True, res
if any(document_path.endswith(ext) for ext in ['xls', 'xlsx']):
res = self.excel_tool.extract_excel_content(document_path)
diff --git a/owl/camel/toolkits/image_analysis_toolkit.py b/owl/camel/toolkits/image_analysis_toolkit.py
index 80913eb..3063508 100644
--- a/owl/camel/toolkits/image_analysis_toolkit.py
+++ b/owl/camel/toolkits/image_analysis_toolkit.py
@@ -15,7 +15,7 @@ import base64
import logging
import json
from PIL import Image
-from typing import List, Literal, Tuple
+from typing import List, Literal, Tuple, Optional
from urllib.parse import urlparse
from camel.agents import ChatAgent
@@ -23,7 +23,7 @@ from camel.configs import ChatGPTConfig
from camel.toolkits.base import BaseToolkit
from camel.toolkits import FunctionTool, CodeExecutionToolkit
from camel.types import ModelType, ModelPlatformType
-from camel.models import ModelFactory, OpenAIModel
+from camel.models import ModelFactory, OpenAIModel, BaseModelBackend
from camel.messages import BaseMessage
logger = logging.getLogger(__name__)
@@ -35,14 +35,8 @@ class ImageAnalysisToolkit(BaseToolkit):
This class provides methods for understanding images, such as identifying
objects, text in images.
"""
- def __init__(self, model: Literal['gpt-4o', 'gpt-4o-mini'] = 'gpt-4o'):
- self.model_type = ModelType.GPT_4O
- if model == 'gpt-4o':
- self.model_type = ModelType.GPT_4O
- elif model == 'gpt-4o-mini':
- self.model_type = ModelType.GPT_4O_MINI
- else:
- raise ValueError(f"Invalid model type: {model}")
+ def __init__(self, model: Optional[BaseModelBackend] = None):
+ self.model = model
def _construct_image_url(self, image_path: str) -> str:
parsed_url = urlparse(image_path)
@@ -66,78 +60,78 @@ class ImageAnalysisToolkit(BaseToolkit):
return base64.b64encode(image_file.read()).decode("utf-8")
- def _judge_if_write_code(self, question: str, image_path: str) -> Tuple[bool, str]:
+ # def _judge_if_write_code(self, question: str, image_path: str) -> Tuple[bool, str]:
- _image_url = self._construct_image_url(image_path)
+ # _image_url = self._construct_image_url(image_path)
- prompt = f"""
- Given the question {question}, do you think it is suitable to write python code (using libraries like cv2) to process the image to get the answer?
- Your output should be in json format (```json ```) including the following fields:
- - `image_caption`: str, A detailed caption about the image. If it is suitable for writing code, it should contains helpful instructions and necessary informations for how to writing code.
- - `if_write_code`: bool, True if it is suitable to write code to process the image, False otherwise.
- """
+ # prompt = f"""
+ # Given the question {question}, do you think it is suitable to write python code (using libraries like cv2) to process the image to get the answer?
+ # Your output should be in json format (```json ```) including the following fields:
+ # - `image_caption`: str, A detailed caption about the image. If it is suitable for writing code, it should contains helpful instructions and necessary informations for how to writing code.
+ # - `if_write_code`: bool, True if it is suitable to write code to process the image, False otherwise.
+ # """
- messages = [
- {
- "role": "system",
- "content": "You are a helpful assistant for image relevant tasks, and can judge whether \
- the given image is suitable for writing code to process or not. "
- },
- {
- "role": "user",
- "content": [
- {'type': 'text', 'text': prompt},
- {
- 'type': 'image_url',
- 'image_url': {
- 'url': _image_url,
- },
- },
- ],
- },
- ]
+ # messages = [
+ # {
+ # "role": "system",
+ # "content": "You are a helpful assistant for image relevant tasks, and can judge whether \
+ # the given image is suitable for writing code to process or not. "
+ # },
+ # {
+ # "role": "user",
+ # "content": [
+ # {'type': 'text', 'text': prompt},
+ # {
+ # 'type': 'image_url',
+ # 'image_url': {
+ # 'url': _image_url,
+ # },
+ # },
+ # ],
+ # },
+ # ]
- LLM = OpenAIModel(model_type=self.model_type)
- resp = LLM.run(messages)
+ # LLM = OpenAIModel(model_type=self.model_type)
+ # resp = LLM.run(messages)
- result_str = resp.choices[0].message.content.lower()
- result_str = result_str.replace("```json", "").replace("```", "").strip()
+ # result_str = resp.choices[0].message.content.lower()
+ # result_str = result_str.replace("```json", "").replace("```", "").strip()
- result_dict = json.loads(result_str)
+ # result_dict = json.loads(result_str)
- if_write_code = result_dict.get("if_write_code", False)
- image_caption = result_dict.get("image_caption", "")
+ # if_write_code = result_dict.get("if_write_code", False)
+ # image_caption = result_dict.get("image_caption", "")
- return if_write_code, image_caption
+ # return if_write_code, image_caption
- def _get_image_caption(self, image_path: str) -> str:
+ # def _get_image_caption(self, image_path: str) -> str:
- _image_url = self._construct_image_url(image_path)
+ # _image_url = self._construct_image_url(image_path)
- prompt = f"""
- Please make a detailed description about the image.
- """
+ # prompt = f"""
+ # Please make a detailed description about the image.
+ # """
- messages = [
- {
- "role": "user",
- "content": [
- {'type': 'text', 'text': prompt},
- {
- 'type': 'image_url',
- 'image_url': {
- 'url': _image_url,
- },
- },
- ],
- },
- ]
+ # messages = [
+ # {
+ # "role": "user",
+ # "content": [
+ # {'type': 'text', 'text': prompt},
+ # {
+ # 'type': 'image_url',
+ # 'image_url': {
+ # 'url': _image_url,
+ # },
+ # },
+ # ],
+ # },
+ # ]
- LLM = OpenAIModel(model_type=self.model_type)
- resp = LLM.run(messages)
+ # LLM = OpenAIModel(model_type=self.model_type)
+ # resp = LLM.run(messages)
- return resp.choices[0].message.content
+ # return resp.choices[0].message.content
def ask_question_about_image(self, image_path: str, question: str) -> str:
@@ -175,28 +169,24 @@ class ImageAnalysisToolkit(BaseToolkit):
# f"data:image/jpeg;base64,{self._encode_image(image_path)}"
# )
- model = ModelFactory.create(
- model_platform=ModelPlatformType.OPENAI,
- model_type=self.model_type,
- )
- code_model = ModelFactory.create(
- model_platform=ModelPlatformType.OPENAI,
- model_type=ModelType.O3_MINI,
- )
+ # code_model = ModelFactory.create(
+ # model_platform=ModelPlatformType.OPENAI,
+ # model_type=ModelType.O3_MINI,
+ # )
- code_execution_toolkit = CodeExecutionToolkit(require_confirm=False, sandbox="subprocess", verbose=True)
+ # code_execution_toolkit = CodeExecutionToolkit(require_confirm=False, sandbox="subprocess", verbose=True)
image_agent = ChatAgent(
"You are a helpful assistant for image relevant tasks. Given a question related to the image, you can carefully check the image in detail and answer the question.",
- model,
+ self.model,
)
- code_agent = ChatAgent(
- "You are an expert of writing code to process special images leveraging libraries like cv2.",
- code_model,
- tools=code_execution_toolkit.get_tools(),
- )
+ # code_agent = ChatAgent(
+ # "You are an expert of writing code to process special images leveraging libraries like cv2.",
+ # code_model,
+ # tools=code_execution_toolkit.get_tools(),
+ # )
if not is_url:
image_object = Image.open(image_path)
diff --git a/owl/camel/toolkits/search_toolkit.py b/owl/camel/toolkits/search_toolkit.py
index c2b0405..3df7533 100644
--- a/owl/camel/toolkits/search_toolkit.py
+++ b/owl/camel/toolkits/search_toolkit.py
@@ -26,6 +26,7 @@ from retry import retry
from camel.toolkits.base import BaseToolkit
from camel.toolkits import FunctionTool
from camel.messages import BaseMessage
+from camel.models import BaseModelBackend
from camel.agents import ChatAgent
from camel.models import ModelFactory
from camel.types import ModelType, ModelPlatformType
@@ -37,6 +38,9 @@ class SearchToolkit(BaseToolkit):
search engines like Google, DuckDuckGo, Wikipedia and Wolfram Alpha, Brave.
"""
+ def __init__(self, model: Optional[BaseModelBackend] = None):
+ self.model = model
+
@dependencies_required("wikipedia")
@retry(ConnectionError, delay=3)
def search_wiki(self, entity: str) -> str:
@@ -698,15 +702,9 @@ class SearchToolkit(BaseToolkit):
The search result containing url and necessary information.
"""
- model = ModelFactory.create(
- model_type=ModelType.GPT_4O_MINI,
- model_platform=ModelPlatformType.OPENAI,
- model_config_dict={"temperature": 0, "top_p": 1}
- )
-
search_agent = ChatAgent(
"You are a helpful search agent.",
- model=model,
+ model=self.model,
tools=[FunctionTool(self.search_wiki), FunctionTool(self.search_google), FunctionTool(self.search_archived_webpage)]
)
diff --git a/owl/camel/toolkits/web_toolkit.py b/owl/camel/toolkits/web_toolkit.py
index 6a4d13d..09f0bf2 100644
--- a/owl/camel/toolkits/web_toolkit.py
+++ b/owl/camel/toolkits/web_toolkit.py
@@ -14,7 +14,7 @@ from camel.toolkits.base import BaseToolkit
from camel.toolkits import FunctionTool, VideoAnalysisToolkit
from camel.messages import BaseMessage
from camel.agents import ChatAgent
-from camel.models import ModelFactory
+from camel.models import ModelFactory, BaseModelBackend
from camel.types import ModelType, ModelPlatformType
import io
@@ -717,8 +717,9 @@ class WebToolkit(BaseToolkit):
headless=True,
cache_dir: Optional[str] = None,
page_script_path: Optional[str] = None,
- model: Literal['gpt-4o', 'gpt-4o-mini'] = 'gpt-4o',
- history_window: int = 5
+ history_window: int = 5,
+ web_agent_model: Optional[BaseModelBackend] = None,
+ planning_agent_model: Optional[BaseModelBackend] = None,
):
self.browser = BaseBrowser(
@@ -728,10 +729,12 @@ class WebToolkit(BaseToolkit):
)
self.history_window = history_window
+ self.web_agent_model = web_agent_model
+ self.planning_agent_model = planning_agent_model
self.history = []
# self.search_toolkit = SearchToolkit()
- self.web_agent, self.planning_agent = self._initialize_agent(model)
+ self.web_agent, self.planning_agent = self._initialize_agent()
def _reset(self):
@@ -741,28 +744,24 @@ class WebToolkit(BaseToolkit):
os.makedirs(self.browser.cache_dir, exist_ok=True)
- def _initialize_agent(self, model: Literal['gpt-4o', 'gpt-4o-mini']) -> Tuple[ChatAgent, ChatAgent]:
+ def _initialize_agent(self) -> Tuple[ChatAgent, ChatAgent]:
r"""Initialize the agent."""
- if model == 'gpt-4o':
+ if self.web_agent_model is None:
web_agent_model = ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=ModelType.GPT_4O,
model_config_dict={"temperature": 0, "top_p": 1}
)
- elif model == 'gpt-4o-mini':
- web_agent_model = ModelFactory.create(
+ else:
+ web_agent_model = self.web_agent_model
+
+ if self.planning_agent_model is None:
+ planning_model = ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
- model_type=ModelType.GPT_4O_MINI,
- model_config_dict={"temperature": 0, "top_p": 1}
+ model_type=ModelType.O3_MINI,
)
else:
- raise ValueError("Invalid model type.")
-
- planning_model = ModelFactory.create(
- model_platform=ModelPlatformType.OPENAI,
- model_type=ModelType.O3_MINI,
- )
-
+ planning_model = self.planning_agent_model
system_prompt = """
You are a helpful web agent that can assist users in browsing the web.
diff --git a/owl/camel/types/enums.py b/owl/camel/types/enums.py
index 3d8e651..c1d69f8 100644
--- a/owl/camel/types/enums.py
+++ b/owl/camel/types/enums.py
@@ -149,6 +149,7 @@ class ModelType(UnifiedModelType, Enum):
QWEN_2_5_32B = "qwen2.5-32b-instruct"
QWEN_2_5_14B = "qwen2.5-14b-instruct"
QWEN_QWQ_32B = "qwq-32b-preview"
+ QWEN_OMNI_TURBO = "qwen-omni-turbo"
# Yi models (01-ai)
YI_LIGHTNING = "yi-lightning"
@@ -404,6 +405,7 @@ class ModelType(UnifiedModelType, Enum):
ModelType.QWEN_2_5_32B,
ModelType.QWEN_2_5_14B,
ModelType.QWEN_QWQ_32B,
+ ModelType.QWEN_OMNI_TURBO,
}
@property
@@ -502,6 +504,7 @@ class ModelType(UnifiedModelType, Enum):
ModelType.INTERNLM2_PRO_CHAT,
ModelType.TOGETHER_MIXTRAL_8_7B,
ModelType.SGLANG_MISTRAL_7B,
+ ModelType.QWEN_OMNI_TURBO,
}:
return 32_768
elif self in {
diff --git a/owl/run.py b/owl/run.py
index 39e74ae..4b87651 100644
--- a/owl/run.py
+++ b/owl/run.py
@@ -8,7 +8,7 @@ from dotenv import load_dotenv
from retry import retry
from loguru import logger
-from utils import OwlRolePlaying, process_tools, run_society
+from utils import OwlRolePlaying, run_society
import os
@@ -32,30 +32,27 @@ def construct_society(question: str) -> OwlRolePlaying:
model_type=ModelType.GPT_4O,
model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), # [Optional] the config for model
)
-
-
- user_tools = []
- assistant_tools = [
- "WebToolkit",
- 'DocumentProcessingToolkit',
- 'VideoAnalysisToolkit',
- 'CodeExecutionToolkit',
- 'ImageAnalysisToolkit',
- 'AudioAnalysisToolkit',
- "SearchToolkit",
- "ExcelToolkit",
- ]
+
+ tools_list = [
+ *WebToolkit(
+ headless=False,
+ web_agent_model=assistant_model,
+ planning_agent_model=assistant_model
+ ).get_tools(),
+ *DocumentProcessingToolkit().get_tools(),
+ *VideoAnalysisToolkit().get_tools(), # Requires both an OpenAI API key and a Qwen API key
+ *CodeExecutionToolkit().get_tools(),
+ *ImageAnalysisToolkit(model=assistant_model).get_tools(),
+ *AudioAnalysisToolkit().get_tools(), # Requires an OpenAI API key
+ *SearchToolkit(model=assistant_model).get_tools(),
+ *ExcelToolkit().get_tools()
+ ]
user_role_name = 'user'
- user_agent_kwargs = {
- 'model': user_model,
- 'tools': process_tools(user_tools),
- }
+ user_agent_kwargs = dict(model=user_model)
assistant_role_name = 'assistant'
- assistant_agent_kwargs = {
- 'model': assistant_model,
- 'tools': process_tools(assistant_tools),
- }
+ assistant_agent_kwargs = dict(model=assistant_model,
+ tools=tools_list)
task_kwargs = {
'task_prompt': question,
diff --git a/owl/run_gaia_roleplaying.py b/owl/run_gaia_roleplaying.py
index 1f5cd78..4c6bb90 100644
--- a/owl/run_gaia_roleplaying.py
+++ b/owl/run_gaia_roleplaying.py
@@ -2,7 +2,7 @@ from camel.models import ModelFactory
from camel.toolkits import *
from camel.types import ModelPlatformType, ModelType
from camel.configs import ChatGPTConfig
-from utils import GAIABenchmark, process_tools
+from utils import GAIABenchmark
from dotenv import load_dotenv
from retry import retry
@@ -36,28 +36,26 @@ def main():
model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), # [Optional] the config for model
)
- user_tools = []
- assistant_tools = [
- "WebToolkit",
- 'DocumentProcessingToolkit',
- 'VideoAnalysisToolkit',
- 'CodeExecutionToolkit',
- 'ImageAnalysisToolkit',
- 'AudioAnalysisToolkit',
- "SearchToolkit",
- "ExcelToolkit",
- ]
+ tools_list = [
+ *WebToolkit(
+ headless=False,
+ web_agent_model=assistant_model,
+ planning_agent_model=assistant_model
+ ).get_tools(),
+ *DocumentProcessingToolkit().get_tools(),
+ *VideoAnalysisToolkit().get_tools(), # Requires both an OpenAI API key and a Qwen API key
+ *CodeExecutionToolkit().get_tools(),
+ *ImageAnalysisToolkit(model=assistant_model).get_tools(),
+ *AudioAnalysisToolkit().get_tools(), # Requires an OpenAI API key
+ *SearchToolkit(model=assistant_model).get_tools(),
+ *ExcelToolkit().get_tools()
+ ]
user_role_name = 'user'
- user_agent_kwargs = {
- 'model': user_model,
- 'tools': process_tools(user_tools),
- }
+ user_agent_kwargs = dict(model=user_model)
assistant_role_name = 'assistant'
- assistant_agent_kwargs = {
- 'model': assistant_model,
- 'tools': process_tools(assistant_tools),
- }
+ assistant_agent_kwargs = dict(model=assistant_model,
+ tools=tools_list)
benchmark = GAIABenchmark(
data_dir="data/gaia",
@@ -85,4 +83,3 @@ def main():
if __name__ == "__main__":
main()
-
diff --git a/owl/utils/enhanced_role_playing.py b/owl/utils/enhanced_role_playing.py
index 38533dc..eac8c51 100644
--- a/owl/utils/enhanced_role_playing.py
+++ b/owl/utils/enhanced_role_playing.py
@@ -47,12 +47,12 @@ class OwlRolePlaying(RolePlaying):
self.assistant_sys_msg: Optional[BaseMessage]
self.user_sys_msg: Optional[BaseMessage]
- self.is_reasoning_task = self._judge_if_reasoning_task(self.task_prompt)
+ # self.is_reasoning_task = self._judge_if_reasoning_task(self.task_prompt)
- if self.is_reasoning_task:
- logger.info("The task is judged as a reasoning or coding task. The assistant agent will use the reasoning model O3-MINI.")
- else:
- logger.info("The assistant agent will use the default model.")
+ # if self.is_reasoning_task:
+ # logger.info("The task is judged as a reasoning or coding task. The assistant agent will use the reasoning model O3-MINI.")
+ # else:
+ # logger.info("The assistant agent will use the default model.")
self._init_agents(
init_assistant_sys_msg,
@@ -60,7 +60,7 @@ class OwlRolePlaying(RolePlaying):
assistant_agent_kwargs=self.assistant_agent_kwargs,
user_agent_kwargs=self.user_agent_kwargs,
output_language=self.output_language,
- is_reasoning_task=self.is_reasoning_task
+ # is_reasoning_task=self.is_reasoning_task
)
@@ -97,12 +97,12 @@ class OwlRolePlaying(RolePlaying):
elif 'model' not in user_agent_kwargs:
user_agent_kwargs.update(dict(model=self.model))
- # If the task is a reasoning task, the assistant agent should use the reasoning model O3-MINI
- if is_reasoning_task:
- assistant_agent_kwargs['model'] = ModelFactory.create(
- model_platform=ModelPlatformType.OPENAI,
- model_type=ModelType.O3_MINI,
- )
+ # # If the task is a reasoning task, the assistant agent should use the reasoning model O3-MINI
+ # if is_reasoning_task:
+ # assistant_agent_kwargs['model'] = ModelFactory.create(
+ # model_platform=ModelPlatformType.OPENAI,
+ # model_type=ModelType.O3_MINI,
+ # )
self.assistant_agent = ChatAgent(
init_assistant_sys_msg,
@@ -119,25 +119,25 @@ class OwlRolePlaying(RolePlaying):
self.user_sys_msg = self.user_agent.system_message
- def _judge_if_reasoning_task(self, question: str) -> bool:
- r"""Judge if the question is a reasoning task."""
+ # def _judge_if_reasoning_task(self, question: str) -> bool:
+ # r"""Judge if the question is a reasoning task."""
- LLM = OpenAIModel(model_type=ModelType.O3_MINI)
- prompt = f"""
- Please judge whether the following question is a reasoning or coding task, which can be solved by reasoning without leveraging external resources, or is suitable for writing code to solve the task.
- If it is a reasoning or coding task, please return only "yes".
- If it is not a reasoning or coding task, please return only "no".
- Note:
- - If the question required some world knowledge to answer the question, please carefully judge it, because the model's own knowledge is often unreliable.
- - If it is suitable for writing codes (e.g. process excel files, write simulation codes, etc.), in most cases, it can be considered as a coding task.
- Question: {question}
- """
- messages = [{"role": "user", "content": prompt}]
- resp = LLM.run(messages)
- if 'yes' in resp.choices[0].message.content.lower():
- return True
- else:
- return False
+ # LLM = OpenAIModel(model_type=ModelType.O3_MINI)
+ # prompt = f"""
+ # Please judge whether the following question is a reasoning or coding task, which can be solved by reasoning without leveraging external resources, or is suitable for writing code to solve the task.
+ # If it is a reasoning or coding task, please return only "yes".
+ # If it is not a reasoning or coding task, please return only "no".
+ # Note:
+ # - If the question required some world knowledge to answer the question, please carefully judge it, because the model's own knowledge is often unreliable.
+ # - If it is suitable for writing codes (e.g. process excel files, write simulation codes, etc.), in most cases, it can be considered as a coding task.
+ # Question: {question}
+ # """
+ # messages = [{"role": "user", "content": prompt}]
+ # resp = LLM.run(messages)
+ # if 'yes' in resp.choices[0].message.content.lower():
+ # return True
+ # else:
+ # return False
def _construct_gaia_sys_msgs(self):