diff --git a/README.md b/README.md
index 523616e..898c4fc 100644
--- a/README.md
+++ b/README.md
@@ -121,6 +121,8 @@ In the `owl/.env_example` file, you will find all the necessary API keys along w
 1. *Copy and Rename*: Duplicate the `.env_example` file and rename the copy to `.env`.
 2. *Fill in Your Keys*: Open the `.env` file and insert your API keys in the corresponding fields.
 
+> **Note**: For optimal performance, we strongly recommend using OpenAI models. Our experiments show that other models may result in significantly lower performance on complex tasks and benchmarks.
+
 # 🚀 Quick Start
 
 Run the following minimal example:
diff --git a/owl/camel/toolkits/document_processing_toolkit.py b/owl/camel/toolkits/document_processing_toolkit.py
index 9d89129..5e5e72f 100644
--- a/owl/camel/toolkits/document_processing_toolkit.py
+++ b/owl/camel/toolkits/document_processing_toolkit.py
@@ -35,7 +35,7 @@ class DocumentProcessingToolkit(BaseToolkit):
     """
     def __init__(self, cache_dir: Optional[str] = None):
         self.image_tool = ImageAnalysisToolkit()
-        self.audio_tool = AudioAnalysisToolkit()
+        # self.audio_tool = AudioAnalysisToolkit()
         self.excel_tool = ExcelToolkit()
 
         self.cache_dir = "tmp/"
@@ -59,9 +59,9 @@ class DocumentProcessingToolkit(BaseToolkit):
             res = self.image_tool.ask_question_about_image(document_path, "Please make a detailed caption about the image.")
             return True, res
 
-        if any(document_path.endswith(ext) for ext in ['.mp3', '.wav']):
-            res = self.audio_tool.ask_question_about_audio(document_path, "Please transcribe the audio content to text.")
-            return True, res
+        # if any(document_path.endswith(ext) for ext in ['.mp3', '.wav']):
+        #     res = self.audio_tool.ask_question_about_audio(document_path, "Please transcribe the audio content to text.")
+        #     return True, res
 
         if any(document_path.endswith(ext) for ext in ['xls', 'xlsx']):
             res = self.excel_tool.extract_excel_content(document_path)
diff --git a/owl/camel/toolkits/image_analysis_toolkit.py b/owl/camel/toolkits/image_analysis_toolkit.py
index 80913eb..3063508 100644
--- a/owl/camel/toolkits/image_analysis_toolkit.py
+++ b/owl/camel/toolkits/image_analysis_toolkit.py
@@ -15,7 +15,7 @@ import base64
 import logging
 import json
 from PIL import Image
-from typing import List, Literal, Tuple
+from typing import List, Literal, Tuple, Optional
 from urllib.parse import urlparse
 
 from camel.agents import ChatAgent
@@ -23,7 +23,7 @@ from camel.configs import ChatGPTConfig
 from camel.toolkits.base import BaseToolkit
 from camel.toolkits import FunctionTool, CodeExecutionToolkit
 from camel.types import ModelType, ModelPlatformType
-from camel.models import ModelFactory, OpenAIModel
+from camel.models import ModelFactory, OpenAIModel, BaseModelBackend
 from camel.messages import BaseMessage
 
 logger = logging.getLogger(__name__)
@@ -35,14 +35,8 @@ class ImageAnalysisToolkit(BaseToolkit):
     This class provides methods for understanding images, such as
     identifying objects, text in images.
     """
-    def __init__(self, model: Literal['gpt-4o', 'gpt-4o-mini'] = 'gpt-4o'):
-        self.model_type = ModelType.GPT_4O
-        if model == 'gpt-4o':
-            self.model_type = ModelType.GPT_4O
-        elif model == 'gpt-4o-mini':
-            self.model_type = ModelType.GPT_4O_MINI
-        else:
-            raise ValueError(f"Invalid model type: {model}")
+    def __init__(self, model: Optional[BaseModelBackend] = None):
+        self.model = model
 
     def _construct_image_url(self, image_path: str) -> str:
         parsed_url = urlparse(image_path)
@@ -66,78 +60,78 @@ class ImageAnalysisToolkit(BaseToolkit):
         return base64.b64encode(image_file.read()).decode("utf-8")
 
 
-    def _judge_if_write_code(self, question: str, image_path: str) -> Tuple[bool, str]:
+    # def _judge_if_write_code(self, question: str, image_path: str) -> Tuple[bool, str]:
 
-        _image_url = self._construct_image_url(image_path)
+        # _image_url = self._construct_image_url(image_path)
 
-        prompt = f"""
-        Given the question {question}, do you think it is suitable to write python code (using libraries like cv2) to process the image to get the answer?
-        Your output should be in json format (```json ```) including the following fields:
-        - `image_caption`: str, A detailed caption about the image. If it is suitable for writing code, it should contains helpful instructions and necessary informations for how to writing code.
-        - `if_write_code`: bool, True if it is suitable to write code to process the image, False otherwise.
-        """
+        # prompt = f"""
+        # Given the question {question}, do you think it is suitable to write python code (using libraries like cv2) to process the image to get the answer?
+        # Your output should be in json format (```json ```) including the following fields:
+        # - `image_caption`: str, A detailed caption about the image. If it is suitable for writing code, it should contains helpful instructions and necessary informations for how to writing code.
+        # - `if_write_code`: bool, True if it is suitable to write code to process the image, False otherwise.
+        # """
 
-        messages = [
-            {
-                "role": "system",
-                "content": "You are a helpful assistant for image relevant tasks, and can judge whether \
-                    the given image is suitable for writing code to process or not. "
-            },
-            {
-                "role": "user",
-                "content": [
-                    {'type': 'text', 'text': prompt},
-                    {
-                        'type': 'image_url',
-                        'image_url': {
-                            'url': _image_url,
-                        },
-                    },
-                ],
-            },
-        ]
+        # messages = [
+        #     {
+        #         "role": "system",
+        #         "content": "You are a helpful assistant for image relevant tasks, and can judge whether \
+        #             the given image is suitable for writing code to process or not. "
+        #     },
+        #     {
+        #         "role": "user",
+        #         "content": [
+        #             {'type': 'text', 'text': prompt},
+        #             {
+        #                 'type': 'image_url',
+        #                 'image_url': {
+        #                     'url': _image_url,
+        #                 },
+        #             },
+        #         ],
+        #     },
+        # ]
 
-        LLM = OpenAIModel(model_type=self.model_type)
-        resp = LLM.run(messages)
+        # LLM = OpenAIModel(model_type=self.model_type)
+        # resp = LLM.run(messages)
 
-        result_str = resp.choices[0].message.content.lower()
-        result_str = result_str.replace("```json", "").replace("```", "").strip()
+        # result_str = resp.choices[0].message.content.lower()
+        # result_str = result_str.replace("```json", "").replace("```", "").strip()
 
-        result_dict = json.loads(result_str)
+        # result_dict = json.loads(result_str)
 
-        if_write_code = result_dict.get("if_write_code", False)
-        image_caption = result_dict.get("image_caption", "")
+        # if_write_code = result_dict.get("if_write_code", False)
+        # image_caption = result_dict.get("image_caption", "")
 
-        return if_write_code, image_caption
+        # return if_write_code, image_caption
 
-    def _get_image_caption(self, image_path: str) -> str:
+    # def _get_image_caption(self, image_path: str) -> str:
 
-        _image_url = self._construct_image_url(image_path)
+        # _image_url = self._construct_image_url(image_path)
 
-        prompt = f"""
-        Please make a detailed description about the image.
-        """
+        # prompt = f"""
+        # Please make a detailed description about the image.
+        # """
 
-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {'type': 'text', 'text': prompt},
-                    {
-                        'type': 'image_url',
-                        'image_url': {
-                            'url': _image_url,
-                        },
-                    },
-                ],
-            },
-        ]
+        # messages = [
+        #     {
+        #         "role": "user",
+        #         "content": [
+        #             {'type': 'text', 'text': prompt},
+        #             {
+        #                 'type': 'image_url',
+        #                 'image_url': {
+        #                     'url': _image_url,
+        #                 },
+        #             },
+        #         ],
+        #     },
+        # ]
 
-        LLM = OpenAIModel(model_type=self.model_type)
-        resp = LLM.run(messages)
+        # LLM = OpenAIModel(model_type=self.model_type)
+        # resp = LLM.run(messages)
 
-        return resp.choices[0].message.content
+        # return resp.choices[0].message.content
 
 
     def ask_question_about_image(self, image_path: str, question: str) -> str:
@@ -175,28 +169,24 @@ class ImageAnalysisToolkit(BaseToolkit):
         #     f"data:image/jpeg;base64,{self._encode_image(image_path)}"
         # )
 
-        model = ModelFactory.create(
-            model_platform=ModelPlatformType.OPENAI,
-            model_type=self.model_type,
-        )
-        code_model = ModelFactory.create(
-            model_platform=ModelPlatformType.OPENAI,
-            model_type=ModelType.O3_MINI,
-        )
+        # code_model = ModelFactory.create(
+        #     model_platform=ModelPlatformType.OPENAI,
+        #     model_type=ModelType.O3_MINI,
+        # )
 
-        code_execution_toolkit = CodeExecutionToolkit(require_confirm=False, sandbox="subprocess", verbose=True)
+        # code_execution_toolkit = CodeExecutionToolkit(require_confirm=False, sandbox="subprocess", verbose=True)
 
         image_agent = ChatAgent(
             "You are a helpful assistant for image relevant tasks. Given a question related to the image, you can carefully check the image in detail and answer the question.",
-            model,
+            self.model,
         )
 
-        code_agent = ChatAgent(
-            "You are an expert of writing code to process special images leveraging libraries like cv2.",
-            code_model,
-            tools=code_execution_toolkit.get_tools(),
-        )
+        # code_agent = ChatAgent(
+        #     "You are an expert of writing code to process special images leveraging libraries like cv2.",
+        #     code_model,
+        #     tools=code_execution_toolkit.get_tools(),
+        # )
 
         if not is_url:
             image_object = Image.open(image_path)
diff --git a/owl/camel/toolkits/search_toolkit.py b/owl/camel/toolkits/search_toolkit.py
index c2b0405..3df7533 100644
--- a/owl/camel/toolkits/search_toolkit.py
+++ b/owl/camel/toolkits/search_toolkit.py
@@ -26,6 +26,7 @@ from retry import retry
 from camel.toolkits.base import BaseToolkit
 from camel.toolkits import FunctionTool
 from camel.messages import BaseMessage
+from camel.models import BaseModelBackend
 from camel.agents import ChatAgent
 from camel.models import ModelFactory
 from camel.types import ModelType, ModelPlatformType
@@ -37,6 +38,9 @@ class SearchToolkit(BaseToolkit):
     search engines like Google, DuckDuckGo, Wikipedia and Wolfram Alpha, Brave.
     """
 
+    def __init__(self, model: Optional[BaseModelBackend] = None):
+        self.model = model
+
     @dependencies_required("wikipedia")
     @retry(ConnectionError, delay=3)
     def search_wiki(self, entity: str) -> str:
@@ -698,15 +702,9 @@ class SearchToolkit(BaseToolkit):
             The search result containing url and necessary information.
         """
 
-        model = ModelFactory.create(
-            model_type=ModelType.GPT_4O_MINI,
-            model_platform=ModelPlatformType.OPENAI,
-            model_config_dict={"temperature": 0, "top_p": 1}
-        )
-
         search_agent = ChatAgent(
             "You are a helpful search agent.",
-            model=model,
+            model=self.model,
             tools=[FunctionTool(self.search_wiki), FunctionTool(self.search_google), FunctionTool(self.search_archived_webpage)]
         )
 
diff --git a/owl/camel/toolkits/web_toolkit.py b/owl/camel/toolkits/web_toolkit.py
index 6a4d13d..09f0bf2 100644
--- a/owl/camel/toolkits/web_toolkit.py
+++ b/owl/camel/toolkits/web_toolkit.py
@@ -14,7 +14,7 @@ from camel.toolkits.base import BaseToolkit
 from camel.toolkits import FunctionTool, VideoAnalysisToolkit
 from camel.messages import BaseMessage
 from camel.agents import ChatAgent
-from camel.models import ModelFactory
+from camel.models import ModelFactory, BaseModelBackend
 from camel.types import ModelType, ModelPlatformType
 
 import io
@@ -717,8 +717,9 @@ class WebToolkit(BaseToolkit):
         headless=True,
         cache_dir: Optional[str] = None,
         page_script_path: Optional[str] = None,
-        model: Literal['gpt-4o', 'gpt-4o-mini'] = 'gpt-4o',
-        history_window: int = 5
+        history_window: int = 5,
+        web_agent_model: Optional[BaseModelBackend] = None,
+        planning_agent_model: Optional[BaseModelBackend] = None,
     ):
 
         self.browser = BaseBrowser(
@@ -728,10 +729,12 @@ class WebToolkit(BaseToolkit):
         )
 
         self.history_window = history_window
+        self.web_agent_model = web_agent_model
+        self.planning_agent_model = planning_agent_model
         self.history = []
         # self.search_toolkit = SearchToolkit()
 
-        self.web_agent, self.planning_agent = self._initialize_agent(model)
+        self.web_agent, self.planning_agent = self._initialize_agent()
 
 
     def _reset(self):
@@ -741,28 +744,24 @@ class WebToolkit(BaseToolkit):
         os.makedirs(self.browser.cache_dir, exist_ok=True)
 
 
-    def _initialize_agent(self, model: Literal['gpt-4o', 'gpt-4o-mini']) -> Tuple[ChatAgent, ChatAgent]:
+    def _initialize_agent(self) -> Tuple[ChatAgent, ChatAgent]:
         r"""Initialize the agent."""
 
-        if model == 'gpt-4o':
+        if self.web_agent_model is None:
             web_agent_model = ModelFactory.create(
                 model_platform=ModelPlatformType.OPENAI,
                 model_type=ModelType.GPT_4O,
                 model_config_dict={"temperature": 0, "top_p": 1}
             )
-        elif model == 'gpt-4o-mini':
-            web_agent_model = ModelFactory.create(
+        else:
+            web_agent_model = self.web_agent_model
+
+        if self.planning_agent_model is None:
+            planning_model = ModelFactory.create(
                 model_platform=ModelPlatformType.OPENAI,
-                model_type=ModelType.GPT_4O_MINI,
-                model_config_dict={"temperature": 0, "top_p": 1}
+                model_type=ModelType.O3_MINI,
             )
         else:
-            raise ValueError("Invalid model type.")
-
-        planning_model = ModelFactory.create(
-            model_platform=ModelPlatformType.OPENAI,
-            model_type=ModelType.O3_MINI,
-        )
-
+            planning_model = self.planning_agent_model
 
         system_prompt = """
 You are a helpful web agent that can assist users in browsing the web.
diff --git a/owl/camel/types/enums.py b/owl/camel/types/enums.py
index 3d8e651..c1d69f8 100644
--- a/owl/camel/types/enums.py
+++ b/owl/camel/types/enums.py
@@ -149,6 +149,7 @@ class ModelType(UnifiedModelType, Enum):
     QWEN_2_5_32B = "qwen2.5-32b-instruct"
     QWEN_2_5_14B = "qwen2.5-14b-instruct"
     QWEN_QWQ_32B = "qwq-32b-preview"
+    QWEN_OMNI_TURBO = "qwen-omni-turbo"
 
     # Yi models (01-ai)
     YI_LIGHTNING = "yi-lightning"
@@ -404,6 +405,7 @@ class ModelType(UnifiedModelType, Enum):
             ModelType.QWEN_2_5_32B,
             ModelType.QWEN_2_5_14B,
             ModelType.QWEN_QWQ_32B,
+            ModelType.QWEN_OMNI_TURBO,
         }
 
     @property
@@ -502,6 +504,7 @@ class ModelType(UnifiedModelType, Enum):
             ModelType.INTERNLM2_PRO_CHAT,
             ModelType.TOGETHER_MIXTRAL_8_7B,
             ModelType.SGLANG_MISTRAL_7B,
+            ModelType.QWEN_OMNI_TURBO,
         }:
             return 32_768
         elif self in {
diff --git a/owl/run.py b/owl/run.py
index 39e74ae..4b87651 100644
--- a/owl/run.py
+++ b/owl/run.py
@@ -8,7 +8,7 @@ from dotenv import load_dotenv
 from retry import retry
 from loguru import logger
 
-from utils import OwlRolePlaying, process_tools, run_society
+from utils import OwlRolePlaying, run_society
 import os
 
 
@@ -32,30 +32,27 @@ def construct_society(question: str) -> OwlRolePlaying:
         model_type=ModelType.GPT_4O,
         model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), # [Optional] the config for model
     )
-
-
-    user_tools = []
-    assistant_tools = [
-        "WebToolkit",
-        'DocumentProcessingToolkit',
-        'VideoAnalysisToolkit',
-        'CodeExecutionToolkit',
-        'ImageAnalysisToolkit',
-        'AudioAnalysisToolkit',
-        "SearchToolkit",
-        "ExcelToolkit",
-    ]
+
+    tools_list = [
+        *WebToolkit(
+            headless=False,
+            web_agent_model=assistant_model,
+            planning_agent_model=assistant_model
+        ).get_tools(),
+        *DocumentProcessingToolkit().get_tools(),
+        *VideoAnalysisToolkit().get_tools(),  # This requires OpenAI and Qwen Key
+        *CodeExecutionToolkit().get_tools(),
+        *ImageAnalysisToolkit(model=assistant_model).get_tools(),
+        *AudioAnalysisToolkit().get_tools(),  # This requires OpenAI Key
+        *SearchToolkit(model=assistant_model).get_tools(),
+        *ExcelToolkit().get_tools()
+    ]
 
     user_role_name = 'user'
-    user_agent_kwargs = {
-        'model': user_model,
-        'tools': process_tools(user_tools),
-    }
+    user_agent_kwargs = dict(model=user_model)
     assistant_role_name = 'assistant'
-    assistant_agent_kwargs = {
-        'model': assistant_model,
-        'tools': process_tools(assistant_tools),
-    }
+    assistant_agent_kwargs = dict(model=assistant_model,
+                                  tools=tools_list)
 
     task_kwargs = {
         'task_prompt': question,
diff --git a/owl/run_gaia_roleplaying.py b/owl/run_gaia_roleplaying.py
index 1f5cd78..4c6bb90 100644
--- a/owl/run_gaia_roleplaying.py
+++ b/owl/run_gaia_roleplaying.py
@@ -2,7 +2,7 @@ from camel.models import ModelFactory
 from camel.toolkits import *
 from camel.types import ModelPlatformType, ModelType
 from camel.configs import ChatGPTConfig
-from utils import GAIABenchmark, process_tools
+from utils import GAIABenchmark
 from dotenv import load_dotenv
 from retry import retry
 
@@ -36,28 +36,26 @@ def main():
         model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), # [Optional] the config for model
     )
 
-    user_tools = []
-    assistant_tools = [
-        "WebToolkit",
-        'DocumentProcessingToolkit',
-        'VideoAnalysisToolkit',
-        'CodeExecutionToolkit',
-        'ImageAnalysisToolkit',
-        'AudioAnalysisToolkit',
-        "SearchToolkit",
-        "ExcelToolkit",
-    ]
+    tools_list = [
+        *WebToolkit(
+            headless=False,
+            web_agent_model=assistant_model,
+            planning_agent_model=assistant_model
+        ).get_tools(),
+        *DocumentProcessingToolkit().get_tools(),
+        *VideoAnalysisToolkit().get_tools(),  # This requires OpenAI and Qwen Key
+        *CodeExecutionToolkit().get_tools(),
+        *ImageAnalysisToolkit(model=assistant_model).get_tools(),
+        *AudioAnalysisToolkit().get_tools(),  # This requires OpenAI Key
+        *SearchToolkit(model=assistant_model).get_tools(),
+        *ExcelToolkit().get_tools()
+    ]
 
     user_role_name = 'user'
-    user_agent_kwargs = {
-        'model': user_model,
-        'tools': process_tools(user_tools),
-    }
+    user_agent_kwargs = dict(model=user_model)
     assistant_role_name = 'assistant'
-    assistant_agent_kwargs = {
-        'model': assistant_model,
-        'tools': process_tools(assistant_tools),
-    }
+    assistant_agent_kwargs = dict(model=assistant_model,
+                                  tools=tools_list)
 
     benchmark = GAIABenchmark(
         data_dir="data/gaia",
@@ -85,4 +83,3 @@
 
 if __name__ == "__main__":
     main()
-
diff --git a/owl/utils/enhanced_role_playing.py b/owl/utils/enhanced_role_playing.py
index 38533dc..eac8c51 100644
--- a/owl/utils/enhanced_role_playing.py
+++ b/owl/utils/enhanced_role_playing.py
@@ -47,12 +47,12 @@ class OwlRolePlaying(RolePlaying):
         self.assistant_sys_msg: Optional[BaseMessage]
         self.user_sys_msg: Optional[BaseMessage]
 
-        self.is_reasoning_task = self._judge_if_reasoning_task(self.task_prompt)
+        # self.is_reasoning_task = self._judge_if_reasoning_task(self.task_prompt)
 
-        if self.is_reasoning_task:
-            logger.info("The task is judged as a reasoning or coding task. The assistant agent will use the reasoning model O3-MINI.")
-        else:
-            logger.info("The assistant agent will use the default model.")
+        # if self.is_reasoning_task:
+        #     logger.info("The task is judged as a reasoning or coding task. The assistant agent will use the reasoning model O3-MINI.")
+        # else:
+        #     logger.info("The assistant agent will use the default model.")
 
         self._init_agents(
             init_assistant_sys_msg,
@@ -60,7 +60,7 @@ class OwlRolePlaying(RolePlaying):
             assistant_agent_kwargs=self.assistant_agent_kwargs,
             user_agent_kwargs=self.user_agent_kwargs,
             output_language=self.output_language,
-            is_reasoning_task=self.is_reasoning_task
+            # is_reasoning_task=self.is_reasoning_task
         )
 
 
@@ -97,12 +97,12 @@ class OwlRolePlaying(RolePlaying):
         elif 'model' not in user_agent_kwargs:
             user_agent_kwargs.update(dict(model=self.model))
 
-        # If the task is a reasoning task, the assistant agent should use the reasoning model O3-MINI
-        if is_reasoning_task:
-            assistant_agent_kwargs['model'] = ModelFactory.create(
-                model_platform=ModelPlatformType.OPENAI,
-                model_type=ModelType.O3_MINI,
-            )
+        # # If the task is a reasoning task, the assistant agent should use the reasoning model O3-MINI
+        # if is_reasoning_task:
+        #     assistant_agent_kwargs['model'] = ModelFactory.create(
+        #         model_platform=ModelPlatformType.OPENAI,
+        #         model_type=ModelType.O3_MINI,
+        #     )
 
         self.assistant_agent = ChatAgent(
             init_assistant_sys_msg,
@@ -119,25 +119,25 @@ class OwlRolePlaying(RolePlaying):
         self.user_sys_msg = self.user_agent.system_message
 
 
-    def _judge_if_reasoning_task(self, question: str) -> bool:
-        r"""Judge if the question is a reasoning task."""
+    # def _judge_if_reasoning_task(self, question: str) -> bool:
+    #     r"""Judge if the question is a reasoning task."""
 
-        LLM = OpenAIModel(model_type=ModelType.O3_MINI)
-        prompt = f"""
-        Please judge whether the following question is a reasoning or coding task, which can be solved by reasoning without leveraging external resources, or is suitable for writing code to solve the task.
-        If it is a reasoning or coding task, please return only "yes".
-        If it is not a reasoning or coding task, please return only "no".
-        Note:
-        - If the question required some world knowledge to answer the question, please carefully judge it, because the model's own knowledge is often unreliable.
-        - If it is suitable for writing codes (e.g. process excel files, write simulation codes, etc.), in most cases, it can be considered as a coding task.
-        Question: {question}
-        """
-        messages = [{"role": "user", "content": prompt}]
-        resp = LLM.run(messages)
-        if 'yes' in resp.choices[0].message.content.lower():
-            return True
-        else:
-            return False
+        # LLM = OpenAIModel(model_type=ModelType.O3_MINI)
+        # prompt = f"""
+        # Please judge whether the following question is a reasoning or coding task, which can be solved by reasoning without leveraging external resources, or is suitable for writing code to solve the task.
+        # If it is a reasoning or coding task, please return only "yes".
+        # If it is not a reasoning or coding task, please return only "no".
+        # Note:
+        # - If the question required some world knowledge to answer the question, please carefully judge it, because the model's own knowledge is often unreliable.
+        # - If it is suitable for writing codes (e.g. process excel files, write simulation codes, etc.), in most cases, it can be considered as a coding task.
+        # Question: {question}
+        # """
+        # messages = [{"role": "user", "content": prompt}]
+        # resp = LLM.run(messages)
+        # if 'yes' in resp.choices[0].message.content.lower():
+        #     return True
+        # else:
+        #     return False
 
 
     def _construct_gaia_sys_msgs(self):