enhance: support model platforms not limited to OpenAI

2025-12-26 10:07:51 +08:00 · 2025-03-07 19:06:40 +08:00 · 2025-03-07 19:06:40 +08:00 · 08f6dfb550
commit 08f6dfb550
parent bcb3002e59
8 changed files with 155 additions and 182 deletions
--- a/owl/camel/toolkits/document_processing_toolkit.py
+++ b/owl/camel/toolkits/document_processing_toolkit.py
@ -35,7 +35,7 @@ class DocumentProcessingToolkit(BaseToolkit):
    """
    def __init__(self, cache_dir: Optional[str] = None):
        self.image_tool = ImageAnalysisToolkit()
-        self.audio_tool = AudioAnalysisToolkit()
+        # self.audio_tool = AudioAnalysisToolkit()
        self.excel_tool = ExcelToolkit()

        self.cache_dir = "tmp/"
@ -59,9 +59,9 @@ class DocumentProcessingToolkit(BaseToolkit):
            res = self.image_tool.ask_question_about_image(document_path, "Please make a detailed caption about the image.")
            return True, res
        
-        if any(document_path.endswith(ext) for ext in ['.mp3', '.wav']):
-            res = self.audio_tool.ask_question_about_audio(document_path, "Please transcribe the audio content to text.")
-            return True, res
+        # if any(document_path.endswith(ext) for ext in ['.mp3', '.wav']):
+        #     res = self.audio_tool.ask_question_about_audio(document_path, "Please transcribe the audio content to text.")
+        #     return True, res
        
        if any(document_path.endswith(ext) for ext in ['xls', 'xlsx']):
            res = self.excel_tool.extract_excel_content(document_path)
--- a/owl/camel/toolkits/image_analysis_toolkit.py
+++ b/owl/camel/toolkits/image_analysis_toolkit.py
@ -15,7 +15,7 @@ import base64
 import logging
 import json
 from PIL import Image
-from typing import List, Literal, Tuple
+from typing import List, Literal, Tuple, Optional
 from urllib.parse import urlparse

 from camel.agents import ChatAgent
@ -23,7 +23,7 @@ from camel.configs import ChatGPTConfig
 from camel.toolkits.base import BaseToolkit
 from camel.toolkits import FunctionTool, CodeExecutionToolkit
 from camel.types import ModelType, ModelPlatformType
-from camel.models import ModelFactory, OpenAIModel
+from camel.models import ModelFactory, OpenAIModel, BaseModelBackend
 from camel.messages import BaseMessage

 logger = logging.getLogger(__name__)
@ -35,14 +35,8 @@ class ImageAnalysisToolkit(BaseToolkit):
    This class provides methods for understanding images, such as identifying
    objects, text in images.
    """
-    def __init__(self, model: Literal['gpt-4o', 'gpt-4o-mini'] = 'gpt-4o'):
-        self.model_type = ModelType.GPT_4O
-        if model == 'gpt-4o':
-            self.model_type = ModelType.GPT_4O
-        elif model == 'gpt-4o-mini':
-            self.model_type = ModelType.GPT_4O_MINI
-        else:
-            raise ValueError(f"Invalid model type: {model}")
+    def __init__(self, model: Optional[BaseModelBackend] = None):
+        self.model = model

    def _construct_image_url(self, image_path: str) -> str:
        parsed_url = urlparse(image_path)
@ -66,78 +60,78 @@ class ImageAnalysisToolkit(BaseToolkit):
            return base64.b64encode(image_file.read()).decode("utf-8")

    
-    def _judge_if_write_code(self, question: str, image_path: str) -> Tuple[bool, str]:
+    # def _judge_if_write_code(self, question: str, image_path: str) -> Tuple[bool, str]:

-        _image_url = self._construct_image_url(image_path)
+    #     _image_url = self._construct_image_url(image_path)
        
-        prompt = f"""
-        Given the question <question>{question}</question>, do you think it is suitable to write python code (using libraries like cv2) to process the image to get the answer?
-        Your output should be in json format (```json ```) including the following fields:
-        - `image_caption`: str, A detailed caption about the image. If it is suitable for writing code, it should contains helpful instructions and necessary informations for how to writing code.
-        - `if_write_code`: bool, True if it is suitable to write code to process the image, False otherwise.
-        """
+    #     prompt = f"""
+    #     Given the question <question>{question}</question>, do you think it is suitable to write python code (using libraries like cv2) to process the image to get the answer?
+    #     Your output should be in json format (```json ```) including the following fields:
+    #     - `image_caption`: str, A detailed caption about the image. If it is suitable for writing code, it should contains helpful instructions and necessary informations for how to writing code.
+    #     - `if_write_code`: bool, True if it is suitable to write code to process the image, False otherwise.
+    #     """

-        messages = [
-            {
-                "role": "system",
-                "content": "You are a helpful assistant for image relevant tasks, and can judge whether \
-                the given image is suitable for writing code to process or not. "
-            },
-            {
-                "role": "user",
-                "content": [
-                    {'type': 'text', 'text': prompt},
-                    {
-                        'type': 'image_url',
-                        'image_url': {
-                            'url': _image_url,
-                        },
-                    },
-                ],
-            },
-        ]
+    #     messages = [
+    #         {
+    #             "role": "system",
+    #             "content": "You are a helpful assistant for image relevant tasks, and can judge whether \
+    #             the given image is suitable for writing code to process or not. "
+    #         },
+    #         {
+    #             "role": "user",
+    #             "content": [
+    #                 {'type': 'text', 'text': prompt},
+    #                 {
+    #                     'type': 'image_url',
+    #                     'image_url': {
+    #                         'url': _image_url,
+    #                     },
+    #                 },
+    #             ],
+    #         },
+    #     ]

-        LLM = OpenAIModel(model_type=self.model_type)
-        resp = LLM.run(messages) 
+    #     LLM = OpenAIModel(model_type=self.model_type)
+    #     resp = LLM.run(messages) 

-        result_str = resp.choices[0].message.content.lower()
-        result_str = result_str.replace("```json", "").replace("```", "").strip()
+    #     result_str = resp.choices[0].message.content.lower()
+    #     result_str = result_str.replace("```json", "").replace("```", "").strip()

-        result_dict = json.loads(result_str)
+    #     result_dict = json.loads(result_str)

-        if_write_code = result_dict.get("if_write_code", False)
-        image_caption = result_dict.get("image_caption", "")
+    #     if_write_code = result_dict.get("if_write_code", False)
+    #     image_caption = result_dict.get("image_caption", "")

-        return if_write_code, image_caption
+    #     return if_write_code, image_caption
    

-    def _get_image_caption(self, image_path: str) -> str:
+    # def _get_image_caption(self, image_path: str) -> str:

-        _image_url = self._construct_image_url(image_path)
+    #     _image_url = self._construct_image_url(image_path)
        
-        prompt = f"""
-        Please make a detailed description about the image.
-        """
+    #     prompt = f"""
+    #     Please make a detailed description about the image.
+    #     """

-        messages = [
-            {
-                "role": "user",
-                "content": [
-                    {'type': 'text', 'text': prompt},
-                    {
-                        'type': 'image_url',
-                        'image_url': {
-                            'url': _image_url,
-                        },
-                    },
-                ],
-            },
-        ]
+    #     messages = [
+    #         {
+    #             "role": "user",
+    #             "content": [
+    #                 {'type': 'text', 'text': prompt},
+    #                 {
+    #                     'type': 'image_url',
+    #                     'image_url': {
+    #                         'url': _image_url,
+    #                     },
+    #                 },
+    #             ],
+    #         },
+    #     ]

-        LLM = OpenAIModel(model_type=self.model_type)
-        resp = LLM.run(messages) 
+    #     LLM = OpenAIModel(model_type=self.model_type)
+    #     resp = LLM.run(messages) 

-        return resp.choices[0].message.content
+    #     return resp.choices[0].message.content


    def ask_question_about_image(self, image_path: str, question: str) -> str:
@ -175,28 +169,24 @@ class ImageAnalysisToolkit(BaseToolkit):
        #         f"data:image/jpeg;base64,{self._encode_image(image_path)}"
        #     )

-        model = ModelFactory.create(
-            model_platform=ModelPlatformType.OPENAI,
-            model_type=self.model_type,
-        )

-        code_model = ModelFactory.create(
-            model_platform=ModelPlatformType.OPENAI,
-            model_type=ModelType.O3_MINI,
-        )
+        # code_model = ModelFactory.create(
+        #     model_platform=ModelPlatformType.OPENAI,
+        #     model_type=ModelType.O3_MINI,
+        # )

-        code_execution_toolkit = CodeExecutionToolkit(require_confirm=False, sandbox="subprocess", verbose=True)
+        # code_execution_toolkit = CodeExecutionToolkit(require_confirm=False, sandbox="subprocess", verbose=True)

        image_agent = ChatAgent(
            "You are a helpful assistant for image relevant tasks. Given a question related to the image, you can carefully check the image in detail and answer the question.",
-            model,
+            self.model,
        )

-        code_agent = ChatAgent(
-            "You are an expert of writing code to process special images leveraging libraries like cv2.",
-            code_model,
-            tools=code_execution_toolkit.get_tools(),
-        )
+        # code_agent = ChatAgent(
+        #     "You are an expert of writing code to process special images leveraging libraries like cv2.",
+        #     code_model,
+        #     tools=code_execution_toolkit.get_tools(),
+        # )

        if not is_url:
            image_object = Image.open(image_path)
--- a/owl/camel/toolkits/search_toolkit.py
+++ b/owl/camel/toolkits/search_toolkit.py
@ -26,6 +26,7 @@ from retry import retry
 from camel.toolkits.base import BaseToolkit
 from camel.toolkits import FunctionTool
 from camel.messages import BaseMessage
+from camel.models import BaseModelBackend
 from camel.agents import ChatAgent
 from camel.models import ModelFactory
 from camel.types import ModelType, ModelPlatformType
@ -37,6 +38,9 @@ class SearchToolkit(BaseToolkit):
    search engines like Google, DuckDuckGo, Wikipedia and Wolfram Alpha, Brave.
    """

+    def __init__(self, model: Optional[BaseModelBackend] = None):
+        self.model = model
+
    @dependencies_required("wikipedia")
    @retry(ConnectionError, delay=3)
    def search_wiki(self, entity: str) -> str:
@ -698,15 +702,9 @@ class SearchToolkit(BaseToolkit):
            The search result containing url and necessary information.
        """

-        model = ModelFactory.create(
-            model_type=ModelType.GPT_4O_MINI,
-            model_platform=ModelPlatformType.OPENAI,
-            model_config_dict={"temperature": 0, "top_p": 1}
-        )
-
        search_agent = ChatAgent(
            "You are a helpful search agent.",
-            model=model,
+            model=self.model,
            tools=[FunctionTool(self.search_wiki), FunctionTool(self.search_google), FunctionTool(self.search_archived_webpage)]
        )

--- a/owl/camel/toolkits/web_toolkit.py
+++ b/owl/camel/toolkits/web_toolkit.py
@ -14,7 +14,7 @@ from camel.toolkits.base import BaseToolkit
 from camel.toolkits import FunctionTool, VideoAnalysisToolkit
 from camel.messages import BaseMessage
 from camel.agents import ChatAgent
-from camel.models import ModelFactory
+from camel.models import ModelFactory, BaseModelBackend
 from camel.types import ModelType, ModelPlatformType

 import io
@ -717,8 +717,9 @@ class WebToolkit(BaseToolkit):
                 headless=True,
                 cache_dir: Optional[str] = None,
                 page_script_path: Optional[str] = None,
-                 model: Literal['gpt-4o', 'gpt-4o-mini'] = 'gpt-4o',
-                 history_window: int = 5
+                 history_window: int = 5,
+                 web_agent_model: Optional[BaseModelBackend] = None,
+                 planning_agent_model: Optional[BaseModelBackend] = None,
                 ): 
        
        self.browser = BaseBrowser(
@ -728,10 +729,12 @@ class WebToolkit(BaseToolkit):
            )
        
        self.history_window = history_window
+        self.web_agent_model = web_agent_model
+        self.planning_agent_model = planning_agent_model
        
        self.history = []
        # self.search_toolkit = SearchToolkit()
-        self.web_agent, self.planning_agent = self._initialize_agent(model)
+        self.web_agent, self.planning_agent = self._initialize_agent()
        
    
    def _reset(self):
@ -741,28 +744,24 @@ class WebToolkit(BaseToolkit):
        os.makedirs(self.browser.cache_dir, exist_ok=True)
    
    
-    def _initialize_agent(self, model: Literal['gpt-4o', 'gpt-4o-mini']) -> Tuple[ChatAgent, ChatAgent]:
+    def _initialize_agent(self) -> Tuple[ChatAgent, ChatAgent]:
        r"""Initialize the agent."""
-        if model == 'gpt-4o':
+        if self.web_agent_model is None:
            web_agent_model = ModelFactory.create(
                model_platform=ModelPlatformType.OPENAI,
                model_type=ModelType.GPT_4O,
                model_config_dict={"temperature": 0, "top_p": 1}
            )
-        elif model == 'gpt-4o-mini':
-            web_agent_model = ModelFactory.create(
+        else:
+            web_agent_model = self.web_agent_model
+
+        if self.planning_agent_model is None:
+            planning_model = ModelFactory.create(
                model_platform=ModelPlatformType.OPENAI,
-                model_type=ModelType.GPT_4O_MINI,
-                model_config_dict={"temperature": 0, "top_p": 1}
+                model_type=ModelType.O3_MINI,
            )
        else:
-            raise ValueError("Invalid model type.")
-        
-        planning_model = ModelFactory.create(
-            model_platform=ModelPlatformType.OPENAI,
-            model_type=ModelType.O3_MINI,
-        )
-        
+            planning_model = self.planning_agent_model
        
        system_prompt = """
 You are a helpful web agent that can assist users in browsing the web.
--- a/owl/camel/types/enums.py
+++ b/owl/camel/types/enums.py
@ -149,6 +149,7 @@ class ModelType(UnifiedModelType, Enum):
    QWEN_2_5_32B = "qwen2.5-32b-instruct"
    QWEN_2_5_14B = "qwen2.5-14b-instruct"
    QWEN_QWQ_32B = "qwq-32b-preview"
+    QWEN_OMNI_TURBO = "qwen-omni-turbo"

    # Yi models (01-ai)
    YI_LIGHTNING = "yi-lightning"
@ -404,6 +405,7 @@ class ModelType(UnifiedModelType, Enum):
            ModelType.QWEN_2_5_32B,
            ModelType.QWEN_2_5_14B,
            ModelType.QWEN_QWQ_32B,
+            ModelType.QWEN_OMNI_TURBO,
        }

    @property
@ -502,6 +504,7 @@ class ModelType(UnifiedModelType, Enum):
            ModelType.INTERNLM2_PRO_CHAT,
            ModelType.TOGETHER_MIXTRAL_8_7B,
            ModelType.SGLANG_MISTRAL_7B,
+            ModelType.QWEN_OMNI_TURBO,
        }:
            return 32_768
        elif self in {
--- a/owl/run.py
+++ b/owl/run.py
@ -8,7 +8,7 @@ from dotenv import load_dotenv
 from retry import retry
 from loguru import logger

-from utils import OwlRolePlaying, process_tools, run_society
+from utils import OwlRolePlaying, run_society
 import os


@ -32,30 +32,21 @@ def construct_society(question: str) -> OwlRolePlaying:
        model_type=ModelType.GPT_4O,
        model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), # [Optional] the config for model
    )
- 
-    
-    user_tools = []
-    assistant_tools = [
-        "WebToolkit",
-        'DocumentProcessingToolkit', 
-        'VideoAnalysisToolkit', 
-        'CodeExecutionToolkit', 
-        'ImageAnalysisToolkit', 
-        'AudioAnalysisToolkit', 
-        "SearchToolkit",
-        "ExcelToolkit",
-        ]
+
+    tools_list = [*WebToolkit(web_agent_model=assistant_model, planning_agent_model=assistant_model).get_tools(),
+    *DocumentProcessingToolkit().get_tools(),
+    *VideoAnalysisToolkit().get_tools(), # This requires OpenAI and Qwen Key
+    *CodeExecutionToolkit().get_tools(),
+    *ImageAnalysisToolkit(model=assistant_model).get_tools(),
+    *AudioAnalysisToolkit().get_tools(), # This requires OpenAI Key
+    *SearchToolkit(model=assistant_model).get_tools(),
+    *ExcelToolkit().get_tools()]

    user_role_name = 'user'
-    user_agent_kwargs = {
-        'model': user_model,
-        'tools': process_tools(user_tools),
-    }
+    user_agent_kwargs = dict(model=user_model)
    assistant_role_name = 'assistant'
-    assistant_agent_kwargs = {
-        'model': assistant_model,
-        'tools': process_tools(assistant_tools),
-    }
+    assistant_agent_kwargs = dict(model=assistant_model,
+    tools=tools_list)
    
    task_kwargs = {
        'task_prompt': question,
--- a/owl/run_gaia_roleplaying.py
+++ b/owl/run_gaia_roleplaying.py
@ -2,7 +2,7 @@ from camel.models import ModelFactory
 from camel.toolkits import *
 from camel.types import ModelPlatformType, ModelType
 from camel.configs import ChatGPTConfig
-from utils import GAIABenchmark, process_tools
+from utils import GAIABenchmark

 from dotenv import load_dotenv
 from retry import retry
@ -36,28 +36,20 @@ def main():
        model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), # [Optional] the config for model
    )

-    user_tools = []
-    assistant_tools = [
-        "WebToolkit",
-        'DocumentProcessingToolkit', 
-        'VideoAnalysisToolkit', 
-        'CodeExecutionToolkit', 
-        'ImageAnalysisToolkit', 
-        'AudioAnalysisToolkit', 
-        "SearchToolkit",
-        "ExcelToolkit",
-        ]
+    tools_list = [*WebToolkit(web_agent_model=assistant_model, planning_agent_model=assistant_model).get_tools(),
+    *DocumentProcessingToolkit().get_tools(),
+    *VideoAnalysisToolkit().get_tools(), # This requires OpenAI and Qwen Key
+    *CodeExecutionToolkit().get_tools(),
+    *ImageAnalysisToolkit(model=assistant_model).get_tools(),
+    *AudioAnalysisToolkit().get_tools(), # This requires OpenAI Key
+    *SearchToolkit(model=assistant_model).get_tools(),
+    *ExcelToolkit().get_tools()]

    user_role_name = 'user'
-    user_agent_kwargs = {
-        'model': user_model,
-        'tools': process_tools(user_tools),
-    }
+    user_agent_kwargs = dict(model=user_model)
    assistant_role_name = 'assistant'
-    assistant_agent_kwargs = {
-        'model': assistant_model,
-        'tools': process_tools(assistant_tools),
-    }
+    assistant_agent_kwargs = dict(model=assistant_model,
+    tools=tools_list)

    benchmark = GAIABenchmark(
        data_dir="data/gaia",
--- a/owl/utils/enhanced_role_playing.py
+++ b/owl/utils/enhanced_role_playing.py
@ -47,12 +47,12 @@ class OwlRolePlaying(RolePlaying):
        self.assistant_sys_msg: Optional[BaseMessage]
        self.user_sys_msg: Optional[BaseMessage]

-        self.is_reasoning_task = self._judge_if_reasoning_task(self.task_prompt)
+        # self.is_reasoning_task = self._judge_if_reasoning_task(self.task_prompt)

-        if self.is_reasoning_task:
-            logger.info("The task is judged as a reasoning or coding task. The assistant agent will use the reasoning model O3-MINI.")
-        else:
-            logger.info("The assistant agent will use the default model.")
+        # if self.is_reasoning_task:
+        #     logger.info("The task is judged as a reasoning or coding task. The assistant agent will use the reasoning model O3-MINI.")
+        # else:
+        #     logger.info("The assistant agent will use the default model.")
        
        self._init_agents(
            init_assistant_sys_msg,
@ -60,7 +60,7 @@ class OwlRolePlaying(RolePlaying):
            assistant_agent_kwargs=self.assistant_agent_kwargs,
            user_agent_kwargs=self.user_agent_kwargs,
            output_language=self.output_language,
-            is_reasoning_task=self.is_reasoning_task
+            # is_reasoning_task=self.is_reasoning_task
        )
        
        
@ -97,12 +97,12 @@ class OwlRolePlaying(RolePlaying):
            elif 'model' not in user_agent_kwargs:
                user_agent_kwargs.update(dict(model=self.model))
        
-        # If the task is a reasoning task, the assistant agent should use the reasoning model O3-MINI
-        if is_reasoning_task:
-            assistant_agent_kwargs['model'] = ModelFactory.create(
-                model_platform=ModelPlatformType.OPENAI,
-                model_type=ModelType.O3_MINI,
-            )
+        # # If the task is a reasoning task, the assistant agent should use the reasoning model O3-MINI
+        # if is_reasoning_task:
+        #     assistant_agent_kwargs['model'] = ModelFactory.create(
+        #         model_platform=ModelPlatformType.OPENAI,
+        #         model_type=ModelType.O3_MINI,
+        #     )

        self.assistant_agent = ChatAgent(
            init_assistant_sys_msg,
@ -119,25 +119,25 @@ class OwlRolePlaying(RolePlaying):
        self.user_sys_msg = self.user_agent.system_message
        
    
-    def _judge_if_reasoning_task(self, question: str) -> bool:
-        r"""Judge if the question is a reasoning task."""
+    # def _judge_if_reasoning_task(self, question: str) -> bool:
+    #     r"""Judge if the question is a reasoning task."""
        
-        LLM = OpenAIModel(model_type=ModelType.O3_MINI)
-        prompt = f"""
-        Please judge whether the following question is a reasoning or coding task, which can be solved by reasoning without leveraging external resources, or is suitable for writing code to solve the task.
-        If it is a reasoning or coding task, please return only "yes".
-        If it is not a reasoning or coding task, please return only "no".
-        Note: 
-        - If the question required some world knowledge to answer the question, please carefully judge it, because the model's own knowledge is often unreliable.
-        - If it is suitable for writing codes (e.g. process excel files, write simulation codes, etc.), in most cases, it can be considered as a coding task.
-        Question: <question>{question}</question>
-        """
-        messages = [{"role": "user", "content": prompt}]
-        resp = LLM.run(messages)
-        if 'yes' in resp.choices[0].message.content.lower():
-            return True
-        else:
-            return False
+    #     LLM = OpenAIModel(model_type=ModelType.O3_MINI)
+    #     prompt = f"""
+    #     Please judge whether the following question is a reasoning or coding task, which can be solved by reasoning without leveraging external resources, or is suitable for writing code to solve the task.
+    #     If it is a reasoning or coding task, please return only "yes".
+    #     If it is not a reasoning or coding task, please return only "no".
+    #     Note: 
+    #     - If the question required some world knowledge to answer the question, please carefully judge it, because the model's own knowledge is often unreliable.
+    #     - If it is suitable for writing codes (e.g. process excel files, write simulation codes, etc.), in most cases, it can be considered as a coding task.
+    #     Question: <question>{question}</question>
+    #     """
+    #     messages = [{"role": "user", "content": prompt}]
+    #     resp = LLM.run(messages)
+    #     if 'yes' in resp.choices[0].message.content.lower():
+    #         return True
+    #     else:
+    #         return False
        

    def _construct_gaia_sys_msgs(self):