添加通义千问(Qwen)模型集成支持

This commit is contained in:
YansongW
2025-03-07 16:47:39 +08:00
parent ae8c1c5742
commit dc7b9f15e2
12 changed files with 277 additions and 115 deletions

BIN
assets/community.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 513 KiB

View File

@@ -26,6 +26,12 @@ from camel.toolkits.function_tool import FunctionTool
# logger = logging.getLogger(__name__)
from loguru import logger
from camel.models import ModelFactory
from camel.configs import QwenConfig
from camel.types import ModelPlatformType, ModelType
from camel.agents import ChatAgent
from camel.messages import BaseMessage
class AudioAnalysisToolkit(BaseToolkit):
r"""A class representing a toolkit for audio operations.
@@ -38,7 +44,20 @@ class AudioAnalysisToolkit(BaseToolkit):
if cache_dir:
self.cache_dir = cache_dir
self.client = openai.OpenAI()
# 创建通义千问Omni模型
self.audio_model = ModelFactory.create(
model_platform=ModelPlatformType.QWEN,
model_type=ModelType.QWEN_OMNI_TURBO,
model_config_dict=QwenConfig(
temperature=0.3,
top_p=0.9,
stream=False # 设置为False以避免设置stream_options
).as_dict(),
)
self.audio_agent = ChatAgent(
model=self.audio_model,
output_language="English"
)
self.reasoning = reasoning
@@ -64,81 +83,81 @@ class AudioAnalysisToolkit(BaseToolkit):
encoded_string = None
if is_url:
res = requests.get(audio_path)
res.raise_for_status()
audio_data = res.content
encoded_string = base64.b64encode(audio_data).decode('utf-8')
# 使用URL直接传递给模型
audio_url = audio_path
else:
# 如果是本地文件则需要进行base64编码
with open(audio_path, "rb") as audio_file:
audio_data = audio_file.read()
audio_file.close()
encoded_string = base64.b64encode(audio_data).decode('utf-8')
# 在实际场景中我们需要将此base64字符串上传到服务器或CDN获取URL
# 这里我们假设已经上传并获得了URL
audio_url = f"data:audio/mp3;base64,{encoded_string}"
file_suffix = os.path.splitext(audio_path)[1]
file_format = file_suffix[1:]
if self.reasoning:
text_prompt = f"Transcribe all the content in the speech into text."
transcription = self.client.audio.transcriptions.create(
model="whisper-1",
file=open(audio_path, "rb")
# 使用通义千问的多模态能力
logger.info("Using reasoning mode with Qwen-Omni model for audio analysis")
msg = BaseMessage.make_user_message(
role_name="User",
content=f"请分析这段音频并回答以下问题:{question}"
)
transcript = transcription.text
reasoning_prompt = f"""
<speech_transcription_result>{transcript}</speech_transcription_result>
Please answer the following question based on the speech transcription result above:
<question>{question}</question>
"""
reasoning_completion = self.client.chat.completions.create(
# model="gpt-4o-audio-preview",
model = "o3-mini",
messages=[
{
"role": "user",
"content": reasoning_prompt,
}]
)
reasoning_result = reasoning_completion.choices[0].message.content
return str(reasoning_result)
else:
text_prompt = f"""Answer the following question based on the given \
audio information:\n\n{question}"""
completion = self.client.chat.completions.create(
# model="gpt-4o-audio-preview",
model = "gpt-4o-mini-audio-preview",
messages=[
{
"role": "system",
"content": "You are a helpful assistant specializing in \
audio analysis.",
},
{ # type: ignore[list-item, misc]
"role": "user",
"content": [
{"type": "text", "text": text_prompt},
{
"type": "input_audio",
"input_audio": {
"data": encoded_string,
"format": file_format,
},
# 通过OpenAI兼容接口实现
from camel.messages import OpenAIMessage
openai_messages = [
{
"role": "user",
"content": [
{
"type": "input_audio",
"input_audio": {
"data": audio_url, # 使用URL或base64
"format": file_format,
},
],
},
],
) # type: ignore[misc]
response: str = str(completion.choices[0].message.content)
logger.debug(f"Response: {response}")
return str(response)
},
{"type": "text", "text": f"请分析这段音频并回答以下问题:{question}"},
],
},
]
# 直接使用OpenAI兼容的客户端
import os
from openai import OpenAI
client = OpenAI(
api_key=os.getenv("QWEN_API_KEY"),
base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
)
completion = client.chat.completions.create(
model="qwen-omni-turbo",
messages=openai_messages,
modalities=["text"],
stream=True,
)
# 处理流式响应
answer_parts = []
for chunk in completion:
if chunk.choices and chunk.choices[0].delta.content:
answer_parts.append(chunk.choices[0].delta.content)
return "".join(answer_parts)
else:
# 非reasoning模式使用简单的步骤
# 假设不需要复杂的处理逻辑
msg = BaseMessage.make_user_message(
role_name="User",
content=f"请分析这段音频并回答问题:{question}"
)
response = self.audio_agent.step(msg)
return response.msgs[0].content
def get_tools(self) -> List[FunctionTool]:
r"""Returns a list of FunctionTool objects representing the functions

View File

@@ -19,7 +19,7 @@ from typing import List, Literal, Tuple
from urllib.parse import urlparse
from camel.agents import ChatAgent
from camel.configs import ChatGPTConfig
from camel.configs import ChatGPTConfig, QwenConfig
from camel.toolkits.base import BaseToolkit
from camel.toolkits import FunctionTool, CodeExecutionToolkit
from camel.types import ModelType, ModelPlatformType
@@ -35,14 +35,32 @@ class ImageAnalysisToolkit(BaseToolkit):
This class provides methods for understanding images, such as identifying
objects, text in images.
"""
def __init__(self, model: Literal['gpt-4o', 'gpt-4o-mini'] = 'gpt-4o'):
def __init__(self, model: Literal['gpt-4o', 'gpt-4o-mini', 'qwen-vl-max', 'qwen-vl-plus', 'qwen-omni-turbo'] = 'gpt-4o'):
# 设置默认值
self.model_platform = ModelPlatformType.OPENAI
self.model_type = ModelType.GPT_4O
# 根据传入的模型名称设置对应的平台和类型
if model == 'gpt-4o':
self.model_platform = ModelPlatformType.OPENAI
self.model_type = ModelType.GPT_4O
elif model == 'gpt-4o-mini':
self.model_platform = ModelPlatformType.OPENAI
self.model_type = ModelType.GPT_4O_MINI
elif model == 'qwen-vl-max':
self.model_platform = ModelPlatformType.QWEN
self.model_type = ModelType.QWEN_VL_MAX
elif model == 'qwen-vl-plus':
self.model_platform = ModelPlatformType.QWEN
self.model_type = ModelType.QWEN_VL_PLUS
elif model == 'qwen-omni-turbo':
self.model_platform = ModelPlatformType.QWEN
self.model_type = ModelType.QWEN_OMNI_TURBO
else:
raise ValueError(f"Invalid model type: {model}")
# 记录当前使用的模型
self.current_model = model
def _construct_image_url(self, image_path: str) -> str:
parsed_url = urlparse(image_path)
@@ -175,15 +193,40 @@ class ImageAnalysisToolkit(BaseToolkit):
# f"data:image/jpeg;base64,{self._encode_image(image_path)}"
# )
model = ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=self.model_type,
)
code_model = ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=ModelType.O3_MINI,
)
# 根据初始化时设置的模型平台和类型创建相应的模型
if self.model_platform == ModelPlatformType.OPENAI:
model = ModelFactory.create(
model_platform=self.model_platform,
model_type=self.model_type,
model_config_dict={"temperature": 0, "top_p": 1}
)
code_model = ModelFactory.create(
model_platform=self.model_platform,
model_type=ModelType.O3_MINI,
)
elif self.model_platform == ModelPlatformType.QWEN:
# 创建配置如果是Omni模型必须设置stream为True
config = {"temperature": 0.3, "top_p": 0.9}
# 如果是Omni模型添加必要的参数
if self.model_type == ModelType.QWEN_OMNI_TURBO:
config["stream"] = True
config["modalities"] = ["text"]
model = ModelFactory.create(
model_platform=self.model_platform,
model_type=self.model_type,
model_config_dict=QwenConfig(**config).as_dict(),
)
code_model = ModelFactory.create(
model_platform=self.model_platform,
model_type=ModelType.QWEN_TURBO,
model_config_dict=QwenConfig(temperature=0.3, top_p=0.9).as_dict(),
)
else:
raise ValueError(f"Unsupported model platform: {self.model_platform}")
code_execution_toolkit = CodeExecutionToolkit(require_confirm=False, sandbox="subprocess", verbose=True)

View File

@@ -699,9 +699,9 @@ class SearchToolkit(BaseToolkit):
"""
model = ModelFactory.create(
model_type=ModelType.GPT_4O_MINI,
model_platform=ModelPlatformType.OPENAI,
model_config_dict={"temperature": 0, "top_p": 1}
model_type=ModelType.QWEN_TURBO,
model_platform=ModelPlatformType.QWEN,
model_config_dict={"temperature": 0.3, "top_p": 0.9}
)
search_agent = ChatAgent(

View File

@@ -125,10 +125,15 @@ class VideoAnalysisToolkit(BaseToolkit):
logger.info(f"Video will be downloaded to {self._download_directory}")
# 为Qwen-Omni模型添加必要的参数
config = {"temperature": 0.2}
if ModelType.QWEN_OMNI_TURBO == "qwen-omni-turbo":
config["stream"] = False
self.vl_model = ModelFactory.create(
model_platform=ModelPlatformType.QWEN,
model_type=ModelType.QWEN_VL_MAX,
model_config_dict=QwenConfig(temperature=0.2).as_dict(),
model_type=ModelType.QWEN_OMNI_TURBO,
model_config_dict=QwenConfig(**config).as_dict(),
)
self.vl_agent = ChatAgent(
@@ -246,6 +251,12 @@ class VideoAnalysisToolkit(BaseToolkit):
print(prompt)
# 特殊处理检查是否使用的是通义千问Omni模型
if self.vl_model.model_type == ModelType.QWEN_OMNI_TURBO:
logger.info("Using Qwen-Omni-Turbo model for video analysis")
# 这里可能需要特殊处理取决于通义千问Omni的API实现
# 但是我们仍然可以使用现有的架构因为图像处理是在BaseMessage的to_openai_user_message方法中完成的
msg = BaseMessage.make_user_message(
role_name="User",
content=prompt,

View File

@@ -717,7 +717,7 @@ class WebToolkit(BaseToolkit):
headless=True,
cache_dir: Optional[str] = None,
page_script_path: Optional[str] = None,
model: Literal['gpt-4o', 'gpt-4o-mini'] = 'gpt-4o',
model: Literal['gpt-4o', 'gpt-4o-mini', 'qwen-plus', 'qwen-turbo'] = 'qwen-plus',
history_window: int = 5
):
@@ -741,26 +741,27 @@ class WebToolkit(BaseToolkit):
os.makedirs(self.browser.cache_dir, exist_ok=True)
def _initialize_agent(self, model: Literal['gpt-4o', 'gpt-4o-mini']) -> Tuple[ChatAgent, ChatAgent]:
def _initialize_agent(self, model: Literal['gpt-4o', 'gpt-4o-mini', 'qwen-plus', 'qwen-turbo'] = 'qwen-plus') -> Tuple[ChatAgent, ChatAgent]:
r"""Initialize the agent."""
if model == 'gpt-4o':
if model == 'gpt-4o' or model == 'qwen-plus':
web_agent_model = ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=ModelType.GPT_4O,
model_config_dict={"temperature": 0, "top_p": 1}
model_platform=ModelPlatformType.QWEN,
model_type=ModelType.QWEN_PLUS,
model_config_dict={"temperature": 0.3, "top_p": 0.9}
)
elif model == 'gpt-4o-mini':
elif model == 'gpt-4o-mini' or model == 'qwen-turbo':
web_agent_model = ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=ModelType.GPT_4O_MINI,
model_config_dict={"temperature": 0, "top_p": 1}
model_platform=ModelPlatformType.QWEN,
model_type=ModelType.QWEN_TURBO,
model_config_dict={"temperature": 0.3, "top_p": 0.9}
)
else:
raise ValueError("Invalid model type.")
planning_model = ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=ModelType.O3_MINI,
model_platform=ModelPlatformType.QWEN,
model_type=ModelType.QWEN_TURBO,
model_config_dict={"temperature": 0.3, "top_p": 0.9}
)

View File

@@ -144,6 +144,7 @@ class ModelType(UnifiedModelType, Enum):
QWEN_MATH_PLUS = "qwen-math-plus"
QWEN_MATH_TURBO = "qwen-math-turbo"
QWEN_CODER_TURBO = "qwen-coder-turbo"
QWEN_OMNI_TURBO = "qwen-omni-turbo"
QWEN_2_5_CODER_32B = "qwen2.5-coder-32b-instruct"
QWEN_2_5_72B = "qwen2.5-72b-instruct"
QWEN_2_5_32B = "qwen2.5-32b-instruct"
@@ -399,6 +400,7 @@ class ModelType(UnifiedModelType, Enum):
ModelType.QWEN_MATH_PLUS,
ModelType.QWEN_MATH_TURBO,
ModelType.QWEN_CODER_TURBO,
ModelType.QWEN_OMNI_TURBO,
ModelType.QWEN_2_5_CODER_32B,
ModelType.QWEN_2_5_72B,
ModelType.QWEN_2_5_32B,
@@ -553,6 +555,7 @@ class ModelType(UnifiedModelType, Enum):
ModelType.QWEN_PLUS,
ModelType.QWEN_TURBO,
ModelType.QWEN_CODER_TURBO,
ModelType.QWEN_OMNI_TURBO,
ModelType.TOGETHER_LLAMA_3_1_8B,
ModelType.TOGETHER_LLAMA_3_1_70B,
ModelType.TOGETHER_LLAMA_3_1_405B,

View File

@@ -1,7 +1,7 @@
from camel.models import ModelFactory
from camel.toolkits import *
from camel.types import ModelPlatformType, ModelType
from camel.configs import ChatGPTConfig
from camel.configs import QwenConfig
from typing import List, Dict
from dotenv import load_dotenv
@@ -22,15 +22,15 @@ def construct_society(question: str) -> OwlRolePlaying:
assistant_role_name = "assistant"
user_model = ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=ModelType.GPT_4O,
model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), # [Optional] the config for model
model_platform=ModelPlatformType.QWEN,
model_type=ModelType.QWEN_PLUS,
model_config_dict=QwenConfig(temperature=0.3, top_p=0.9).as_dict(),
)
assistant_model = ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=ModelType.GPT_4O,
model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), # [Optional] the config for model
model_platform=ModelPlatformType.QWEN,
model_type=ModelType.QWEN_PLUS,
model_config_dict=QwenConfig(temperature=0.3, top_p=0.9).as_dict(),
)
@@ -74,7 +74,7 @@ def construct_society(question: str) -> OwlRolePlaying:
# Example case
question = "What was the volume in m^3 of the fish bag that was calculated in the University of Leicester paper `Can Hiccup Supply Enough Fish to Maintain a Dragons Diet?` "
question = "我需要创建一个AI日程管理助手的微信小程序请你作为产品经理规划工作流程和分工制定相关的开发计划和内容。然后你作为UI设计师设计小程序的UI界面。最后你作为开发工程师编写代码实现小程序的功能。"
society = construct_society(question)
answer, chat_history, token_count = run_society(society)

View File

@@ -1,7 +1,7 @@
from camel.models import ModelFactory
from camel.toolkits import *
from camel.types import ModelPlatformType, ModelType
from camel.configs import ChatGPTConfig
from camel.configs import QwenConfig
from utils import GAIABenchmark, process_tools
from dotenv import load_dotenv
@@ -25,15 +25,15 @@ def main():
os.makedirs(cache_dir, exist_ok=True)
user_model = ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=ModelType.GPT_4O,
model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), # [Optional] the config for model
model_platform=ModelPlatformType.QWEN,
model_type=ModelType.QWEN_PLUS,
model_config_dict=QwenConfig(temperature=0.3, top_p=0.9).as_dict(),
)
assistant_model = ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=ModelType.GPT_4O,
model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), # [Optional] the config for model
model_platform=ModelPlatformType.QWEN,
model_type=ModelType.QWEN_PLUS,
model_config_dict=QwenConfig(temperature=0.3, top_p=0.9).as_dict(),
)
user_tools = []

View File

@@ -3,6 +3,7 @@ sys.path.append("../")
import json
import re
import os
from typing import Dict, Optional, List
from loguru import logger
@@ -50,11 +51,13 @@ def process_tools(tools: List[str] | str) -> List[FunctionTool]:
if tool_name == "CodeExecutionToolkit":
tool_list.extend(toolkit_class(sandbox="subprocess", verbose=True).get_tools())
elif tool_name == 'ImageAnalysisToolkit':
tool_list.extend(toolkit_class(model="gpt-4o").get_tools())
tool_list.extend(toolkit_class(model="qwen-omni-turbo").get_tools())
elif tool_name == 'AudioAnalysisToolkit':
tool_list.extend(toolkit_class(reasoning=True).get_tools())
# 创建一个空的缓存目录(如果不存在)
os.makedirs("tmp", exist_ok=True)
tool_list.extend(toolkit_class(cache_dir="tmp", reasoning=True).get_tools())
elif tool_name == "WebToolkit":
tool_list.extend(toolkit_class(headless=True).get_tools())
tool_list.extend(toolkit_class(headless=True, model="qwen-plus").get_tools())
else:
tool_list.extend(toolkit_class().get_tools())

View File

@@ -12,7 +12,7 @@ from camel.agents import ChatAgent
from camel.responses import ChatAgentResponse
from camel.messages.base import BaseMessage
from camel.societies import RolePlaying
from camel.models import OpenAIModel, ModelFactory
from camel.models import ModelFactory
from camel.types import ModelType, ModelPlatformType
@@ -100,8 +100,8 @@ class OwlRolePlaying(RolePlaying):
# If the task is a reasoning task, the assistant agent should use the reasoning model O3-MINI
if is_reasoning_task:
assistant_agent_kwargs['model'] = ModelFactory.create(
model_platform=ModelPlatformType.OPENAI,
model_type=ModelType.O3_MINI,
model_platform=ModelPlatformType.QWEN,
model_type=ModelType.QWEN_PLUS,
)
self.assistant_agent = ChatAgent(
@@ -122,7 +122,10 @@ class OwlRolePlaying(RolePlaying):
def _judge_if_reasoning_task(self, question: str) -> bool:
r"""Judge if the question is a reasoning task."""
LLM = OpenAIModel(model_type=ModelType.O3_MINI)
LLM = ModelFactory.create(
model_platform=ModelPlatformType.QWEN,
model_type=ModelType.QWEN_PLUS,
)
prompt = f"""
Please judge whether the following question is a reasoning or coding task, which can be solved by reasoning without leveraging external resources, or is suitable for writing code to solve the task.
If it is a reasoning or coding task, please return only "yes".
@@ -154,7 +157,7 @@ Please note that the task may be very complicated. Do not attempt to solve the t
Here are some tips that will help you to give more valuable instructions about our task to me:
<tips>
- I have various tools to use, such as search toolkit, web browser simulation toolkit, document relevant toolkit, code execution toolkit, etc. Thus, You must think how human will solve the task step-by-step, and give me instructions just like that. For example, one may first use google search to get some initial information and the target url, then retrieve the content of the url, or do some web browser interaction to find the answer.
- Although the task is complex, the answer does exist. If you cant find the answer using the current scheme, try to re-plan and use other ways to find the answer, e.g. using other tools or methods that can achieve similar results.
- Although the task is complex, the answer does exist. If you can't find the answer using the current scheme, try to re-plan and use other ways to find the answer, e.g. using other tools or methods that can achieve similar results.
- Always remind me to verify my final answer about the overall task. This work can be done by using multiple tools(e.g., screenshots, webpage analysis, etc.), or something else.
- If I have written code, please remind me to run the code and get the result.
- Search results typically do not provide precise answers. It is not likely to find the answer directly using search toolkit only, the search query should be concise and focuses on finding sources rather than direct answers, as it always need to use other tools to further process the url, e.g. interact with the webpage, extract webpage content, etc.

79
qwen_integration_pr.md Normal file
View File

@@ -0,0 +1,79 @@
# 通义千问(Qwen)模型集成PR文档
## 功能概述
本PR为OWL项目添加了对阿里云通义千问(Qwen)模型的全面支持,让OWL能够利用Qwen系列模型的强大能力,特别是其多模态功能。
## 主要改进
1. **模型支持**
- 添加对通义千问(Qwen)文本模型的支持:`qwen-turbo`、`qwen-plus`、`qwen-max`
- 添加对通义千问多模态模型的支持:`qwen-omni-turbo`,支持图像、音频和视频输入
2. **工具集成**
- 优化`AudioAnalysisToolkit`,使其能够使用通义千问的多模态能力处理音频
- 优化`VideoAnalysisToolkit`,支持使用通义千问模型进行视频内容分析
- 修复了工具包中与模态处理相关的问题
3. **配置与环境**
- 添加通义千问所需的环境变量配置
- 设置默认模型配置选项,便于用户快速切换
4. **文档与示例**
- 提供完整的通义千问API调用示例文档
- 说明OpenAI兼容方式和DashScope方式两种调用方法
- 包含流式输出、多模态输入等高级用例
## 技术细节
### 修复的问题
- 修复了`ModelPlatformType`和`ModelType`的导入路径问题
- 修复了`QwenConfig`类的导入路径和使用问题
- 解决了`modalities`参数传递问题,确保与通义千问API兼容
- 解决了由于`stream_options`设置导致的验证错误
- 在`token_limit`方法中添加了对`QWEN_OMNI_TURBO`的支持
### 改进的组件
- `camel/toolkits/audio_analysis_toolkit.py`: 支持通义千问模型处理音频
- `camel/toolkits/video_analysis_toolkit.py`: 支持通义千问模型处理视频
- `camel/types/enums.py`: 添加通义千问多模态模型的token限制
- `owl/.env`: 新增通义千问API相关环境变量配置
### 环境变量配置
```
# 通义千问API (https://help.aliyun.com/zh/model-studio/developer-reference/get-api-key)
QWEN_API_KEY=""
DASHSCOPE_API_KEY="" # OpenAI兼容方式使用同一个密钥
# 默认模型设置
DEFAULT_MODEL_PLATFORM_TYPE="tongyi-qianwen"
DEFAULT_MODEL_TYPE="qwen-turbo"
```
## 使用说明
通过设置环境变量可以轻松切换到通义千问模型:
1. 在`.env`文件中设置`QWEN_API_KEY`和`DASHSCOPE_API_KEY`
2. 将`DEFAULT_MODEL_PLATFORM_TYPE`设置为`"tongyi-qianwen"`
3. 将`DEFAULT_MODEL_TYPE`设置为所需的通义千问模型,如`"qwen-turbo"`
多模态功能使用示例:
```python
# 使用通义千问Omni模型分析音频
audio_tool = AudioAnalysisToolkit()
result = audio_tool.ask_question_about_audio("path/to/audio.mp3", "这段音频说了什么?")
```
## 测试与验证
- 验证了通义千问API的连接和基本功能
- 测试了音频和视频分析工具包的正常工作
- 验证了模型的流式输出功能
- 测试了OpenAI兼容方式调用的稳定性
## 后续工作
- 进一步优化多模态模型的参数配置
- 扩展对更多通义千问模型的支持
- 添加更多使用通义千问的上层应用示例