diff --git a/.gitignore b/.gitignore index 1496b4d..2a69342 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,7 @@ owl/data owl/tmp owl/.env owl/utils/__pycache__/ +owl/results # Logs *.log diff --git a/README.md b/README.md index 5be8102..7545015 100644 --- a/README.md +++ b/README.md @@ -193,6 +193,10 @@ python owl/run_deepseek.py # Run with other OpenAI-compatible models python owl/run_openai_compatiable_model.py + +# Run with VLLM backends (using Qwen2.5-VL-7B-Instruct w/ 4 GPUs as an example) +bash owl/scripts/serve.sh # run this in another terminal or screen session +python owl/run_vllm.py ``` For a simpler version that only requires an LLM API key, you can try our minimal example: @@ -239,7 +243,7 @@ We provided a script to reproduce the results on GAIA. You can check the `run_gaia_roleplaying.py` file and run the following command: ```bash -python run_gaia_roleplaying.py +python owl/run_gaia_roleplaying.py ``` # ⏱️ Future Plans diff --git a/owl/run_gaia_roleplaying_vllm.py b/owl/run_gaia_roleplaying_vllm.py new file mode 100644 index 0000000..fee2f21 --- /dev/null +++ b/owl/run_gaia_roleplaying_vllm.py @@ -0,0 +1,134 @@ +""" +This script demonstrates how to run the OWL system with an open-source model +on VLLM as the user agent. + +Pre-requisites: bash scripts/serve.sh (4 GPUs for qwen2.5-vl-7b-instruct). 
+""" + +from dotenv import load_dotenv +load_dotenv() + +import os +from loguru import logger + +from camel.models import ModelFactory +from camel.toolkits import ( + AudioAnalysisToolkit, + CodeExecutionToolkit, + DocumentProcessingToolkit, + ExcelToolkit, + ImageAnalysisToolkit, + SearchToolkit, + VideoAnalysisToolkit, + WebToolkit, +) +from camel.types import ModelPlatformType, ModelType +from camel.configs import ChatGPTConfig + +from utils import GAIABenchmark + + +# Configuration +LEVEL = 1 +SAVE_RESULT = True +test_idx = [0] +VLLM_MODEL_TYPE = "Qwen/Qwen2.5-VL-7B-Instruct" # set the VLLM model type +PORT = 8964 # set the port for the VLLM model +SAVE_TO = "results/result_vllm.json" + + +def main(): + """Main function to run the GAIA benchmark.""" + # Create cache directory + cache_dir = "tmp/" + os.makedirs(cache_dir, exist_ok=True) + + # Create models for different components + models = { + "user": ModelFactory.create( + model_platform=ModelPlatformType.VLLM, + model_type=VLLM_MODEL_TYPE, + model_config_dict={"temperature": 0., "top_p": 1.}, + url=f"http://localhost:{PORT}/v1", + ), + "assistant": ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + model_type=ModelType.GPT_4O, + model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), + ), + "web": ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + model_type=ModelType.GPT_4O, + model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), + ), + "planning": ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + model_type=ModelType.GPT_4O, + model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), + ), + "video": ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + model_type=ModelType.GPT_4O, + model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), + ), + "image": ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + model_type=ModelType.GPT_4O, + model_config_dict=ChatGPTConfig(temperature=0, 
top_p=1).as_dict(), + ), + "search": ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + model_type=ModelType.GPT_4O, + model_config_dict=ChatGPTConfig(temperature=0, top_p=1).as_dict(), + ), + } + + # Configure toolkits + tools = [ + *WebToolkit( + headless=True, # Set to True for headless mode (e.g., on remote servers) + web_agent_model=models["web"], + planning_agent_model=models["planning"], + ).get_tools(), + *DocumentProcessingToolkit().get_tools(), + *VideoAnalysisToolkit(model=models["video"]).get_tools(), # This requires OpenAI Key + *AudioAnalysisToolkit().get_tools(), # This requires OpenAI Key + *CodeExecutionToolkit(sandbox="subprocess", verbose=True).get_tools(), + *ImageAnalysisToolkit(model=models["image"]).get_tools(), + *SearchToolkit(model=models["search"]).get_tools(), + *ExcelToolkit().get_tools(), + ] + + # Configure agent roles and parameters + user_agent_kwargs = {"model": models["user"]} + assistant_agent_kwargs = {"model": models["assistant"], "tools": tools} + + # Initialize benchmark + benchmark = GAIABenchmark( + data_dir="data/gaia", + save_to=SAVE_TO, + ) + + # Print benchmark information + print(f"Number of validation examples: {len(benchmark.valid)}") + print(f"Number of test examples: {len(benchmark.test)}") + + # Run benchmark + result = benchmark.run( + on="valid", + level=LEVEL, + idx=test_idx, + save_result=SAVE_RESULT, + user_role_name="user", + user_agent_kwargs=user_agent_kwargs, + assistant_role_name="assistant", + assistant_agent_kwargs=assistant_agent_kwargs, + ) + + # Output results + logger.success(f"Correct: {result['correct']}, Total: {result['total']}") + logger.success(f"Accuracy: {result['accuracy']}") + + +if __name__ == "__main__": + main() diff --git a/owl/run_vllm.py b/owl/run_vllm.py new file mode 100644 index 0000000..c430ed7 --- /dev/null +++ b/owl/run_vllm.py @@ -0,0 +1,136 @@ +""" +This script demonstrates how to run the OWL system with an open-source model +on VLLM as the user agent. 
+ +Pre-requisites: bash scripts/serve.sh (4 GPUs for qwen2.5-vl-7b-instruct). +""" + +from dotenv import load_dotenv +load_dotenv() + +from camel.models import ModelFactory +from camel.toolkits import ( + AudioAnalysisToolkit, + CodeExecutionToolkit, + DocumentProcessingToolkit, + ExcelToolkit, + ImageAnalysisToolkit, + SearchToolkit, + VideoAnalysisToolkit, + WebToolkit, +) +from camel.types import ModelPlatformType, ModelType + +from utils import OwlRolePlaying, run_society + +VLLM_MODEL_TYPE = "Qwen/Qwen2.5-VL-7B-Instruct" # set the VLLM model type +PORT = 8964 # set the port for the VLLM model + + +def construct_society(question: str) -> OwlRolePlaying: + r"""Construct a society of agents based on the given question. + + Args: + question (str): The task or question to be addressed by the society. + + Returns: + OwlRolePlaying: A configured society of agents ready to address the question. + """ + + # Create models for different components + models = { + "user": ModelFactory.create( + model_platform=ModelPlatformType.VLLM, + model_type=VLLM_MODEL_TYPE, + model_config_dict={"temperature": 0.}, + url=f"http://localhost:{PORT}/v1", + ), + "assistant": ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + model_type=ModelType.GPT_4O, + model_config_dict={"temperature": 0}, + ), + "web": ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + model_type=ModelType.GPT_4O, + model_config_dict={"temperature": 0}, + ), + "planning": ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + model_type=ModelType.GPT_4O, + model_config_dict={"temperature": 0}, + ), + "video": ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + model_type=ModelType.GPT_4O, + model_config_dict={"temperature": 0}, + ), + "image": ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + model_type=ModelType.GPT_4O, + model_config_dict={"temperature": 0}, + ), + "search": ModelFactory.create( + model_platform=ModelPlatformType.OPENAI, + 
model_type=ModelType.GPT_4O, + model_config_dict={"temperature": 0}, + ), + } + + # Configure toolkits + tools = [ + *WebToolkit( + headless=True, # Set to True for headless mode (e.g., on remote servers) + web_agent_model=models["web"], + planning_agent_model=models["planning"], + ).get_tools(), + *DocumentProcessingToolkit().get_tools(), + *VideoAnalysisToolkit(model=models["video"]).get_tools(), # This requires OpenAI Key + *AudioAnalysisToolkit().get_tools(), # This requires OpenAI Key + *CodeExecutionToolkit(sandbox="subprocess", verbose=True).get_tools(), + *ImageAnalysisToolkit(model=models["image"]).get_tools(), + *SearchToolkit(model=models["search"]).get_tools(), + *ExcelToolkit().get_tools(), + ] + + # Configure agent roles and parameters + user_agent_kwargs = {"model": models["user"]} + assistant_agent_kwargs = {"model": models["assistant"], "tools": tools} + + # Configure task parameters + task_kwargs = { + "task_prompt": question, + "with_task_specify": False, + } + + # Create and return the society + society = OwlRolePlaying( + **task_kwargs, + user_role_name="user", + user_agent_kwargs=user_agent_kwargs, + assistant_role_name="assistant", + assistant_agent_kwargs=assistant_agent_kwargs, + ) + + return society + + +def main(): + r"""Main function to run the OWL system with an example question.""" + # Example research question + question = ( + "What was the volume in m^3 of the fish bag that was calculated in " + "the University of Leicester paper `Can Hiccup Supply Enough Fish " + "to Maintain a Dragon's Diet?`" + ) + + # Construct and run the society + society = construct_society(question) + answer, chat_history, token_count = run_society(society) + + # Output the result + print(f"Answer: {answer}") + + +if __name__ == "__main__": + main() diff --git a/owl/scripts/serve.sh b/owl/scripts/serve.sh new file mode 100644 index 0000000..3a0fe1a --- /dev/null +++ b/owl/scripts/serve.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +# tested vllm version: 0.7.3 + +export 
VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 + +model=${1:-Qwen/Qwen2.5-VL-7B-Instruct} +max_model_len=${2:-32768} +gpu_memory_utilization=${3:-0.9} +devices=${4:-"0,1,2,3"} +tp_size=$(awk -F',' '{print NF}' <<< "$devices") +port=${5:-8964} + +CUDA_VISIBLE_DEVICES=${devices} python -m vllm.entrypoints.openai.api_server \ + --model ${model} \ + --tensor_parallel_size ${tp_size} \ + --gpu_memory_utilization ${gpu_memory_utilization} \ + --port ${port} \ + --max_model_len ${max_model_len} \ diff --git a/owl/utils/gaia.py b/owl/utils/gaia.py index eae0e10..dacac40 100644 --- a/owl/utils/gaia.py +++ b/owl/utils/gaia.py @@ -252,6 +252,7 @@ class GAIABenchmark(BaseBenchmark): if save_result: + os.makedirs(os.path.dirname(self.save_to), exist_ok=True) with open(self.save_to, 'w') as f: json.dump(self._results, f, indent=4, ensure_ascii=False) f.close() @@ -370,6 +371,10 @@ class GAIABenchmark(BaseBenchmark): def normalize_number_str(self, number_str: str) -> float: + if number_str is None: + logger.error("Received None as number string, returning infinity.") + return float("inf") + for char in ["$", "%", ","]: number_str = number_str.replace(char, "") try: