From 9271be1e48d2bad015c283925e9a88b4eb49d6fd Mon Sep 17 00:00:00 2001 From: Yuhang Zhou <1677382760@qq.com> Date: Sun, 18 May 2025 23:47:29 +0800 Subject: [PATCH] fix bugs and add requirements --- README.md | 3 +- requirements.txt | 142 ++++++++++++++++++++++++++++++++++++++++++ run_gaia_workforce.py | 2 +- utils/gaia.py | 11 ++-- 4 files changed, 151 insertions(+), 7 deletions(-) create mode 100644 requirements.txt diff --git a/README.md b/README.md index f41852a..8a1357d 100644 --- a/README.md +++ b/README.md @@ -9,8 +9,7 @@ This repository contains inference part code for the OWL framework (Workforce). ## Inference -The camel version we use is `0.2.46`. To reproduce Workforce inference performance (69.70% - Claude-3.7 accuracy on GAIA benchmark and 60.61% - GPT-4o -accuracy on GAIA benchmark), follow the steps below: +The camel version we use is `0.2.46`. To reproduce Workforce inference performance on GAIA benchmark (69.70% - Claude-3.7 accuracy on GAIA benchmark, pass@1, and 60.61% - GPT-4o accuracy on GAIA benchmark, pass@3), follow the steps below: ### Installation and Setup diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..838d6a1 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,142 @@ +# Core dependencies +numpy>=1.26.0 +openai>=1.59.7 +tiktoken>=0.7.0 +colorama>=0.4.6 +jsonschema>=4.0.0 +protobuf>=5.0.0 +docstring-parser>=0.15.0 +pydantic>=1.9.0,<2.10.0 +eval-type-backport==0.2.0 +curl_cffi==0.6.2 +httpx>=0.28.0,<1.0.0 +psutil>=5.9.8 +pillow>=10.1.0,<11.0.0 +retry>=0.9.2 +loguru>=0.7.3 +scenedetect>=0.6.5.2 +openpyxl>=3.1.5 +tabulate>=0.9.0 +xls2xlsx>=0.2.0 +docx2markdown>=0.1.1 +chunkr_ai>=0.0.41 +playwright>=1.50.0 +html2text>=2024.2.26 + +# Optional dependencies - Model platforms +litellm>=1.38.1 +mistralai>=1.1.0 +reka-api>=3.0.8 +anthropic>=0.42.0 +cohere>=5.11.0 +fish-audio-sdk>=2024.12.5 + +# Optional dependencies - Huggingface ecosystem +transformers>=4.0.0 +diffusers>=0.25.0 +accelerate>=0.26.0 +datasets>=3.0.0 +soundfile>=0.13.0 +sentencepiece>=0.2.0 +opencv-python>=4.0.0 + +# Optional dependencies - Core RAG components +sentence-transformers>=3.0.1 +qdrant-client>=1.9.0 +pymilvus>=2.4.0 +rank-bm25>=0.2.2 + +# Optional dependencies - Storage solutions +neo4j>=5.18.0 +nebula3-python==3.8.2 +redis>=5.0.6 +azure-storage-blob>=12.21.0 +google-cloud-storage>=2.18.0 +botocore>=1.35.3 + +# Optional dependencies - Document processing tools +beautifulsoup4>=4.0.0 +docx2txt>=0.8.0 +PyMuPDF>=1.22.5 +unstructured==0.16.20 +prance>=23.6.21.0 +openapi-spec-validator>=0.7.1 +pandasai>=2.3.0 + +# Optional dependencies - Media processing tools +imageio[pyav]>=2.34.2 +pydub>=0.25.1 +yt-dlp>=2024.11.4 +ffmpeg-python>=0.2.0 + +# Optional dependencies - Web and API tools +wikipedia>=1.0.0 +linkup-sdk>=0.2.1 +duckduckgo-search>=6.3.5 +newspaper3k>=0.2.8 +wolframalpha>=5.0.0 +pyowm>=3.3.0 +googlemaps>=4.10.0 +requests_oauthlib>=1.3.1 +firecrawl-py>=1.0.0 +apify_client>=1.8.1 +tavily-python>=0.5.0 +dappier>=0.3.3 +sympy>=1.13.3 + +# Optional dependencies - Communication platform tools +slack-sdk>=3.27.2 +slack-bolt>=1.20.1 +pygithub>=2.3.0 +pyTelegramBotAPI>=4.18.0 +discord.py>=2.3.2 +notion-client>=2.2.1 +praw>=7.7.1 + +# Optional dependencies - Data science and analytics tools +rouge>=1.0.1 +aiosqlite>=0.20.0 +textblob>=0.17.1 +datacommons>=1.4.3 +datacommons_pandas>=0.0.3 +pandas>=1.5.3 +stripe>=11.3.0 +networkx>=3.4.2 + +# Optional dependencies - Research tools +scholarly[tor]==1.7.11 +arxiv>=2.1.3 +arxiv2text>=0.1.14 + +# Optional dependencies - Development tools +outlines>=0.1.7 +docker>=7.1.0 +jupyter_client>=8.6.2 +ipykernel>=6.0.0 +agentops>=0.3.21 +e2b-code-interpreter>=1.0.3 +tree-sitter-python>=0.23.6 +tree-sitter>=0.23.2 +pyyaml>=6.0.2 + +# Development and testing tools +pytest>=7.0.0 +pytest-asyncio>=0.23.0 +mock>=5.0.0 +pytest-cov>=4.0.0 +ruff>=0.7.0 +mypy>=1.5.1 +toml>=0.10.2 +pre-commit>=3.0.0 +gradio>=3.0.0 + +# Type stubs +types-Pillow +types-Pygments +types-mock +types-regex +types-setuptools +types-tqdm +types-colorama>=0.0.0 +types-requests>=2.0.0 +types-PyYAML>=6.0.0 \ No newline at end of file diff --git a/run_gaia_workforce.py b/run_gaia_workforce.py index bda9f36..fe8d797 100644 --- a/run_gaia_workforce.py +++ b/run_gaia_workforce.py @@ -215,7 +215,7 @@ def evaluate_on_gaia(): MAX_TRIES = 1 SAVE_RESULT_PATH = f"results/workforce/workforce_{LEVEL}_pass{MAX_TRIES}_gpt4o.json" - test_idx = [0, 1, 2] + test_idx = [1] if os.path.exists(f"tmp/"): shutil.rmtree(f"tmp/") diff --git a/utils/gaia.py b/utils/gaia.py index e8b0dea..1b3f837 100644 --- a/utils/gaia.py +++ b/utils/gaia.py @@ -137,6 +137,10 @@ class GAIABenchmark(BaseBenchmark): def _save_results_to_file(self, results: List[Dict[str, Any]], file_path: str): + # get base dir of file_path + base_dir = os.path.dirname(file_path) + os.makedirs(base_dir, exist_ok=True) + with open(file_path, 'w', encoding='utf-8') as f: json.dump(results, f, indent=4, ensure_ascii=False) f.close() @@ -315,7 +319,7 @@ Please output with the final answer according to the requirements without any ot return self._generate_summary() - def run_single_agent_with_retry( + def run( self, agent: ChatAgent, on: Literal["valid", "test"], @@ -327,6 +331,7 @@ Please output with the final answer according to the requirements without any ot save_result: bool = False, ) -> Dict[str, Any]: + r"""Run the benchmark with a single agent.""" datas = self._load_tasks(on, level, randomize, subset, idx) @@ -424,7 +429,6 @@ Please output with the final answer according to the requirements without any ot subset: Optional[int] = None, idx: Optional[List[int]] = None, save_result: bool = False, - filtered_tasks_file_path: Optional[str] = None, ) -> Dict[str, Any]: r"""Run the benchmark with retry mechanism. @@ -439,11 +443,10 @@ Please output with the final answer according to the requirements without any ot subset (Optional[int]): Number of tasks to run. Defaults to None (all tasks). idx (Optional[List[int]]): Specific task indices to run. Defaults to None. save_result (bool): Whether to save results to file. Defaults to False. - filtered_tasks_file_path (Optional[str]): Path to the file containing filtered tasks. Defaults to None. Returns: Dict[str, Any]: Summary of benchmark results. """ - tasks = self._load_tasks(on, level, randomize, subset, idx, filtered_tasks_file_path) + tasks = self._load_tasks(on, level, randomize, subset, idx) self._results = []