From 9271be1e48d2bad015c283925e9a88b4eb49d6fd Mon Sep 17 00:00:00 2001
From: Yuhang Zhou <1677382760@qq.com>
Date: Sun, 18 May 2025 23:47:29 +0800
Subject: [PATCH] fix bugs and add requirements

---
 README.md             |   3 +-
 requirements.txt      | 142 ++++++++++++++++++++++++++++++++++++++++++
 run_gaia_workforce.py |   2 +-
 utils/gaia.py         |  11 ++--
 4 files changed, 151 insertions(+), 7 deletions(-)
 create mode 100644 requirements.txt

diff --git a/README.md b/README.md
index f41852a..8a1357d 100644
--- a/README.md
+++ b/README.md
@@ -9,8 +9,7 @@ This repository contains inference part code for the OWL framework (Workforce).
 
 ## Inference
 
-The camel version we use is `0.2.46`. To reproduce Workforce inference performance (69.70% - Claude-3.7 accuracy on GAIA benchmark and 60.61% - GPT-4o
-accuracy on GAIA benchmark), follow the steps below:
+The camel version we use is `0.2.46`. To reproduce Workforce inference performance on the GAIA benchmark (69.70% accuracy with Claude-3.7 at pass@1, and 60.61% accuracy with GPT-4o at pass@3), follow the steps below:
 
 ### Installation and Setup
 
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..838d6a1
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,142 @@
+# Core dependencies
+numpy>=1.26.0
+openai>=1.59.7
+tiktoken>=0.7.0
+colorama>=0.4.6
+jsonschema>=4.0.0
+protobuf>=5.0.0
+docstring-parser>=0.15.0
+pydantic>=1.9.0,<2.10.0
+eval-type-backport==0.2.0
+curl_cffi==0.6.2
+httpx>=0.28.0,<1.0.0
+psutil>=5.9.8
+pillow>=10.1.0,<11.0.0
+retry>=0.9.2
+loguru>=0.7.3
+scenedetect>=0.6.5.2
+openpyxl>=3.1.5
+tabulate>=0.9.0
+xls2xlsx>=0.2.0
+docx2markdown>=0.1.1
+chunkr_ai>=0.0.41
+playwright>=1.50.0
+html2text>=2024.2.26
+
+# Optional dependencies - Model platforms
+litellm>=1.38.1
+mistralai>=1.1.0
+reka-api>=3.0.8
+anthropic>=0.42.0
+cohere>=5.11.0
+fish-audio-sdk>=2024.12.5
+
+# Optional dependencies - Huggingface ecosystem
+transformers>=4.0.0
+diffusers>=0.25.0
+accelerate>=0.26.0
+datasets>=3.0.0
+soundfile>=0.13.0
+sentencepiece>=0.2.0
+opencv-python>=4.0.0
+
+# Optional dependencies - Core RAG components
+sentence-transformers>=3.0.1
+qdrant-client>=1.9.0
+pymilvus>=2.4.0
+rank-bm25>=0.2.2
+
+# Optional dependencies - Storage solutions
+neo4j>=5.18.0
+nebula3-python==3.8.2
+redis>=5.0.6
+azure-storage-blob>=12.21.0
+google-cloud-storage>=2.18.0
+botocore>=1.35.3
+
+# Optional dependencies - Document processing tools
+beautifulsoup4>=4.0.0
+docx2txt>=0.8.0
+PyMuPDF>=1.22.5
+unstructured==0.16.20
+prance>=23.6.21.0
+openapi-spec-validator>=0.7.1
+pandasai>=2.3.0
+
+# Optional dependencies - Media processing tools
+imageio[pyav]>=2.34.2
+pydub>=0.25.1
+yt-dlp>=2024.11.4
+ffmpeg-python>=0.2.0
+
+# Optional dependencies - Web and API tools
+wikipedia>=1.0.0
+linkup-sdk>=0.2.1
+duckduckgo-search>=6.3.5
+newspaper3k>=0.2.8
+wolframalpha>=5.0.0
+pyowm>=3.3.0
+googlemaps>=4.10.0
+requests_oauthlib>=1.3.1
+firecrawl-py>=1.0.0
+apify_client>=1.8.1
+tavily-python>=0.5.0
+dappier>=0.3.3
+sympy>=1.13.3
+
+# Optional dependencies - Communication platform tools
+slack-sdk>=3.27.2
+slack-bolt>=1.20.1
+pygithub>=2.3.0
+pyTelegramBotAPI>=4.18.0
+discord.py>=2.3.2
+notion-client>=2.2.1
+praw>=7.7.1
+
+# Optional dependencies - Data science and analytics tools
+rouge>=1.0.1
+aiosqlite>=0.20.0
+textblob>=0.17.1
+datacommons>=1.4.3
+datacommons_pandas>=0.0.3
+pandas>=1.5.3
+stripe>=11.3.0
+networkx>=3.4.2
+
+# Optional dependencies - Research tools
+scholarly[tor]==1.7.11
+arxiv>=2.1.3
+arxiv2text>=0.1.14
+
+# Optional dependencies - Development tools
+outlines>=0.1.7
+docker>=7.1.0
+jupyter_client>=8.6.2
+ipykernel>=6.0.0
+agentops>=0.3.21
+e2b-code-interpreter>=1.0.3
+tree-sitter-python>=0.23.6
+tree-sitter>=0.23.2
+pyyaml>=6.0.2
+
+# Development and testing tools
+pytest>=7.0.0
+pytest-asyncio>=0.23.0
+mock>=5.0.0
+pytest-cov>=4.0.0
+ruff>=0.7.0
+mypy>=1.5.1
+toml>=0.10.2
+pre-commit>=3.0.0
+gradio>=3.0.0
+
+# Type stubs
+types-Pillow
+types-Pygments
+types-mock
+types-regex
+types-setuptools
+types-tqdm
+types-colorama>=0.0.0
+types-requests>=2.0.0
+types-PyYAML>=6.0.0
\ No newline at end of file
diff --git a/run_gaia_workforce.py b/run_gaia_workforce.py
index bda9f36..fe8d797 100644
--- a/run_gaia_workforce.py
+++ b/run_gaia_workforce.py
@@ -215,7 +215,7 @@ def evaluate_on_gaia():
     MAX_TRIES = 1
     
     SAVE_RESULT_PATH = f"results/workforce/workforce_{LEVEL}_pass{MAX_TRIES}_gpt4o.json"
-    test_idx = [0, 1, 2]
+    test_idx = [1]
 
     if os.path.exists(f"tmp/"):
         shutil.rmtree(f"tmp/")
diff --git a/utils/gaia.py b/utils/gaia.py
index e8b0dea..1b3f837 100644
--- a/utils/gaia.py
+++ b/utils/gaia.py
@@ -137,6 +137,10 @@ class GAIABenchmark(BaseBenchmark):
     
     
     def _save_results_to_file(self, results: List[Dict[str, Any]], file_path: str):
+        # Ensure the parent directory exists before writing. A bare filename has
+        # an empty dirname and os.makedirs("") raises, so fall back to ".".
+        base_dir = os.path.dirname(file_path) or "."
+        os.makedirs(base_dir, exist_ok=True)
         with open(file_path, 'w', encoding='utf-8') as f:
             json.dump(results, f, indent=4, ensure_ascii=False)
         f.close()
@@ -315,7 +319,7 @@ Please output with the final answer according to the requirements without any ot
         return self._generate_summary()
 
     
-    def run_single_agent_with_retry(
+    def run(
         self,
         agent: ChatAgent,
         on: Literal["valid", "test"],
@@ -327,6 +331,7 @@ Please output with the final answer according to the requirements without any ot
         save_result: bool = False,
         
     ) -> Dict[str, Any]:
+        r"""Run the benchmark with a single agent."""
         
         datas = self._load_tasks(on, level, randomize, subset, idx)
         
@@ -424,7 +429,6 @@ Please output with the final answer according to the requirements without any ot
         subset: Optional[int] = None,
         idx: Optional[List[int]] = None,
         save_result: bool = False,
-        filtered_tasks_file_path: Optional[str] = None,
     ) -> Dict[str, Any]:
         r"""Run the benchmark with retry mechanism.
 
@@ -439,11 +443,10 @@ Please output with the final answer according to the requirements without any ot
             subset (Optional[int]): Number of tasks to run. Defaults to None (all tasks).
             idx (Optional[List[int]]): Specific task indices to run. Defaults to None.
             save_result (bool): Whether to save results to file. Defaults to False.
-            filtered_tasks_file_path (Optional[str]): Path to the file containing filtered tasks. Defaults to None.
         Returns:
             Dict[str, Any]: Summary of benchmark results.
         """
-        tasks = self._load_tasks(on, level, randomize, subset, idx, filtered_tasks_file_path)
+        tasks = self._load_tasks(on, level, randomize, subset, idx)
         
         self._results = []