fix bugs and add requirements

This commit is contained in:
Yuhang Zhou
2025-05-18 23:47:29 +08:00
parent eb7f19a983
commit 9271be1e48
4 changed files with 151 additions and 7 deletions

View File

@@ -9,8 +9,7 @@ This repository contains inference part code for the OWL framework (Workforce).
## Inference
The camel version we use is `0.2.46`. To reproduce Workforce inference performance (69.70% - Claude-3.7 accuracy on GAIA benchmark and 60.61% - GPT-4o
accuracy on GAIA benchmark), follow the steps below:
The camel version we use is `0.2.46`. To reproduce the Workforce inference performance on the GAIA benchmark (69.70% accuracy with Claude-3.7, pass@1; 60.61% accuracy with GPT-4o, pass@3), follow the steps below:
### Installation and Setup

142
requirements.txt Normal file
View File

@@ -0,0 +1,142 @@
# Core dependencies
numpy>=1.26.0
openai>=1.59.7
tiktoken>=0.7.0
colorama>=0.4.6
jsonschema>=4.0.0
protobuf>=5.0.0
docstring-parser>=0.15.0
pydantic>=1.9.0,<2.10.0
eval-type-backport==0.2.0
curl_cffi==0.6.2
httpx>=0.28.0,<1.0.0
psutil>=5.9.8
pillow>=10.1.0,<11.0.0
retry>=0.9.2
loguru>=0.7.3
scenedetect>=0.6.5.2
openpyxl>=3.1.5
tabulate>=0.9.0
xls2xlsx>=0.2.0
docx2markdown>=0.1.1
chunkr_ai>=0.0.41
playwright>=1.50.0
html2text>=2024.2.26
# Optional dependencies - Model platforms
litellm>=1.38.1
mistralai>=1.1.0
reka-api>=3.0.8
anthropic>=0.42.0
cohere>=5.11.0
fish-audio-sdk>=2024.12.5
# Optional dependencies - Huggingface ecosystem
transformers>=4.0.0
diffusers>=0.25.0
accelerate>=0.26.0
datasets>=3.0.0
soundfile>=0.13.0
sentencepiece>=0.2.0
opencv-python>=4.0.0
# Optional dependencies - Core RAG components
sentence-transformers>=3.0.1
qdrant-client>=1.9.0
pymilvus>=2.4.0
rank-bm25>=0.2.2
# Optional dependencies - Storage solutions
neo4j>=5.18.0
nebula3-python==3.8.2
redis>=5.0.6
azure-storage-blob>=12.21.0
google-cloud-storage>=2.18.0
botocore>=1.35.3
# Optional dependencies - Document processing tools
beautifulsoup4>=4.0.0
docx2txt>=0.8.0
PyMuPDF>=1.22.5
unstructured==0.16.20
prance>=23.6.21.0
openapi-spec-validator>=0.7.1
pandasai>=2.3.0
# Optional dependencies - Media processing tools
imageio[pyav]>=2.34.2
pydub>=0.25.1
yt-dlp>=2024.11.4
ffmpeg-python>=0.2.0
# Optional dependencies - Web and API tools
wikipedia>=1.0.0
linkup-sdk>=0.2.1
duckduckgo-search>=6.3.5
newspaper3k>=0.2.8
wolframalpha>=5.0.0
pyowm>=3.3.0
googlemaps>=4.10.0
requests_oauthlib>=1.3.1
firecrawl-py>=1.0.0
apify_client>=1.8.1
tavily-python>=0.5.0
dappier>=0.3.3
sympy>=1.13.3
# Optional dependencies - Communication platform tools
slack-sdk>=3.27.2
slack-bolt>=1.20.1
pygithub>=2.3.0
pyTelegramBotAPI>=4.18.0
discord.py>=2.3.2
notion-client>=2.2.1
praw>=7.7.1
# Optional dependencies - Data science and analytics tools
rouge>=1.0.1
aiosqlite>=0.20.0
textblob>=0.17.1
datacommons>=1.4.3
datacommons_pandas>=0.0.3
pandas>=1.5.3
stripe>=11.3.0
networkx>=3.4.2
# Optional dependencies - Research tools
scholarly[tor]==1.7.11
arxiv>=2.1.3
arxiv2text>=0.1.14
# Optional dependencies - Development tools
outlines>=0.1.7
docker>=7.1.0
jupyter_client>=8.6.2
ipykernel>=6.0.0
agentops>=0.3.21
e2b-code-interpreter>=1.0.3
tree-sitter-python>=0.23.6
tree-sitter>=0.23.2
pyyaml>=6.0.2
# Development and testing tools
pytest>=7.0.0
pytest-asyncio>=0.23.0
mock>=5.0.0
pytest-cov>=4.0.0
ruff>=0.7.0
mypy>=1.5.1
toml>=0.10.2
pre-commit>=3.0.0
gradio>=3.0.0
# Type stubs
types-Pillow
types-Pygments
types-mock
types-regex
types-setuptools
types-tqdm
types-colorama>=0.0.0
types-requests>=2.0.0
types-PyYAML>=6.0.0

View File

@@ -215,7 +215,7 @@ def evaluate_on_gaia():
MAX_TRIES = 1
SAVE_RESULT_PATH = f"results/workforce/workforce_{LEVEL}_pass{MAX_TRIES}_gpt4o.json"
test_idx = [0, 1, 2]
test_idx = [1]
if os.path.exists(f"tmp/"):
shutil.rmtree(f"tmp/")

View File

@@ -137,6 +137,10 @@ class GAIABenchmark(BaseBenchmark):
def _save_results_to_file(self, results: List[Dict[str, Any]], file_path: str):
    r"""Write ``results`` to ``file_path`` as indented UTF-8 JSON.

    Any missing parent directories of ``file_path`` are created first. An
    existing file at ``file_path`` is overwritten.

    Args:
        results (List[Dict[str, Any]]): Result records to serialize.
        file_path (str): Destination path of the JSON file.
    """
    base_dir = os.path.dirname(file_path)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create directories when there is one.
    if base_dir:
        os.makedirs(base_dir, exist_ok=True)
    # The context manager closes the file; no explicit close() is needed.
    with open(file_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, indent=4, ensure_ascii=False)
@@ -315,7 +319,7 @@ Please output with the final answer according to the requirements without any ot
return self._generate_summary()
def run_single_agent_with_retry(
def run(
self,
agent: ChatAgent,
on: Literal["valid", "test"],
@@ -327,6 +331,7 @@ Please output with the final answer according to the requirements without any ot
save_result: bool = False,
) -> Dict[str, Any]:
r"""Run the benchmark with a single agent."""
datas = self._load_tasks(on, level, randomize, subset, idx)
@@ -424,7 +429,6 @@ Please output with the final answer according to the requirements without any ot
subset: Optional[int] = None,
idx: Optional[List[int]] = None,
save_result: bool = False,
filtered_tasks_file_path: Optional[str] = None,
) -> Dict[str, Any]:
r"""Run the benchmark with retry mechanism.
@@ -439,11 +443,10 @@ Please output with the final answer according to the requirements without any ot
subset (Optional[int]): Number of tasks to run. Defaults to None (all tasks).
idx (Optional[List[int]]): Specific task indices to run. Defaults to None.
save_result (bool): Whether to save results to file. Defaults to False.
filtered_tasks_file_path (Optional[str]): Path to the file containing filtered tasks. Defaults to None.
Returns:
Dict[str, Any]: Summary of benchmark results.
"""
tasks = self._load_tasks(on, level, randomize, subset, idx, filtered_tasks_file_path)
tasks = self._load_tasks(on, level, randomize, subset, idx)
self._results = []