feat: Support Tau-Bench and BFCL evaluation benchmarks (#11953)

Co-authored-by: openhands <openhands@all-hands.dev>
2026-03-22 05:37:20 +08:00 · 2025-12-31 06:12:50 +03:00
parent 82e0aa7924
commit 4c0f0a1e9b
6 changed files with 469 additions and 2 deletions
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -192,6 +192,9 @@ datasets = "*"
 joblib = "*"
 swebench = { git = "https://github.com/ryanhoangt/SWE-bench.git", rev = "fix-modal-patch-eval" }
 multi-swe-bench = "0.1.2"
+pandas = "*"
+# tau-bench = { git = "https://github.com/sierra-research/tau-bench.git" }
+# bfcl-eval = "*"  # TODO: verify the exact package name and source before enabling this dependency

 [tool.poetry.group.testgeneval.dependencies]
 fuzzywuzzy = "^0.18.0"