feat(eval): loc acc evaluation (#8515)

Co-authored-by: Xingyao Wang <xingyao@all-hands.dev>
Co-authored-by: mamoodi <mamoodiha@gmail.com>

parent 050e80cc34 · commit 9388fef0ef
45 evaluation/benchmarks/swe_bench/loc_eval/README.md (Normal file)
@@ -0,0 +1,45 @@
# **Localization Evaluation for SWE-Bench**

This folder implements localization evaluation at both the file and function levels, complementing the assessment of agent inference on [SWE-Bench](https://www.swebench.com/).

## **1. Environment Setup**

- Python env: [Install python environment](../../../README.md#development-environment)
- LLM config: [Configure LLM config](../../../README.md#configure-openhands-and-your-llm)

## **2. Inference & Evaluation**

- Inference and evaluation follow the original `run_infer.sh` and `run_eval.sh` implementations
- Refer to the instructions in [README.md](../README.md) for running inference and evaluation on SWE-Bench
## **3. Localization Evaluation**

- Localization evaluation computes localization accuracy at two levels, and also considers task success as an additional metric for overall evaluation (a toy sketch of the file-level computation follows this list):
  - **File Localization Accuracy:** Accuracy of correctly localizing the target file
  - **Function Localization Accuracy:** Accuracy of correctly localizing the target function
  - **Resolve Rate** (auto-skipped if evaluation outputs are missing): Rate at which tasks are successfully resolved
  - **File Localization Efficiency:** Average number of iterations taken to successfully localize the target file
  - **Function Localization Efficiency:** Average number of iterations taken to successfully localize the target function
  - **Task Success Efficiency:** Average number of iterations taken to resolve the task
  - **Resource Efficiency:** API expenditure of the agent running inference on SWE-Bench instances
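For intuition only, here is a minimal sketch of how file-level localization accuracy could be computed. It is not the logic in `loc_evaluator.py`; the `instances` structure and its `agent_files` field are hypothetical stand-ins for whatever the inference outputs record:

```python
import re

def gold_files(patch: str) -> set[str]:
    """Extract target file paths from a unified diff (e.g., the SWE-Bench gold patch)."""
    return set(re.findall(r"^\+\+\+ b/(\S+)", patch, flags=re.MULTILINE))

def file_loc_accuracy(instances: list[dict]) -> float:
    """Fraction of instances where the agent touched at least one gold file.

    Each instance is assumed (hypothetically) to look like:
      {"gold_patch": "<unified diff>", "agent_files": {"path/a.py", ...}}
    """
    hits = sum(
        1 for inst in instances
        if gold_files(inst["gold_patch"]) & inst["agent_files"]
    )
    return hits / len(instances) if instances else 0.0

# Toy example: the agent finds the right file in one of two instances -> 0.5
demo = [
    {"gold_patch": "--- a/src/core.py\n+++ b/src/core.py\n", "agent_files": {"src/core.py"}},
    {"gold_patch": "--- a/pkg/util.py\n+++ b/pkg/util.py\n", "agent_files": {"README.md"}},
]
print(file_loc_accuracy(demo))  # 0.5
```

Function-level accuracy would follow the same pattern, matching function names extracted from the patch hunks instead of file paths.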
- Run localization evaluation
  - Format:
    ```bash
    ./evaluation/benchmarks/swe_bench/scripts/eval_localization.sh --infer-dir [infer-dir] --split [split] --dataset [dataset] --max-infer-turn [max-infer-turn] --align-with-max [align-with-max]
    ```
  - `infer-dir`: inference directory containing inference outputs
  - `split`: SWE-Bench dataset split to use
  - `dataset`: SWE-Bench dataset name
  - `max-infer-turn`: the maximum number of iterations the agent is allowed when running inference
  - `align-with-max`: whether to align failure indices (e.g., incorrect localization, unresolved tasks) with `max-infer-turn` (see the sketch after the example below)
  - Example:
    ```bash
    ./evaluation/benchmarks/swe_bench/scripts/eval_localization.sh \
      --infer-dir ./evaluation/evaluation_outputs/outputs/princeton-nlp__SWE-bench_Verified-test/CodeActAgent/gpt_4o_100_N \
      --split test \
      --dataset princeton-nlp/SWE-bench_Verified \
      --max-infer-turn 100 \
      --align-with-max true
    ```
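A hedged reading of `--align-with-max`, based on its description above (aligning failure indices with the maximum iteration), is that failed instances count as `max-infer-turn` iterations in the efficiency averages rather than being dropped. The sketch below illustrates that interpretation; it is not code from `loc_evaluator.py`, and the variable names are illustrative:

```python
# Illustrative sketch (hypothetical): how --align-with-max could affect the
# efficiency averages. `iters` holds the iteration at which each instance was
# first localized correctly, or None when localization failed.
def avg_loc_iters(iters, max_infer_turn, align_with_max):
    if align_with_max:
        # Count each failure as if it took the full iteration budget
        vals = [i if i is not None else max_infer_turn for i in iters]
    else:
        # Drop failures from the average entirely
        vals = [i for i in iters if i is not None]
    return sum(vals) / len(vals) if vals else None

iters = [3, 7, None]                       # third instance never found the file
print(avg_loc_iters(iters, 100, False))    # 5.0
print(avg_loc_iters(iters, 100, True))     # 36.66...
```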
- Localization evaluation results will be automatically saved to `[infer-dir]/loc_eval`
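The wrapper script prints `loc_eval/loc_eval_results/loc_acc/overall_eval.json` as its summary when present, so the overall results can also be inspected programmatically. A small sketch (the JSON's internal key names are not assumed here):

```python
import json
from pathlib import Path

# Path layout taken from eval_localization.sh; adjust infer_dir to your own run
infer_dir = Path("./evaluation/evaluation_outputs/outputs/"
                 "princeton-nlp__SWE-bench_Verified-test/CodeActAgent/gpt_4o_100_N")
summary_path = infer_dir / "loc_eval" / "loc_eval_results" / "loc_acc" / "overall_eval.json"

if summary_path.exists():
    # Pretty-print whatever metrics the evaluator wrote, without assuming key names
    print(json.dumps(json.loads(summary_path.read_text()), indent=2))
else:
    print(f"No summary found at {summary_path}")
```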
1006 evaluation/benchmarks/swe_bench/loc_eval/loc_evaluator.py (Normal file)
File diff suppressed because it is too large
1110 evaluation/benchmarks/swe_bench/loc_eval/loc_utils.py (Normal file)
File diff suppressed because it is too large
227 evaluation/benchmarks/swe_bench/scripts/eval_localization.sh (Executable file)
@@ -0,0 +1,227 @@
#!/usr/bin/env bash
set -eo pipefail
source "evaluation/utils/version_control.sh"

# Function to display usage information
usage() {
  echo "Usage: $0 [OPTIONS]"
  echo "Options:"
  echo "  --infer-dir DIR        Directory containing model inference outputs"
  echo "  --split SPLIT          SWE-Bench dataset split selection"
  echo "  --dataset DATASET      Dataset name"
  echo "  --max-infer-turn NUM   Max number of turns for coding agent"
  echo "  --align-with-max BOOL  Align failed instance indices with max iteration (true/false)"
  echo "  -h, --help             Display this help message"
  echo ""
  echo "Example:"
  echo "  $0 --infer-dir ./inference_outputs --split test --align-with-max false"
}

# Check if no arguments were provided
if [ $# -eq 0 ]; then
  usage
  exit 1
fi

# Parse command line arguments
while [[ $# -gt 0 ]]; do
  case $1 in
    --infer-dir)
      INFER_DIR="$2"
      shift 2
      ;;
    --split)
      SPLIT="$2"
      shift 2
      ;;
    --dataset)
      DATASET="$2"
      shift 2
      ;;
    --max-infer-turn)
      MAX_TURN="$2"
      shift 2
      ;;
    --align-with-max)
      ALIGN_WITH_MAX="$2"
      shift 2
      ;;
    -h|--help)
      usage
      exit 0
      ;;
    *)
      echo "Unknown option: $1"
      usage
      exit 1
      ;;
  esac
done
# Check for required arguments (only INFER_DIR is required)
if [ -z "$INFER_DIR" ]; then
  echo "Error: Missing required arguments (--infer-dir is required)"
  usage
  exit 1
fi

# Set defaults for optional arguments if not provided
if [ -z "$SPLIT" ]; then
  SPLIT="test"
  echo "Split not specified, using default: $SPLIT"
fi

if [ -z "$DATASET" ]; then
  DATASET="princeton-nlp/SWE-bench_Verified"
  echo "Dataset not specified, using default: $DATASET"
fi

if [ -z "$MAX_TURN" ]; then
  MAX_TURN=20
  echo "Max inference turn not specified, using default: $MAX_TURN"
fi

if [ -z "$ALIGN_WITH_MAX" ]; then
  ALIGN_WITH_MAX="true"
  echo "Align with max not specified, using default: $ALIGN_WITH_MAX"
fi
# Color codes for output
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Functions to print colored output
print_status() {
  echo -e "${GREEN}[INFO]${NC} $1"
}

print_error() {
  echo -e "${RED}[ERROR]${NC} $1"
}

print_warning() {
  echo -e "${YELLOW}[WARNING]${NC} $1"
}

print_header() {
  echo -e "${BLUE}[TASK]${NC} $1"
}

# Validate align-with-max value
if [ "$ALIGN_WITH_MAX" != "true" ] && [ "$ALIGN_WITH_MAX" != "false" ]; then
  print_error "Invalid value for --align-with-max: $ALIGN_WITH_MAX. Must be 'true' or 'false'"
  exit 1
fi
# Check if Python is available
print_header "Checking Python installation..."
if ! command -v python3 &> /dev/null; then
  if ! command -v python &> /dev/null; then
    print_error "Python is not installed or not in PATH"
    exit 1
  else
    PYTHON_CMD="python"
    print_status "Using python command"
  fi
else
  PYTHON_CMD="python3"
  print_status "Using python3 command"
fi
# Check if the Python evaluator script exists (path is relative to the repository root)
SCRIPT_NAME="./evaluation/benchmarks/swe_bench/loc_eval/loc_evaluator.py"
if [ ! -f "$SCRIPT_NAME" ]; then
  print_error "Python script '$SCRIPT_NAME' not found"
  print_warning "Make sure you run this script from the repository root"
  exit 1
fi
# Check if required directories exist
print_header "Validating directories..."
if [ ! -d "$INFER_DIR" ]; then
  print_error "Inference directory not found: $INFER_DIR"
  exit 1
fi

# Evaluation outputs (may be absent; the resolve-rate metric is skipped in that case)
EVAL_DIR="$INFER_DIR/eval_outputs"
# Display configuration
print_header "Starting Localization Evaluation with the following configuration:"
echo "  Inference Directory: $INFER_DIR"
if [ -d "$EVAL_DIR" ]; then
  echo "  Evaluation Directory: $EVAL_DIR"
else
  echo "  Evaluation Directory: None (evaluation outputs do not exist)"
fi
echo "  Output Directory: $INFER_DIR/loc_eval"
echo "  Split: $SPLIT"
echo "  Dataset: $DATASET"
echo "  Max Turns: $MAX_TURN"
echo "  Align with Max: $ALIGN_WITH_MAX"
echo "  Python Command: $PYTHON_CMD"
echo ""
# Check Python dependencies (optional check)
print_header "Checking Python dependencies..."
$PYTHON_CMD -c "
import sys

# json, os, argparse, and collections are standard-library modules and should
# always import; pandas is the only third-party requirement checked here.
required_modules = ['pandas', 'json', 'os', 'argparse', 'collections']
missing_modules = []

for module in required_modules:
    try:
        __import__(module)
    except ImportError:
        missing_modules.append(module)

if missing_modules:
    print(f'Missing required modules: {missing_modules}')
    sys.exit(1)
else:
    print('All basic dependencies are available')
" || {
  print_error "Some Python dependencies are missing"
  print_warning "Please install required packages: pip install pandas"
  exit 1
}
# Create log directory if it doesn't exist
mkdir -p "$INFER_DIR/loc_eval"

# Set up logging
LOG_FILE="$INFER_DIR/loc_eval/loc_evaluation_$(date +%Y%m%d_%H%M%S).log"
print_status "Logging output to: $LOG_FILE"

# Build the command as a single string (executed with eval below so the
# embedded quotes around each argument are honored)
CMD_ARGS="\"$SCRIPT_NAME\" \
  --infer-dir \"$INFER_DIR\" \
  --split \"$SPLIT\" \
  --dataset \"$DATASET\" \
  --max-infer-turn \"$MAX_TURN\" \
  --align-with-max \"$ALIGN_WITH_MAX\""

# Run the Python script, mirroring output to both the console and the log file
print_header "Running localization evaluation..."
eval "$PYTHON_CMD $CMD_ARGS" 2>&1 | tee "$LOG_FILE"
# Check if the script ran successfully; PIPESTATUS[0] holds the Python exit
# code, since $? after the pipeline would report tee's status instead
if [ ${PIPESTATUS[0]} -eq 0 ]; then
  print_status "Localization evaluation completed successfully!"
  print_status "Results saved to: $INFER_DIR/loc_eval"
  print_status "Log file: $LOG_FILE"

  # Display summary if results exist
  if [ -f "$INFER_DIR/loc_eval/loc_eval_results/loc_acc/overall_eval.json" ]; then
    print_header "Evaluation Summary:"
    cat "$INFER_DIR/loc_eval/loc_eval_results/loc_acc/overall_eval.json"
    echo
  fi
else
  print_error "Localization evaluation failed!"
  print_warning "Check the log file for details: $LOG_FILE"
  exit 1
fi