feat(evaluation): Add NoCode-bench evaluation script (#10229)

This commit is contained in:
Zhonghao Jiang 2025-08-17 00:41:22 +08:00 committed by GitHub
parent 19105a2a13
commit 7229a16b45
15 changed files with 2199 additions and 0 deletions

View File

@ -0,0 +1,45 @@
# Evaluate OpenHands on NoCode-bench
## LLM Setup
Please follow the setup instructions [here](../../README.md#setup).
## Docker image download
Evaluating OpenHands on NoCode-bench requires instance-level Docker images.
Please follow the NoCode-bench image setup instructions to build or download all instance-level Docker images [here](https://github.com/NoCode-bench/NoCode-bench).
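For reference, the inference script added in this change (`run_infer_nc.py`) looks up each instance image under the tag `ncbench_<instance_id>:latest` (lowercased), so locally available images must follow that naming. A minimal sketch of the naming rule (the instance id below is a made-up example):
```python
def expected_image_name(instance_id: str) -> str:
    # Mirrors get_instance_docker_image() in run_infer_nc.py: the per-instance
    # image must already exist locally under this tag.
    return f'ncbench_{instance_id}:latest'.lower()


# e.g. expected_image_name('astropy__astropy-12907')  # hypothetical id
# -> 'ncbench_astropy__astropy-12907:latest'
```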
## Generate patch
Please follow the instructions [here](../swe_bench/README.md#running-locally-with-docker).
For example:
```bash
bash ./evaluation/benchmarks/nocode_bench/scripts/run_infer_nc.sh llm.claude HEAD CodeActAgent 114 100 10 NoCode-bench/NoCode-bench_Verified test
```
The positional arguments are, in order: LLM config, commit hash, agent, eval limit, max iterations, number of workers, dataset, and split (see `scripts/run_infer_nc.sh`). The results will be generated in `evaluation/evaluation_outputs/outputs/XXX/CodeActAgent/YYY/output.jsonl`.
## Running evaluation
First, install [NoCode-bench](https://github.com/NoCode-bench/NoCode-bench).
Second, convert `output.jsonl` into a predictions file (e.g., `all_preds.jsonl`, used below) with the [conversion script](scripts/eval/convert.py).
```bash
python evaluation/benchmarks/nocode_bench/scripts/eval/convert.py --output_jsonl <path_to_output.jsonl> > all_preds.jsonl
```
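For reference, each line of the converted predictions file is a JSON record with the fields produced by the conversion script in this change; the concrete values below are placeholders:
```python
import json

# Shape of one prediction record emitted per line by convert.py
# (all values here are placeholders).
pred = {
    'instance_id': 'example__repo-1',
    'model_name_or_path': 'anthropic/claude-sonnet',  # taken from the run's llm_config
    'model_patch': 'diff --git a/foo.py b/foo.py\n...',
}
print(json.dumps(pred))
```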
Finally, evaluate with NoCode-bench.
```bash
export PYTHONPATH=$PYTHONPATH:$(pwd)
# --predictions_path: path to your predictions
# --log_dir: path to your log directory
# --bench_tasks: dataset name
# --max_workers: number of workers
# --output_file: path to your output file
# --image_level: cache image level
# --timeout: timeout in seconds
# --proxy: proxy, if needed
python ./evaluation/eval.py \
    --predictions_path ./all_preds.jsonl \
    --log_dir ./evaluation/logs \
    --bench_tasks NoCode-bench/NoCode-bench_Verified \
    --max_workers 110 \
    --output_file eval_result.txt \
    --image_level repo \
    --timeout 600 \
    --proxy None
```

View File

@ -0,0 +1,52 @@
"""
Utilities for handling binary files and patch generation in SWE-bench evaluation.
"""
def remove_binary_diffs(patch_text):
"""
Remove binary file diffs from a git patch.
Args:
patch_text (str): The git patch text
Returns:
str: The cleaned patch text with binary diffs removed
"""
lines = patch_text.splitlines()
cleaned_lines = []
block = []
is_binary_block = False
for line in lines:
if line.startswith('diff --git '):
if block and not is_binary_block:
cleaned_lines.extend(block)
block = [line]
is_binary_block = False
elif 'Binary files' in line:
is_binary_block = True
block.append(line)
else:
block.append(line)
if block and not is_binary_block:
cleaned_lines.extend(block)
return '\n'.join(cleaned_lines)
def remove_binary_files_from_git():
"""
Generate a bash command to remove binary files from git staging.
Returns:
str: A bash command that removes binary files from git staging
"""
return """
for file in $(git status --porcelain | grep -E "^(M| M|\\?\\?|A| A)" | cut -c4-); do
if [ -f "$file" ] && (file "$file" | grep -q "executable" || git check-attr binary "$file" | grep -q "binary: set"); then
git rm -f "$file" 2>/dev/null || rm -f "$file"
echo "Removed: $file"
fi
done
""".strip()

View File

@ -0,0 +1,545 @@
DOCPATH_PATTERNS = [
r'docs/',
r'^CHANGES\.rst$',
r'doc/',
r'ChangeLog',
r'^changelog/',
r'^CHANGES$',
]
MATPLOTLIB_CONFIG = {
k: {
'python': '3.11',
'conda_env': 'matplotlib_35',
'install': 'python -m pip install -e .',
'test_cmd': 'pytest -rA --color=no',
}
for k in ['3.5', '3.6', '3.7', '3.8', '3.9']
}
MATPLOTLIB_CONFIG.update(
{
k: {
'python': '3.8',
'conda_env': 'matplotlib_31',
'install': 'python -m pip install -e .',
'test_cmd': 'pytest -rA --color=no',
}
for k in ['3.1', '3.2', '3.3', '3.4']
}
)
MATPLOTLIB_CONFIG.update(
{
k: {
'python': '3.5',
'install': 'python setup.py build; python setup.py install',
'conda_env': 'matplotlib_11',
'nonroot': True,
'test_cmd': 'pytest -rA --color=no',
}
for k in ['2.0', '2.1', '2.2', '1.0', '1.1', '1.2', '1.3', '1.4', '1.5']
}
)
for k in ['3.8', '3.9']:
MATPLOTLIB_CONFIG[k]['install'] = (
'python -m pip install --no-build-isolation -e ".[dev]"'
)
SYMPY_CONFIG = {}
SYMPY_CONFIG.update(
{
'1.0': {
'conda_env': 'sympy_10',
'install': 'pip install -e .',
'test_cmd': 'bin/test -C -v',
# testfile -k testname
}
}
)
REQUESTS_CONFIG = {}
REQUESTS_CONFIG.update(
{
k: {
'conda_env': 'requests_227',
'install': 'pip install -r requirements-dev.txt',
'test_cmd': 'pytest -rA',
}
for k in ['2.27']
}
)
REQUESTS_CONFIG.update(
{
k: {
'conda_env': 'requests_226',
'install': 'pip install -e .',
'test_cmd': 'pytest -rA',
}
for k in ['2.26']
}
)
PYTEST_CONFIG = {}
PYTEST_CONFIG.update(
{
k: {
'conda_env': 'pytest_33',
'install': 'pip install -e .',
'test_cmd': 'pytest -v --color=no',
}
for k in ['4.4', '4.1', '3.7', '3.4', '3.3']
}
)
PYLINT_CONFIG = {}
PYLINT_CONFIG.update(
{
k: {
'conda_env': 'pylint_210',
'install': 'pip install -r requirements_test.txt',
'test_cmd': 'pytest -rA --color=no',
}
for k in [
'2.10',
'2.11',
'2.13',
'2.14',
'2.15',
'2.16',
'2.17',
'3.0',
'3.1',
'3.2',
'3.3',
]
}
)
PYLINT_CONFIG.update(
{
k: {
'conda_env': 'pylint_210',
'pre_install': [
r"sed -i 's/setuptools==[0-9.]\+/setuptools==58.0.0/' requirements_test_min.txt"
],
'install': 'pip install -r requirements_test.txt',
'test_cmd': 'pytest -rA --color=no',
}
for k in ['3.0', '3.1', '3.2', '3.3']
}
)
ASTROPY_CONFIG = {}
ASTROPY_CONFIG.update(
{
k: {
'conda_env': 'astropy_11',
'install': 'python -m pip install -e .[test] --verbose',
'test_cmd': 'pytest --color=no -rA',
}
for k in ['1.1', '1.2', '1.3', '2.0']
}
)
ASTROPY_CONFIG.update(
{
k: {
'conda_env': 'astropy_30',
'pre_install': """echo '[pytest]
filterwarnings =
ignore::DeprecationWarning' > pytest.ini""",
'install': 'python -m pip install -e .[test] --verbose',
'test_cmd': 'pytest --color=no -rA',
}
for k in ['3.0', '3.1', '3.2']
}
)
ASTROPY_CONFIG.update(
{
k: {
'conda_env': 'astropy_40',
'pre_install': [
r"""sed -i 's/requires = \["setuptools",/requires = \["setuptools==68.0.0",/' pyproject.toml"""
],
'install': 'python -m pip install -e .[test] --verbose',
'test_cmd': 'pytest --color=no -rA',
}
for k in ['4.0']
}
)
ASTROPY_CONFIG.update(
{
k: {
'conda_env': 'astropy_41',
'pre_install': [
r"""sed -i 's/requires = \["setuptools",/requires = \["setuptools==68.0.0",/' pyproject.toml""",
"""sed -i 's/^qt_no_exception_capture = 1$/; qt_no_exception_capture = 1/' setup.cfg""",
r"""sed -i '/setuptools==68.0.0",/a \ "markupsafe==2.0.1",' pyproject.tomlsed -i '/setuptools==68.0.0",/a \ "markupsafe==2.0.1",' pyproject.toml""",
],
'install': 'python -m pip install -e .[test] --verbose',
'test_cmd': 'pytest --color=no -rA',
}
for k in ['4.1']
}
)
ASTROPY_CONFIG.update(
{
k: {
'conda_env': 'astropy_42',
'pre_install': [
r"""sed -i 's/requires = \["setuptools",/requires = \["setuptools==68.0.0",/' pyproject.toml""",
r"""sed -i '/setuptools==68.0.0",/a \ "markupsafe==2.0.1",' pyproject.tomlsed -i '/setuptools==68.0.0",/a \ "markupsafe==2.0.1",' pyproject.toml""",
],
'install': 'python -m pip install -e .[test] --verbose',
'test_cmd': 'pytest --color=no -rA',
}
for k in ['4.2', '4.3', '5.0', '5.1']
}
)
ASTROPY_CONFIG.update(
{
k: {
'conda_env': 'astropy_52',
'pre_install': [
r"""sed -i 's/requires = \["setuptools",/requires = \["setuptools==68.0.0",/' pyproject.toml"""
],
'install': 'python -m pip install -e .[test] --verbose',
'test_cmd': 'pytest --color=no -rA',
}
for k in ['5.2', '5.3', '6.0', '6.1', '7.0']
}
)
DJANGO_CONFIG = {}
DJANGO_CONFIG.update(
{
k: {
'install': 'pip install -e .',
'conda_env': 'django_22',
'test_cmd': 'python tests/runtests.py --verbosity 2',
}
for k in ['1.9', '2.2']
}
)
DJANGO_CONFIG.update(
{
'3.2': {
'install': 'pip install -e .',
'conda_env': 'django_32',
'test_cmd': 'python tests/runtests.py --verbosity 2',
},
'4.2': {
'install': 'pip install -e .',
'conda_env': 'django_42',
'test_cmd': 'python tests/runtests.py --verbosity 2',
},
'5.1': {
'install': 'pip install -e .',
'conda_env': 'django_51',
'test_cmd': 'python tests/runtests.py --verbosity 2',
},
}
)
SPHINX_CONFIG = {}
SPHINX_CONFIG.update(
    {  # 1.x versions have issues; this entry is effectively unused
k: {
'conda_env': 'sphinx_20',
'install': 'python -m pip install -e .[test]',
'pre_install': ["sed -i 's/pytest/pytest -rA/' tox.ini"],
'test_cmd': 'tox --current-env -epy37 -v --',
}
for k in ['1.3', '1.4', '1.5', '1.6', '1.7', '1.8']
}
)
SPHINX_CONFIG.update(
{
k: {
'conda_env': 'sphinx_20',
'install': 'python -m pip install -e .[test]',
'pre_install': [
"sed -i 's/pytest/pytest -rA/' tox.ini",
"sed -i 's/Jinja2>=2.3/Jinja2<3.0/' setup.py",
],
'test_cmd': 'tox --current-env -epy37 -v --',
}
for k in ['2.0', '2.1', '2.2', '2.3', '2.4']
}
)
SPHINX_CONFIG.update(
{
k: {
'conda_env': 'sphinx_30',
'install': 'python -m pip install -e .[test]',
'pre_install': [
"sed -i 's/pytest/pytest -rA/' tox.ini",
"sed -i 's/Jinja2>=2.3/Jinja2<3.0/' setup.py",
"sed -i 's/sphinxcontrib-applehelp/sphinxcontrib-applehelp<=1.0.7/' setup.py",
"sed -i 's/sphinxcontrib-devhelp/sphinxcontrib-devhelp<=1.0.5/' setup.py",
"sed -i 's/sphinxcontrib-qthelp/sphinxcontrib-qthelp<=1.0.6/' setup.py",
"sed -i 's/alabaster>=0.7,<0.8/alabaster>=0.7,<0.7.12/' setup.py",
"sed -i \"s/'packaging',/'packaging', 'markupsafe<=2.0.1',/\" setup.py",
"sed -i 's/sphinxcontrib-htmlhelp/sphinxcontrib-htmlhelp<=2.0.4/' setup.py",
"sed -i 's/sphinxcontrib-serializinghtml/sphinxcontrib-serializinghtml<=1.1.9/' setup.py",
],
'test_cmd': 'tox --current-env -epy37 -v --',
}
for k in ['3.0', '3.1', '3.2', '3.3', '3.4', '3.5', '4.0']
}
)
SPHINX_CONFIG.update(
{
k: {
'conda_env': 'sphinx_30',
'install': 'python -m pip install -e .[test]',
'pre_install': [
"sed -i 's/pytest/pytest -rA/' tox.ini",
"sed -i 's/Jinja2>=2.3/Jinja2<3.0/' setup.py",
"sed -i 's/sphinxcontrib-applehelp/sphinxcontrib-applehelp<=1.0.7/' setup.py",
"sed -i 's/sphinxcontrib-devhelp/sphinxcontrib-devhelp<=1.0.5/' setup.py",
"sed -i 's/sphinxcontrib-qthelp/sphinxcontrib-qthelp<=1.0.6/' setup.py",
"sed -i 's/alabaster>=0.7,<0.8/alabaster>=0.7,<0.7.12/' setup.py",
"sed -i \"s/'packaging',/'packaging', 'markupsafe<=2.0.1',/\" setup.py",
(
"grep -q 'sphinxcontrib-htmlhelp>=2.0.0' setup.py && "
"sed -i 's/sphinxcontrib-htmlhelp>=2.0.0/sphinxcontrib-htmlhelp>=2.0.0,<=2.0.4/' setup.py || "
"sed -i 's/sphinxcontrib-htmlhelp/sphinxcontrib-htmlhelp<=2.0.4/' setup.py"
),
(
"grep -q 'sphinxcontrib-serializinghtml>=1.1.5' setup.py && "
"sed -i 's/sphinxcontrib-serializinghtml>=1.1.5/sphinxcontrib-serializinghtml>=1.1.5,<=1.1.9/' setup.py || "
"sed -i 's/sphinxcontrib-serializinghtml/sphinxcontrib-serializinghtml<=1.1.9/' setup.py"
),
],
'test_cmd': 'tox --current-env -epy37 -v --',
}
for k in ['4.1']
}
)
SPHINX_CONFIG.update(
{
k: {
'conda_env': 'sphinx_30',
'install': 'python -m pip install -e .[test]',
'pre_install': [
"sed -i 's/pytest/pytest -rA/' tox.ini",
"sed -i 's/Jinja2>=2.3/Jinja2<3.0/' setup.py",
"sed -i 's/sphinxcontrib-applehelp/sphinxcontrib-applehelp<=1.0.7/' setup.py",
"sed -i 's/sphinxcontrib-devhelp/sphinxcontrib-devhelp<=1.0.5/' setup.py",
"sed -i 's/sphinxcontrib-qthelp/sphinxcontrib-qthelp<=1.0.6/' setup.py",
"sed -i 's/alabaster>=0.7,<0.8/alabaster>=0.7,<0.7.12/' setup.py",
"sed -i \"s/'packaging',/'packaging', 'markupsafe<=2.0.1',/\" setup.py",
"sed -i 's/sphinxcontrib-htmlhelp>=2.0.0/sphinxcontrib-htmlhelp>=2.0.0,<=2.0.4/' setup.py",
"sed -i 's/sphinxcontrib-serializinghtml>=1.1.5/sphinxcontrib-serializinghtml>=1.1.5,<=1.1.9/' setup.py",
],
'test_cmd': 'tox --current-env -epy37 -v --',
}
for k in ['4.2', '4.3', '4.4']
}
)
SPHINX_CONFIG.update(
{
k: {
'conda_env': 'sphinx_30',
'install': 'python -m pip install -e .[test]',
'pre_install': [
"sed -i 's/pytest/pytest -rA/' tox.ini",
],
'test_cmd': 'tox --current-env -epy37 -v --',
}
for k in ['4.5', '5.0', '5.1', '5.2']
}
)
SPHINX_CONFIG.update(
{
k: {
'conda_env': 'sphinx_60',
'install': 'python -m pip install -e .[test]',
'pre_install': [
"sed -i 's/pytest/pytest -rA/' tox.ini",
],
'test_cmd': 'tox --current-env -epy39 -v --',
}
for k in ['6.0', '6.2', '7.0', '7.1']
}
)
SPHINX_CONFIG.update(
{
k: {
'conda_env': 'sphinx_72',
'install': 'python -m pip install -e .[test]',
'pre_install': [
"sed -i 's/pytest/pytest -rA/' tox.ini",
'apt-get update && apt-get install -y graphviz',
],
'test_cmd': 'tox --current-env -epy39 -v --',
}
for k in ['7.2', '7.3', '7.4']
}
)
SPHINX_CONFIG.update(
{
k: {
'conda_env': 'sphinx_80',
'install': 'python -m pip install -e .[test]',
'pre_install': [
"sed -i 's/pytest/pytest -rA/' tox.ini",
],
'test_cmd': 'tox --current-env -epy310 -v --',
}
for k in ['8.0', '8.1']
}
)
SEABORN_CONFIG = {}
SEABORN_CONFIG.update(
{
k: {
'conda_env': 'seaborn_010',
'install': 'pip install -e .[dev]',
'test_cmd': 'pytest --color=no -rA',
}
for k in ['0.3', '0.4', '0.5', '0.6', '0.11', '0.12', '0.13', '0.14']
}
)
XARRAY_CONFIG = {}
XARRAY_CONFIG.update(
{
k: {
'conda_env': 'xarray_0014',
'install': 'pip install -e .',
'test_cmd': 'pytest --color=no -rA',
}
for k in ['0014', '0015', '0016']
}
)
XARRAY_CONFIG.update(
{
k: {
'conda_env': 'xarray_0017',
'install': 'pip install -e .',
'test_cmd': 'pytest --color=no -rA',
}
for k in ['0017', '0018', '0019', '0020', '0021']
}
)
XARRAY_CONFIG.update(
{
k: {
'conda_env': 'xarray_2203',
'install': 'pip install -e .',
'test_cmd': 'pytest --color=no -rA',
}
for k in ['2203', '2206', '2209', '2210', '2211', '2212']
}
)
XARRAY_CONFIG.update(
{
k: {
'conda_env': 'xarray_2303',
'install': 'pip install -e .',
'test_cmd': 'pytest --color=no -rA',
}
for k in [
'2303',
'2304',
'2305',
'2306',
'2308',
'2309',
'2310',
'2311',
'2312',
]
}
)
XARRAY_CONFIG.update(
{
k: {
'conda_env': 'xarray_2401',
'install': 'pip install -e .',
'test_cmd': 'pytest --color=no -rA',
}
for k in ['2401', '2402', '2403', '2405', '2407', '2409', '2410', '2411']
}
)
SKLEARN_CONFIG = {}
SKLEARN_CONFIG.update(
{
k: {
'conda_env': 'skl_020',
'install': 'pip install -v --no-use-pep517 --no-build-isolation -e .',
'test_cmd': 'pytest --color=no -rA',
}
for k in ['0.20', '0.21', '0.22']
}
)
SKLEARN_CONFIG.update(
{
k: {
'conda_env': 'skl_100',
'install': 'pip install -v --no-use-pep517 --no-build-isolation -e .',
'test_cmd': 'pytest --color=no -rA',
}
for k in ['0.23', '0.24', '1.00', '1.01', '1.02']
}
)
SKLEARN_CONFIG.update(
{
k: {
'conda_env': 'skl_104',
'install': 'pip install -v --no-use-pep517 --no-build-isolation -e .',
'test_cmd': 'pytest --color=no -rA',
}
for k in ['1.03', '1.04', '1.05', '1.06', '1.07']
}
)
MAP_REPO_TO_CONFIG = {
'pydata/xarray': XARRAY_CONFIG,
'mwaskom/seaborn': SEABORN_CONFIG,
'scikit-learn/scikit-learn': SKLEARN_CONFIG,
'sphinx-doc/sphinx': SPHINX_CONFIG,
'django/django': DJANGO_CONFIG,
'astropy/astropy': ASTROPY_CONFIG,
'pylint-dev/pylint': PYLINT_CONFIG,
'pytest-dev/pytest': PYTEST_CONFIG,
'psf/requests': REQUESTS_CONFIG,
'sympy/sympy': SYMPY_CONFIG,
'matplotlib/matplotlib': MATPLOTLIB_CONFIG,
}
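# Illustrative lookup (assumption: 'repo' and 'version' keys match the dataset
# fields, which is how initialize_runtime() in run_infer_nc.py uses this map):
if __name__ == '__main__':
    cfg = MAP_REPO_TO_CONFIG['django/django']['4.2']
    print(cfg['conda_env'])  # -> 'django_42'
    print(cfg['test_cmd'])   # -> 'python tests/runtests.py --verbosity 2'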

View File

@ -0,0 +1,65 @@
<uploaded_files>
/workspace/{{ workspace_dir_name }}
</uploaded_files>
I've uploaded a python code repository in the directory {{ workspace_dir_name }}. Consider the following issue description:
<doc_change>
{{ instance.problem_statement }}
</doc_change>
Can you help me add the new features to the repository based on the changes in the <doc_change>?
I've already taken care of all changes to any of the test files described in the <doc_change>. This means you DON'T have to modify the testing logic or any of the tests in any way!
Also the development Python environment is already set up for you (i.e., all dependencies already installed), so you don't need to install other packages.
Your task is to make the minimal changes to non-test files in the /workspace/{{ workspace_dir_name }} directory to implement the new features required by the documentation updates.
Follow these phases to resolve the issue:
Phase 1. READING: read the requirements and reword them in clearer terms
1.1 If there are code or config snippets, express in words any best practices or conventions in them.
1.2 Highlight method names, variables, file names, stack traces, and technical details, particularly those related to new features.
1.3 Explain the new feature requirements in clear terms.
1.4 Specify functional scope and expected behavior of new features.
1.5 Highlight any best practices to take into account when developing and testing the new feature.
Phase 2. RUNNING: install and run the functionality in the repository to validate the new features
2.1 Follow the README.
2.2 Install the environment and anything needed.
2.3 Iterate and figure out how to validate the newly added features.
Phase 3. EXPLORATION: find the files related to the new features and possible implementation solutions
3.1 Use `grep` to search for relevant methods, classes, keywords and feature requirements.
3.2 Identify all files related to the new features.
3.3 Propose the methods and files to implement the new features and explain why.
3.4 From the possible file locations, select the most likely location to implement the new features.
Phase 4. TEST CREATION: before implementing any new features, create a script to validate the feature's correctness.
4.1 Look at existing test files in the repository to understand the test format/structure.
4.2 Create a minimal validation script to verify the newly added features.
4.3 Run the validation script to confirm the new features are successfully added and working as expected.
4.4 Adjust the validation script as necessary to ensure the new features fully meet the requirements.
Phase 5. FEATURE ANALYSIS: state clearly the new feature and how to implement it
5.1 State clearly what the new feature is.
5.2 State clearly where the feature should be implemented.
5.3 State clearly how the test validates the new feature.
5.4 State clearly the best practices to take into account when implementing the new feature.
5.5 State clearly how to implement the new feature.
Phase 6. FEATURE IMPLEMENTATION: edit the source code to implement your chosen solution for the new feature
6.1 Make minimal, focused changes to implement the new feature.
Phase 7. VERIFICATION: Test your new feature thoroughly.
7.1 Run your validation script to verify the new feature works as expected.
7.2 Add edge cases to your test script to ensure comprehensive coverage of the new feature.
7.3 Run existing tests related to the modified code to ensure you haven't broken anything.
Phase 8. FINAL REVIEW: Carefully re-read the feature requirements and compare your changes with the base commit {{ instance.base_commit }}
8.1 Ensure you've fully implemented all required features.
8.2 Run any tests in the repository related to:
8.2.1 The new features you are adding
8.2.2 The files you modified
8.2.3 The functions you changed
8.3 If any tests fail, revise your implementation until all tests pass and the new feature works as expected.
Be thorough in your exploration, testing, and reasoning. It's fine if your thinking process is lengthy - quality and completeness are more important than brevity.

View File

@ -0,0 +1,39 @@
"""Mapping instance_id to resource_factor.
Different instances may have different resource requirements.
e.g., some instances may require more memory/CPU to run inference.
This file tracks the resource requirements of different instances.
"""
import json
import os
from openhands.core.logger import openhands_logger as logger
CUR_DIR = os.path.dirname(os.path.abspath(__file__))
DEFAULT_RUNTIME_RESOURCE_FACTOR = int(
os.environ.get('DEFAULT_RUNTIME_RESOURCE_FACTOR', 1)
)
# dataset to resource mapping
_global_resource_mapping: dict[str, dict[str, float]] = {}
def get_resource_mapping(dataset_name: str) -> dict[str, float] | None:
if dataset_name not in _global_resource_mapping:
file_path = os.path.join(CUR_DIR, f'{dataset_name}.json')
if not os.path.exists(file_path):
logger.info(f'Resource mapping for {dataset_name} not found.')
return None
with open(file_path, 'r') as f:
_global_resource_mapping[dataset_name] = json.load(f)
logger.debug(f'Loaded resource mapping for {dataset_name}')
return _global_resource_mapping[dataset_name]
def get_instance_resource_factor(dataset_name: str, instance_id: str) -> int:
resource_mapping = get_resource_mapping(dataset_name)
if resource_mapping is None:
return DEFAULT_RUNTIME_RESOURCE_FACTOR
return int(resource_mapping.get(instance_id, DEFAULT_RUNTIME_RESOURCE_FACTOR))
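# Illustrative usage (dataset and instance ids below are hypothetical). With no
# '<dataset_name>.json' mapping file next to this module, the call simply falls
# back to DEFAULT_RUNTIME_RESOURCE_FACTOR.
if __name__ == '__main__':
    print(get_instance_resource_factor('NoCode-bench_Verified-test', 'example__repo-1'))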

View File

@ -0,0 +1,909 @@
import asyncio
import copy
import json
import os
import tempfile
from typing import Any, Literal
import numpy as np
import pandas as pd
import toml
from datasets import load_dataset
from jinja2 import Environment, FileSystemLoader
import openhands.agenthub
from evaluation.benchmarks.nocode_bench.binary_patch_utils import (
remove_binary_diffs,
remove_binary_files_from_git,
)
from evaluation.benchmarks.nocode_bench.consistants import MAP_REPO_TO_CONFIG
from evaluation.benchmarks.nocode_bench.resource.mapping import (
get_instance_resource_factor,
)
from evaluation.benchmarks.nocode_bench.scripts.utils.evaluation_utils import (
run_evaluation_nocode_bench,
)
from evaluation.utils.shared import (
EvalException,
EvalMetadata,
EvalOutput,
assert_and_raise,
codeact_user_response,
get_default_sandbox_config_for_eval,
get_metrics,
is_fatal_evaluation_error,
make_metadata,
prepare_dataset,
reset_logger_for_multiprocessing,
update_llm_config_for_completions_logging,
)
from openhands.controller.state.state import State
from openhands.core.config import (
AgentConfig,
OpenHandsConfig,
get_evaluation_parser,
get_llm_config_arg,
)
from openhands.core.config.condenser_config import NoOpCondenserConfig
from openhands.core.config.utils import get_condenser_config_arg
from openhands.core.logger import openhands_logger as logger
from openhands.core.main import create_runtime, run_controller
from openhands.critic import AgentFinishedCritic
from openhands.events.action import CmdRunAction, FileReadAction, MessageAction
from openhands.events.observation import (
CmdOutputObservation,
ErrorObservation,
FileReadObservation,
)
from openhands.events.serialization.event import event_from_dict, event_to_dict
from openhands.runtime.base import Runtime
from openhands.utils.async_utils import call_async_from_sync
from openhands.utils.shutdown_listener import sleep_if_should_continue
USE_HINT_TEXT = os.environ.get('USE_HINT_TEXT', 'false').lower() == 'true'
RUN_WITH_BROWSING = os.environ.get('RUN_WITH_BROWSING', 'false').lower() == 'true'
ENABLE_LLM_EDITOR = os.environ.get('ENABLE_LLM_EDITOR', 'false').lower() == 'true'
BenchMode = Literal['swe', 'swt', 'swt-ci']
# Global variable to track dataset type
DATASET_TYPE = 'nc_bench'
def set_dataset_type(dataset_name: str) -> None:
"""Set dataset type based on dataset name."""
global DATASET_TYPE
DATASET_TYPE = 'nc_bench'
logger.info(f'Dataset type set to: {DATASET_TYPE}')
AGENT_CLS_TO_FAKE_USER_RESPONSE_FN = {
'CodeActAgent': codeact_user_response,
}
def _get_swebench_workspace_dir_name(instance: pd.Series) -> str:
return f'{instance.repo.split("/")[-1]}'
def get_instruction(instance: pd.Series, metadata: EvalMetadata) -> MessageAction:
workspace_dir_name = _get_swebench_workspace_dir_name(instance)
metadata.details['mode']
# Determine the template file based on mode and LLM
template_name = 'nc.j2'
# Set up Jinja2 environment
    # Templates are in 'evaluation/benchmarks/nocode_bench/prompts' relative to this script
prompts_dir = os.path.join(os.path.dirname(__file__), 'prompts')
env = Environment(loader=FileSystemLoader(prompts_dir))
template = env.get_template(template_name)
# Prepare context for rendering
context = {
'instance': instance,
'workspace_dir_name': workspace_dir_name,
'metadata': metadata, # Pass metadata if needed in templates
}
context['test_instructions'] = '' # Ensure it's defined for other modes
# Render the instruction
instruction = template.render(context)
if RUN_WITH_BROWSING:
instruction += (
'<IMPORTANT!>\nYou SHOULD NEVER attempt to browse the web. </IMPORTANT!>\n'
)
if 'image_assets' in instance:
assets = json.loads(instance['image_assets'])
assert 'problem_statement' in assets, (
'problem_statement is required in image_assets'
)
image_urls = assets['problem_statement']
return MessageAction(content=instruction, image_urls=image_urls)
return MessageAction(content=instruction)
DEFAULT_DOCKER_IMAGE_PREFIX = os.environ.get(
'EVAL_DOCKER_IMAGE_PREFIX', 'docker.io/xingyaoww/'
)
logger.info(f'Default docker image prefix: {DEFAULT_DOCKER_IMAGE_PREFIX}')
def get_instance_docker_image(
instance_id: str,
swebench_official_image: bool = False,
) -> str:
if swebench_official_image:
# Official NoCode-Bench image
image_name = f'ncbench_{instance_id}:latest'.lower()
logger.debug(f'Using official NoCode-Bench image: {image_name}')
return image_name
    else:
        raise ValueError(
            'Only the official NoCode-Bench instance image is supported '
            '(swebench_official_image must be True).'
        )
def get_config(
instance: pd.Series,
metadata: EvalMetadata,
) -> OpenHandsConfig:
    # We use a different instance image for each instance of the NoCode-bench eval
use_swebench_official_image = True
base_container_image = get_instance_docker_image(
instance['instance_id'],
swebench_official_image=use_swebench_official_image,
)
logger.info(
f'Using instance container image: {base_container_image}. '
f'Please make sure this image exists. '
f'Submit an issue on https://github.com/All-Hands-AI/OpenHands if you run into any issues.'
)
sandbox_config = get_default_sandbox_config_for_eval()
sandbox_config.base_container_image = base_container_image
sandbox_config.enable_auto_lint = True
sandbox_config.use_host_network = False
# Add platform to the sandbox config to solve issue 4401
sandbox_config.platform = 'linux/amd64'
sandbox_config.remote_runtime_resource_factor = get_instance_resource_factor(
dataset_name=metadata.dataset,
instance_id=instance['instance_id'],
)
config = OpenHandsConfig(
default_agent=metadata.agent_class,
run_as_openhands=False,
max_iterations=metadata.max_iterations,
runtime=os.environ.get('RUNTIME', 'docker'),
sandbox=sandbox_config,
# do not mount workspace
workspace_base=None,
workspace_mount_path=None,
)
config.set_llm_config(
update_llm_config_for_completions_logging(
metadata.llm_config, metadata.eval_output_dir, instance['instance_id']
)
)
# get 'draft_editor' config if exists
config.set_llm_config(get_llm_config_arg('draft_editor'), 'draft_editor')
agent_config = AgentConfig(
enable_jupyter=False,
enable_browsing=RUN_WITH_BROWSING,
enable_llm_editor=ENABLE_LLM_EDITOR,
enable_mcp=False,
condenser=metadata.condenser_config,
enable_prompt_extensions=False,
)
config.set_agent_config(agent_config)
return config
def make_serializable(obj):
if isinstance(obj, pd.Series):
obj = obj.to_dict()
if isinstance(obj, dict):
return {k: make_serializable(v) for k, v in obj.items()}
elif isinstance(obj, list):
return [make_serializable(v) for v in obj]
elif isinstance(obj, tuple):
return tuple(make_serializable(v) for v in obj)
elif isinstance(obj, np.ndarray):
return obj.tolist()
elif isinstance(obj, pd.Timestamp):
return str(obj)
else:
return obj
def initialize_runtime(
runtime: Runtime,
    instance: pd.Series,
metadata: EvalMetadata,
):
"""Initialize the runtime for the agent.
This function is called before the runtime is used to run the agent.
"""
logger.info('-' * 30)
logger.info('BEGIN Runtime Initialization Fn')
logger.info('-' * 30)
workspace_dir_name = _get_swebench_workspace_dir_name(instance)
obs: CmdOutputObservation
# Set instance id and git configuration
action = CmdRunAction(
command=f"""echo 'export SWE_INSTANCE_ID={instance['instance_id']}' >> ~/.bashrc && echo 'export PIP_CACHE_DIR=~/.cache/pip' >> ~/.bashrc && echo "alias git='git --no-pager'" >> ~/.bashrc && git config --global core.pager "" && git config --global diff.binary false"""
)
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
obs.exit_code == 0,
f'Failed to export SWE_INSTANCE_ID and configure git: {str(obs)}',
)
action = CmdRunAction(command="""export USER=$(whoami); echo USER=${USER} """)
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(obs.exit_code == 0, f'Failed to export USER: {str(obs)}')
# inject the init script
script_dir = os.path.dirname(__file__)
# inject the instance info
action = CmdRunAction(command='mkdir -p /swe_util/eval_data/instances')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
obs.exit_code == 0,
f'Failed to create /swe_util/eval_data/instances: {str(obs)}',
)
swe_instance_json_name = 'swe-bench-instance.json'
with tempfile.TemporaryDirectory() as temp_dir:
# Construct the full path for the desired file name within the temporary directory
temp_file_path = os.path.join(temp_dir, swe_instance_json_name)
# Write to the file with the desired name within the temporary directory
with open(temp_file_path, 'w') as f:
if not isinstance(instance, dict):
instance_dict = make_serializable(instance)
else:
instance_dict = dict(instance)
if DATASET_TYPE == 'nc_bench':
config = MAP_REPO_TO_CONFIG.get(instance['repo'], {}).get(
                    instance['version'], {}
)
docker_conda_env_name = config['conda_env']
instance_dict['conda_env'] = docker_conda_env_name
json.dump([instance_dict], f)
# Copy the file to the desired location
runtime.copy_to(temp_file_path, '/swe_util/eval_data/instances/')
# inject the instance swe entry
entry_script_path = 'instance_nc_entry.sh'
runtime.copy_to(
str(os.path.join(script_dir, f'scripts/setup/{entry_script_path}')),
'/swe_util/',
)
action = CmdRunAction(command='cat ~/.bashrc')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(obs.exit_code == 0, f'Failed to cat ~/.bashrc: {str(obs)}')
action = CmdRunAction(command='source ~/.bashrc')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
if isinstance(obs, ErrorObservation):
logger.error(f'Failed to source ~/.bashrc: {str(obs)}')
assert_and_raise(obs.exit_code == 0, f'Failed to source ~/.bashrc: {str(obs)}')
action = CmdRunAction(command=f'source /swe_util/{entry_script_path}')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
obs.exit_code == 0,
f'Failed to source /swe_util/{entry_script_path}: {str(obs)}',
)
action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
obs.exit_code == 0,
f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
)
action = CmdRunAction(command='git reset --hard')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(obs.exit_code == 0, f'Failed to git reset --hard: {str(obs)}')
action = CmdRunAction(
command='for remote_name in $(git remote); do git remote remove "${remote_name}"; done'
)
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')
if DATASET_TYPE != 'Multimodal' and DATASET_TYPE != 'SWE-bench-Live':
# Only for non-multimodal datasets, we need to activate the testbed environment for Python
# SWE-Bench multimodal datasets and SWE-bench-Live are not using the testbed environment
action = CmdRunAction(command='which python')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
obs.exit_code == 0,
f'Expected to find python interpreter, but got: {str(obs)}',
)
logger.info('-' * 30)
logger.info('END Runtime Initialization Fn')
logger.info('-' * 30)
def complete_runtime(
runtime: Runtime,
instance: pd.Series, # this argument is not required, but it is used to get the workspace_dir_name
) -> dict[str, Any]:
"""Complete the runtime for the agent.
    This function is called after the agent has finished running.
If you need to do something in the sandbox to get the correctness metric after
the agent has run, modify this function.
"""
logger.info('-' * 30)
logger.info('BEGIN Runtime Completion Fn')
logger.info('-' * 30)
obs: CmdOutputObservation
workspace_dir_name = _get_swebench_workspace_dir_name(instance)
action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
if obs.exit_code == -1:
# The previous command is still running
# We need to kill previous command
logger.info('The previous command is still running, trying to kill it...')
action = CmdRunAction(command='C-c')
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
# Then run the command again
action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
if obs.exit_code == -1:
# The previous command is still running
# We need to kill previous command
logger.info('The previous command is still running, trying to ctrl+z it...')
action = CmdRunAction(command='C-z')
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
# Then run the command again
action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
)
action = CmdRunAction(command='git config --global core.pager ""')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
f'Failed to git config --global core.pager "": {str(obs)}',
)
# First check for any git repositories in subdirectories
action = CmdRunAction(command='find . -type d -name .git -not -path "./.git"')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
f'Failed to find git repositories: {str(obs)}',
)
git_dirs = [p for p in obs.content.strip().split('\n') if p]
if git_dirs:
# Remove all .git directories in subdirectories
for git_dir in git_dirs:
action = CmdRunAction(command=f'rm -rf "{git_dir}"')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
f'Failed to remove git directory {git_dir}: {str(obs)}',
)
# add all files
action = CmdRunAction(command='git add -A')
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
f'Failed to git add -A: {str(obs)}',
)
# Remove binary files from git staging
action = CmdRunAction(command=remove_binary_files_from_git())
action.set_hard_timeout(600)
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
assert_and_raise(
isinstance(obs, CmdOutputObservation) and obs.exit_code == 0,
f'Failed to remove binary files: {str(obs)}',
)
n_retries = 0
git_patch = None
while n_retries < 5:
action = CmdRunAction(
command=f'git diff --no-color --cached {instance["base_commit"]} > patch.diff'
)
action.set_hard_timeout(max(300 + 100 * n_retries, 600))
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
n_retries += 1
if isinstance(obs, CmdOutputObservation):
if obs.exit_code == 0:
# Read the patch file
action = FileReadAction(path='patch.diff')
action.set_hard_timeout(max(300 + 100 * n_retries, 600))
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
if isinstance(obs, FileReadObservation):
git_patch = obs.content
break
elif isinstance(obs, ErrorObservation):
# Fall back to cat "patch.diff" to get the patch
assert 'File could not be decoded as utf-8' in obs.content
action = CmdRunAction(command='cat patch.diff')
action.set_hard_timeout(max(300 + 100 * n_retries, 600))
logger.info(action, extra={'msg_type': 'ACTION'})
obs = runtime.run_action(action)
assert isinstance(obs, CmdOutputObservation) and obs.exit_code == 0
logger.info(obs, extra={'msg_type': 'OBSERVATION'})
git_patch = obs.content
break
else:
assert_and_raise(False, f'Unexpected observation type: {str(obs)}')
else:
logger.info('Failed to get git diff, retrying...')
sleep_if_should_continue(10)
elif isinstance(obs, ErrorObservation):
logger.error(f'Error occurred: {obs.content}. Retrying...')
sleep_if_should_continue(10)
else:
assert_and_raise(False, f'Unexpected observation type: {str(obs)}')
assert_and_raise(git_patch is not None, 'Failed to get git diff (None)')
# Remove binary diffs from the patch
git_patch = remove_binary_diffs(git_patch)
logger.info('-' * 30)
logger.info('END Runtime Completion Fn')
logger.info('-' * 30)
return {'git_patch': git_patch}
def process_instance(
instance: pd.Series,
metadata: EvalMetadata,
reset_logger: bool = True,
runtime_failure_count: int = 0,
) -> EvalOutput:
config = get_config(instance, metadata)
# Setup the logger properly, so you can run multi-processing to parallelize the evaluation
if reset_logger:
log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
else:
logger.info(f'Starting evaluation for instance {instance.instance_id}.')
# Increase resource_factor with increasing attempt_id
if runtime_failure_count > 0:
config.sandbox.remote_runtime_resource_factor = min(
config.sandbox.remote_runtime_resource_factor * (2**runtime_failure_count),
8,
)
logger.warning(
f'This is the {runtime_failure_count + 1}th attempt for instance {instance.instance_id}, setting resource factor to {config.sandbox.remote_runtime_resource_factor}'
)
metadata = copy.deepcopy(metadata)
metadata.details['runtime_failure_count'] = runtime_failure_count
metadata.details['remote_runtime_resource_factor'] = (
config.sandbox.remote_runtime_resource_factor
)
runtime = create_runtime(config)
call_async_from_sync(runtime.connect)
try:
initialize_runtime(runtime, instance, metadata)
message_action = get_instruction(instance, metadata)
# Here's how you can run the agent (similar to the `main` function) and get the final task state
state: State | None = asyncio.run(
run_controller(
config=config,
initial_user_action=message_action,
runtime=runtime,
fake_user_response_fn=AGENT_CLS_TO_FAKE_USER_RESPONSE_FN[
metadata.agent_class
],
)
)
# if fatal error, throw EvalError to trigger re-run
if is_fatal_evaluation_error(state.last_error):
raise EvalException('Fatal error detected: ' + state.last_error)
# ======= THIS IS SWE-Bench specific =======
# Get git patch
if DATASET_TYPE == 'SWE-bench-Live':
from evaluation.benchmarks.swe_bench.live_utils import (
complete_runtime as complete_runtime_fn,
)
else:
complete_runtime_fn = complete_runtime
return_val = complete_runtime_fn(runtime, instance)
git_patch = return_val['git_patch']
logger.info(
f'Got git diff for instance {instance.instance_id}:\n--------\n{git_patch}\n--------'
)
finally:
runtime.close()
# ==========================================
# ======= Attempt to evaluate the agent's edits =======
# we use eval_infer.sh to evaluate the agent's edits, not here
# because the agent may alter the environment / testcases
test_result = {
'git_patch': git_patch,
}
# If you are working on some simpler benchmark that only evaluates the final model output (e.g., in a MessageAction)
# You can simply get the LAST `MessageAction` from the returned `state.history` and parse it for evaluation.
if state is None:
raise ValueError('State should not be None.')
# NOTE: this is NO LONGER the event stream, but an agent history that includes delegate agent's events
histories = [event_to_dict(event) for event in state.history]
metrics = get_metrics(state)
# Save the output
instruction = message_action.content
if message_action.image_urls:
instruction += (
'\n\n<image_urls>' + '\n'.join(message_action.image_urls) + '</image_urls>'
)
output = EvalOutput(
instance_id=instance.instance_id,
instruction=instruction,
instance=instance.to_dict(), # SWE Bench specific
test_result=test_result,
metadata=metadata,
history=histories,
metrics=metrics,
error=state.last_error if state and state.last_error else None,
)
return output
def filter_dataset(dataset: pd.DataFrame, filter_column: str) -> pd.DataFrame:
file_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.toml')
if os.path.exists(file_path):
with open(file_path, 'r') as file:
data = toml.load(file)
if 'selected_ids' in data:
selected_ids = data['selected_ids']
logger.info(
f'Filtering {len(selected_ids)} tasks from "selected_ids"...'
)
subset = dataset[dataset[filter_column].isin(selected_ids)]
logger.info(f'Retained {subset.shape[0]} tasks after filtering')
return subset
if 'selected_repos' in data:
# repos for the swe-bench instances:
# ['astropy/astropy', 'django/django', 'matplotlib/matplotlib', 'mwaskom/seaborn', 'pallets/flask', 'psf/requests', 'pydata/xarray', 'pylint-dev/pylint', 'pytest-dev/pytest', 'scikit-learn/scikit-learn', 'sphinx-doc/sphinx', 'sympy/sympy']
selected_repos = data['selected_repos']
if isinstance(selected_repos, str):
selected_repos = [selected_repos]
assert isinstance(selected_repos, list)
logger.info(
f'Filtering {selected_repos} tasks from "selected_repos"...'
)
subset = dataset[dataset['repo'].isin(selected_repos)]
logger.info(f'Retained {subset.shape[0]} tasks after filtering')
return subset
skip_ids = os.environ.get('SKIP_IDS', '').split(',')
if len(skip_ids) > 0:
logger.info(f'Filtering {len(skip_ids)} tasks from "SKIP_IDS"...')
return dataset[~dataset[filter_column].isin(skip_ids)]
return dataset
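# Illustrative config.toml (optional, placed next to this script) for the
# filtering above; the ids and repos below are hypothetical examples:
#
#   selected_ids = ["astropy__astropy-12907"]
#   # or
#   selected_repos = ["django/django"]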
if __name__ == '__main__':
parser = get_evaluation_parser()
parser.add_argument(
'--dataset',
type=str,
default='NoCode-bench/NoCode-bench_Verified',
        help='dataset to evaluate on',
)
parser.add_argument(
'--split',
type=str,
default='test',
help='split to evaluate on',
)
parser.add_argument(
'--mode',
type=str,
default='swe',
choices=['swe', 'swt', 'swt-ci'],
help="mode to run the evaluation, either 'swe', 'swt', or 'swt-ci'",
)
args, _ = parser.parse_known_args()
# NOTE: It is preferable to load datasets from huggingface datasets and perform post-processing
# so we don't need to manage file uploading to OpenHands's repo
    dataset = load_dataset(args.dataset, split=args.split)
# Set the global dataset type based on dataset name
set_dataset_type(args.dataset)
swe_bench_tests = filter_dataset(dataset.to_pandas(), 'instance_id')
logger.info(
f'Loaded dataset {args.dataset} with split {args.split}: {len(swe_bench_tests)} tasks'
)
llm_config = None
if args.llm_config:
llm_config = get_llm_config_arg(args.llm_config)
llm_config.log_completions = True
        # modify_params must be False for evaluation purposes, for reproducibility and accuracy of results
llm_config.modify_params = False
if llm_config is None:
raise ValueError(f'Could not find LLM config: --llm_config {args.llm_config}')
# Get condenser config from environment variable
condenser_name = os.environ.get('EVAL_CONDENSER')
if condenser_name:
condenser_config = get_condenser_config_arg(condenser_name)
if condenser_config is None:
raise ValueError(
f'Could not find Condenser config: EVAL_CONDENSER={condenser_name}'
)
else:
# If no specific condenser config is provided via env var, default to NoOpCondenser
condenser_config = NoOpCondenserConfig()
logger.debug(
'No Condenser config provided via EVAL_CONDENSER, using NoOpCondenser.'
)
details = {'mode': args.mode}
_agent_cls = openhands.agenthub.Agent.get_cls(args.agent_cls)
    dataset_description = (
args.dataset.replace('/', '__') + '-' + args.split.replace('/', '__')
)
metadata = make_metadata(
llm_config,
        dataset_description,
args.agent_cls,
args.max_iterations,
args.eval_note,
args.eval_output_dir,
details=details,
condenser_config=condenser_config,
)
output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
print(f'### OUTPUT FILE: {output_file} ###')
# Run evaluation in iterative mode:
# If a rollout fails to output AgentFinishAction, we will try again until it succeeds OR total 3 attempts have been made.
ITERATIVE_EVAL_MODE = (
os.environ.get('ITERATIVE_EVAL_MODE', 'false').lower() == 'true'
)
ITERATIVE_EVAL_MODE_MAX_ATTEMPTS = int(
os.environ.get('ITERATIVE_EVAL_MODE_MAX_ATTEMPTS', '3')
)
if not ITERATIVE_EVAL_MODE:
# load the dataset
instances = prepare_dataset(swe_bench_tests, output_file, args.eval_n_limit)
if len(instances) > 0 and not isinstance(
instances['PASS2PASS'][instances['PASS2PASS'].index[0]], str
):
for col in ['PASS2PASS', 'FAIL2PASS']:
instances[col] = instances[col].apply(lambda x: str(x))
run_evaluation_nocode_bench(
instances,
metadata,
output_file,
args.eval_num_workers,
process_instance,
timeout_seconds=8
* 60
* 60, # 8 hour PER instance should be more than enough
max_retries=5,
)
else:
critic = AgentFinishedCritic()
def get_cur_output_file_path(attempt: int) -> str:
return (
f'{output_file.removesuffix(".jsonl")}.critic_attempt_{attempt}.jsonl'
)
eval_ids = None
for attempt in range(1, ITERATIVE_EVAL_MODE_MAX_ATTEMPTS + 1):
cur_output_file = get_cur_output_file_path(attempt)
logger.info(
f'Running evaluation with critic {critic.__class__.__name__} for attempt {attempt} of {ITERATIVE_EVAL_MODE_MAX_ATTEMPTS}.'
)
            # If the config is fully deterministic (temperature 0), bump the temperature
            # to 0.1 on retry attempts so we get slightly different rollouts.
if attempt > 1 and metadata.llm_config.temperature == 0:
logger.info(
f'Detected temperature is 0 for (>1) attempt {attempt}. Setting temperature to 0.1...'
)
metadata.llm_config.temperature = 0.1
# Load instances - at first attempt, we evaluate all instances
# On subsequent attempts, we only evaluate the instances that failed the previous attempt determined by critic
instances = prepare_dataset(
swe_bench_tests, cur_output_file, args.eval_n_limit, eval_ids=eval_ids
)
if len(instances) > 0 and not isinstance(
instances['PASS2PASS'][instances['PASS2PASS'].index[0]], str
):
for col in ['PASS2PASS', 'FAIL2PASS']:
instances[col] = instances[col].apply(lambda x: str(x))
# Run evaluation - but save them to cur_output_file
logger.info(
f'Evaluating {len(instances)} instances for attempt {attempt}...'
)
run_evaluation_nocode_bench(
instances,
metadata,
cur_output_file,
args.eval_num_workers,
process_instance,
timeout_seconds=8
* 60
* 60, # 8 hour PER instance should be more than enough
max_retries=5,
)
# When eval is done, we update eval_ids to the instances that failed the current attempt
instances_failed = []
logger.info(
f'Use critic {critic.__class__.__name__} to check {len(instances)} instances for attempt {attempt}...'
)
with open(cur_output_file, 'r') as f:
for line in f:
instance = json.loads(line)
try:
history = [
event_from_dict(event) for event in instance['history']
]
critic_result = critic.evaluate(
history, instance['test_result'].get('git_patch', '')
)
if not critic_result.success:
instances_failed.append(instance['instance_id'])
except Exception as e:
logger.error(
f'Error loading history for instance {instance["instance_id"]}: {e}'
)
instances_failed.append(instance['instance_id'])
logger.info(
f'{len(instances_failed)} instances failed the current attempt {attempt}: {instances_failed}'
)
eval_ids = instances_failed
# If no instances failed, we break
if len(instances_failed) == 0:
break
# Then we should aggregate the results from all attempts into the original output file
# and remove the intermediate files
logger.info(
'Aggregating results from all attempts into the original output file...'
)
fout = open(output_file, 'w')
added_instance_ids = set()
for attempt in reversed(range(1, ITERATIVE_EVAL_MODE_MAX_ATTEMPTS + 1)):
cur_output_file = get_cur_output_file_path(attempt)
if not os.path.exists(cur_output_file):
logger.warning(
f'Intermediate output file {cur_output_file} does not exist. Skipping...'
)
continue
with open(cur_output_file, 'r') as f:
for line in f:
instance = json.loads(line)
# Also make sure git_patch is not empty - otherwise we fall back to previous attempt (empty patch is worse than anything else)
if (
instance['instance_id'] not in added_instance_ids
and instance['test_result'].get('git_patch', '').strip()
):
fout.write(line)
added_instance_ids.add(instance['instance_id'])
logger.info(
f'Aggregated instances from {cur_output_file}. Total instances added so far: {len(added_instance_ids)}'
)
fout.close()
logger.info(
f'Done! Total {len(added_instance_ids)} instances added to {output_file}'
)

View File

@ -0,0 +1,33 @@
import argparse
import json
def main(output_jsonl: str):
with open(output_jsonl, 'r') as f:
for line in f:
try:
output = json.loads(line)
pred = {
'instance_id': output['instance_id'],
'model_name_or_path': output['metadata']['llm_config']['model'],
'model_patch': output['test_result']['git_patch'],
}
            except Exception as e:
                print(f'Error while reading an output line: {e}')
                continue
            print(json.dumps(pred))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--output_jsonl',
type=str,
required=True,
        help='Path to the OpenHands output file (.../output.jsonl)',
)
args = parser.parse_args()
main(args.output_jsonl)
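# Example invocation (the path is a placeholder); predictions are printed to
# stdout, one JSON record per line:
#   python evaluation/benchmarks/nocode_bench/scripts/eval/convert.py \
#       --output_jsonl <path_to_output.jsonl> > all_preds.jsonl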

View File

@ -0,0 +1,104 @@
import argparse
import pandas as pd
from openhands.core.logger import openhands_logger as logger
def verify_instance_costs(row: pd.Series) -> float:
"""
Verifies that the accumulated_cost matches the sum of individual costs in metrics.
Also checks for duplicate consecutive costs which might indicate buggy counting.
If the consecutive costs are identical, the file is affected by this bug:
https://github.com/All-Hands-AI/OpenHands/issues/5383
Args:
row: DataFrame row containing instance data with metrics
Returns:
float: The verified total cost for this instance (corrected if needed)
"""
try:
metrics = row.get('metrics')
if not metrics:
logger.warning(f'Instance {row["instance_id"]}: No metrics found')
return 0.0
accumulated = metrics.get('accumulated_cost')
costs = metrics.get('costs', [])
if accumulated is None:
logger.warning(
f'Instance {row["instance_id"]}: No accumulated_cost in metrics'
)
return 0.0
# Check for duplicate consecutive costs and systematic even-odd pairs
has_duplicate = False
all_pairs_match = True
# Check each even-odd pair (0-1, 2-3, etc.)
for i in range(0, len(costs) - 1, 2):
if abs(costs[i]['cost'] - costs[i + 1]['cost']) < 1e-6:
has_duplicate = True
logger.debug(
f'Instance {row["instance_id"]}: Possible buggy double-counting detected! '
f'Steps {i} and {i + 1} have identical costs: {costs[i]["cost"]:.2f}'
)
else:
all_pairs_match = False
break
# Calculate total cost, accounting for buggy double counting if detected
if len(costs) >= 2 and has_duplicate and all_pairs_match:
paired_steps_cost = sum(
cost_entry['cost']
for cost_entry in costs[: -1 if len(costs) % 2 else None]
)
real_paired_cost = paired_steps_cost / 2
unpaired_cost = costs[-1]['cost'] if len(costs) % 2 else 0
total_cost = real_paired_cost + unpaired_cost
else:
total_cost = sum(cost_entry['cost'] for cost_entry in costs)
if not abs(total_cost - accumulated) < 1e-6:
logger.warning(
f'Instance {row["instance_id"]}: Cost mismatch: '
f'accumulated: {accumulated:.2f}, sum of costs: {total_cost:.2f}, '
)
return total_cost
except Exception as e:
logger.error(
f'Error verifying costs for instance {row.get("instance_id", "UNKNOWN")}: {e}'
)
return 0.0
def main():
parser = argparse.ArgumentParser(
description='Verify costs in SWE-bench output file'
)
parser.add_argument(
'input_filepath', type=str, help='Path to the output.jsonl file'
)
args = parser.parse_args()
try:
# Load and verify the JSONL file
df = pd.read_json(args.input_filepath, lines=True)
logger.info(f'Loaded {len(df)} instances from {args.input_filepath}')
# Verify costs for each instance and sum up total
total_cost = df.apply(verify_instance_costs, axis=1).sum()
logger.info(f'Total verified cost across all instances: ${total_cost:.2f}')
except Exception as e:
logger.error(f'Failed to process file: {e}')
raise
if __name__ == '__main__':
main()

View File

@ -0,0 +1,146 @@
#!/usr/bin/env bash
set -eo pipefail
source "evaluation/utils/version_control.sh"
MODEL_CONFIG=$1
COMMIT_HASH=$2
AGENT=$3
EVAL_LIMIT=$4
MAX_ITER=$5
NUM_WORKERS=$6
DATASET=$7
SPLIT=$8
N_RUNS=$9
MODE=${10}
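# Usage (mirrors the positional arguments above):
#   bash run_infer_nc.sh <model_config> <commit_hash> <agent> <eval_limit> \
#       <max_iter> <num_workers> <dataset> <split> [n_runs] [mode]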
if [ -z "$NUM_WORKERS" ]; then
NUM_WORKERS=1
echo "Number of workers not specified, use default $NUM_WORKERS"
fi
checkout_eval_branch
if [ -z "$AGENT" ]; then
echo "Agent not specified, use default CodeActAgent"
AGENT="CodeActAgent"
fi
if [ -z "$MAX_ITER" ]; then
echo "MAX_ITER not specified, use default 100"
MAX_ITER=100
fi
if [ -z "$RUN_WITH_BROWSING" ]; then
echo "RUN_WITH_BROWSING not specified, use default false"
RUN_WITH_BROWSING=false
fi
if [ -z "$DATASET" ]; then
echo "DATASET not specified, use default princeton-nlp/SWE-bench_Lite"
DATASET="princeton-nlp/SWE-bench_Lite"
fi
if [ -z "$SPLIT" ]; then
echo "SPLIT not specified, use default test"
SPLIT="test"
fi
if [ -z "$MODE" ]; then
MODE="swe"
echo "MODE not specified, use default $MODE"
fi
if [ -n "$EVAL_CONDENSER" ]; then
echo "Using Condenser Config: $EVAL_CONDENSER"
else
echo "No Condenser Config provided via EVAL_CONDENSER, use default (NoOpCondenser)."
fi
export RUN_WITH_BROWSING=$RUN_WITH_BROWSING
echo "RUN_WITH_BROWSING: $RUN_WITH_BROWSING"
get_openhands_version
echo "AGENT: $AGENT"
echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
echo "MODEL_CONFIG: $MODEL_CONFIG"
echo "DATASET: $DATASET"
echo "SPLIT: $SPLIT"
echo "MAX_ITER: $MAX_ITER"
echo "NUM_WORKERS: $NUM_WORKERS"
echo "COMMIT_HASH: $COMMIT_HASH"
echo "MODE: $MODE"
echo "EVAL_CONDENSER: $EVAL_CONDENSER"
# Default to NOT use Hint
if [ -z "$USE_HINT_TEXT" ]; then
export USE_HINT_TEXT=false
fi
echo "USE_HINT_TEXT: $USE_HINT_TEXT"
EVAL_NOTE="$OPENHANDS_VERSION"
# if not using Hint, add -no-hint to the eval note
if [ "$USE_HINT_TEXT" = false ]; then
EVAL_NOTE="$EVAL_NOTE-no-hint"
fi
if [ "$RUN_WITH_BROWSING" = true ]; then
EVAL_NOTE="$EVAL_NOTE-with-browsing"
fi
if [ -n "$EXP_NAME" ]; then
EVAL_NOTE="$EVAL_NOTE-$EXP_NAME"
fi
# if mode != swe, add mode to the eval note
if [ "$MODE" != "swe" ]; then
EVAL_NOTE="${EVAL_NOTE}-${MODE}"
fi
# Add condenser config to eval note if provided
if [ -n "$EVAL_CONDENSER" ]; then
EVAL_NOTE="${EVAL_NOTE}-${EVAL_CONDENSER}"
fi
function run_eval() {
local eval_note="${1}"
COMMAND="poetry run python evaluation/benchmarks/nocode_bench/run_infer_nc.py \
--agent-cls $AGENT \
--llm-config $MODEL_CONFIG \
--max-iterations $MAX_ITER \
--eval-num-workers $NUM_WORKERS \
--eval-note $eval_note \
--dataset $DATASET \
--split $SPLIT \
--mode $MODE"
if [ -n "$EVAL_LIMIT" ]; then
echo "EVAL_LIMIT: $EVAL_LIMIT"
COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
fi
# Run the command
eval $COMMAND
}
unset SANDBOX_ENV_GITHUB_TOKEN # prevent the agent from using the github token to push
if [ -z "$N_RUNS" ]; then
N_RUNS=1
echo "N_RUNS not specified, use default $N_RUNS"
fi
# Skip runs if the run number is in the SKIP_RUNS list
# read from env variable SKIP_RUNS as a comma separated list of run numbers
SKIP_RUNS=(${SKIP_RUNS//,/ })
for i in $(seq 1 $N_RUNS); do
if [[ " ${SKIP_RUNS[@]} " =~ " $i " ]]; then
echo "Skipping run $i"
continue
fi
current_eval_note="$EVAL_NOTE-run_$i"
echo "EVAL_NOTE: $current_eval_note"
run_eval $current_eval_note
done
checkout_original_branch

View File

@ -0,0 +1,54 @@
"""This script compares gold patches with OpenHands-generated patches and check whether
OpenHands found the right (set of) files to modify.
"""
import argparse
import json
import re
def extract_modified_files(patch):
modified_files = set()
file_pattern = re.compile(r'^diff --git a/(.*?) b/')
for line in patch.split('\n'):
match = file_pattern.match(line)
if match:
modified_files.add(match.group(1))
return modified_files
def process_report(oh_output_file):
succ = 0
fail = 0
for line in open(oh_output_file):
line = json.loads(line)
instance_id = line['instance_id']
gold_patch = line['swe_instance']['patch']
generated_patch = line['git_patch']
gold_modified_files = extract_modified_files(gold_patch)
# swe-bench lite only: a gold patch always contains exactly one file
assert len(gold_modified_files) == 1
generated_modified_files = extract_modified_files(generated_patch)
# Check if all files in gold_patch are also in generated_patch
all_files_in_generated = gold_modified_files.issubset(generated_modified_files)
if all_files_in_generated:
succ += 1
else:
fail += 1
print(
f'{instance_id}: file mismatch, gold = {gold_modified_files}, generated = {generated_modified_files}'
)
print(
f'\nSUMMARY: {succ} out of {succ + fail} instances found correct files to edit, success rate = {succ / float(succ + fail)}'
)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--oh_output_file', help='Path to the OH output file')
args = parser.parse_args()
process_report(args.oh_output_file)

View File

@ -0,0 +1,53 @@
#!/usr/bin/env bash
source ~/.bashrc
SWEUTIL_DIR=/swe_util
if [ -z "$SWE_INSTANCE_ID" ]; then
echo "Error: SWE_INSTANCE_ID is not set." >&2
exit 1
fi
item=$(jq --arg INSTANCE_ID "$SWE_INSTANCE_ID" '.[] | select(.instance_id == $INSTANCE_ID)' $SWEUTIL_DIR/eval_data/instances/swe-bench-instance.json)
if [[ -z "$item" ]]; then
echo "No item found for the provided instance ID."
exit 1
fi
REPO_NAME=$(echo "$item" | jq -r '.repo | split("/")[-1]')
WORKSPACE_NAME="$REPO_NAME"
echo "WORKSPACE_NAME: $WORKSPACE_NAME"
# Clear the workspace
if [ -d /workspace ]; then
rm -rf /workspace/*
else
mkdir /workspace
fi
# Copy repo to workspace
if [ -d /workspace/$WORKSPACE_NAME ]; then
rm -rf /workspace/$WORKSPACE_NAME
fi
mkdir -p /workspace
SRC_DIR="/root/$REPO_NAME"
DEST_DIR="/workspace/$WORKSPACE_NAME"
cp -r "$SRC_DIR" "$DEST_DIR"
echo ">> Extracting conda environment name..."
CONDA_ENV_NAME=$(echo "$item" | jq -r '.conda_env // empty')
# Activate instance-specific environment
if [ -d /opt/miniconda3 ]; then
. /opt/miniconda3/etc/profile.d/conda.sh
conda activate $CONDA_ENV_NAME
fi

View File

@ -0,0 +1,154 @@
import json
import multiprocessing as mp
from typing import Awaitable, Callable, TextIO
import numpy as np
import pandas as pd
from pydantic import SecretStr
from tqdm import tqdm
from evaluation.utils.shared import (
EvalMetadata,
EvalOutput,
_process_instance_wrapper,
_process_instance_wrapper_mp,
)
from openhands.core.logger import openhands_logger as logger
def update_progress_nc(
result: EvalOutput,
pbar: tqdm,
output_fp: TextIO,
):
"""Update the progress bar and write the result to the output file."""
pbar.update(1)
pbar.set_description(f'Instance {result.instance_id}')
pbar.set_postfix_str(f'Test Result: {str(result.test_result)[:300]}...')
logger.info(
f'Finished evaluation for instance {result.instance_id}: '
f'{str(result.test_result)[:300]}...\n'
)
def make_serializable(obj):
if isinstance(obj, pd.Series):
return make_serializable(obj.to_dict())
if isinstance(obj, dict):
return {k: make_serializable(v) for k, v in obj.items()}
elif isinstance(obj, (list, tuple, set)):
converted = [make_serializable(v) for v in obj]
if isinstance(obj, list):
return converted
elif isinstance(obj, tuple):
return tuple(converted)
else: # set
return converted
elif isinstance(obj, np.ndarray):
return obj.tolist()
elif isinstance(obj, np.generic):
return obj.item()
elif isinstance(obj, pd.Timestamp):
return obj.isoformat()
elif SecretStr is not None and isinstance(obj, SecretStr):
return str(obj)
else:
return obj
try:
raw_data = result.model_dump(mode='python', round_trip=False)
safe_data = make_serializable(raw_data)
output_fp.write(json.dumps(safe_data, ensure_ascii=False) + '\n')
output_fp.flush()
except Exception as e:
logger.error(f'Failed to write full result: {e}')
fallback = {
'instance_id': result.instance_id,
'model_patch': result.test_result.get('git_patch', ''),
}
try:
output_fp.write(json.dumps(fallback, ensure_ascii=False) + '\n')
output_fp.flush()
logger.info(
f'Wrote fallback result for instance {result.instance_id}: only instance_id and model_patch.'
)
except Exception as e2:
logger.error(f'Failed to write fallback result: {e2}')
def cleanup():
print('Cleaning up child processes...')
for process in mp.active_children():
print(f'Terminating child process: {process.name}')
process.terminate()
process.join()
def run_evaluation_nocode_bench(
dataset: pd.DataFrame,
metadata: EvalMetadata | None,
output_file: str,
num_workers: int,
process_instance_func: Callable[
[pd.Series, EvalMetadata, bool], Awaitable[EvalOutput]
],
max_retries: int = 5, # number of retries for each instance
timeout_seconds: int | None = None,
):
use_multiprocessing = num_workers > 1
if metadata is not None:
logger.info(
f'Evaluation started with Agent {metadata.agent_class}:\n'
f'model {metadata.llm_config.model}, max iterations {metadata.max_iterations}.\n'
)
else:
logger.warning('Running evaluation without metadata.')
logger.info(f'Evaluation started with {num_workers} workers.')
total_instances = len(dataset)
pbar = tqdm(total=total_instances, desc='Instances processed')
output_fp = open(output_file, 'a')
try:
if use_multiprocessing:
with mp.Pool(num_workers) as pool:
args_iter = (
(
process_instance_func,
instance,
metadata,
True,
max_retries,
timeout_seconds,
)
for _, instance in dataset.iterrows()
)
results = pool.imap_unordered(_process_instance_wrapper_mp, args_iter)
for result in results:
update_progress_nc(result, pbar, output_fp)
else:
for _, instance in dataset.iterrows():
result = _process_instance_wrapper(
process_instance_func=process_instance_func,
instance=instance,
metadata=metadata,
use_mp=False,
max_retries=max_retries,
)
update_progress_nc(result, pbar, output_fp)
except KeyboardInterrupt:
print('\nKeyboardInterrupt received. Cleaning up...\n')
cleanup()
output_fp.close()
logger.info('\nEvaluation finished.\n')