From 74ba21bad063bbb092fdc07f8c81e4aaf047762e Mon Sep 17 00:00:00 2001
From: Kevin Musgrave
Date: Mon, 18 Aug 2025 07:18:08 -0700
Subject: [PATCH] feat(evaluation): Added INSTRUCTION_TEMPLATE_NAME to run_infer.py in swe_bench (#10270)

Co-authored-by: Xingyao Wang
Co-authored-by: mamoodi
---
 evaluation/benchmarks/swe_bench/README.md    | 3 +++
 evaluation/benchmarks/swe_bench/run_infer.py | 5 ++++-
 evaluation/utils/shared.py                   | 2 ++
 3 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/evaluation/benchmarks/swe_bench/README.md b/evaluation/benchmarks/swe_bench/README.md
index f51c56462e..cb6c56876a 100644
--- a/evaluation/benchmarks/swe_bench/README.md
+++ b/evaluation/benchmarks/swe_bench/README.md
@@ -93,6 +93,9 @@ export USE_HINT_TEXT=true # Ignore this if you are not sure.
 
 # Specify a condenser configuration for memory management (default: NoOpCondenser)
 export EVAL_CONDENSER=summarizer_for_eval # Name of the condenser config group in config.toml
+
+# Specify the instruction prompt template file name
+export INSTRUCTION_TEMPLATE_NAME=swe_custom.j2 # Name of the file in the swe_bench/prompts folder.
 ```
 
 Let's say you'd like to run 10 instances using `llm.eval_gpt4_1106_preview` and CodeActAgent,
diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py
index 715e84a354..4413f07d3b 100644
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -108,7 +108,9 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata) -> MessageActio
     llm_model = metadata.llm_config.model
 
     # Determine the template file based on mode and LLM
-    if mode.startswith('swt'):
+    if metadata.instruction_template_name:
+        template_name = metadata.instruction_template_name
+    elif mode.startswith('swt'):
         template_name = 'swt.j2'
     elif mode == 'swe':
         if 'gpt-4.1' in llm_model:
@@ -122,6 +124,7 @@ def get_instruction(instance: pd.Series, metadata: EvalMetadata) -> MessageActio
         logger.error(f'Unexpected evaluation mode: {mode}. Falling back to default.')
         template_name = 'swe_default.j2'
 
+    logger.debug(f'Using instruction template file: {template_name}')
     # Set up Jinja2 environment
     # Assuming templates are in 'evaluation/benchmarks/swe_bench/prompts' relative to this script
     prompts_dir = os.path.join(os.path.dirname(__file__), 'prompts')
diff --git a/evaluation/utils/shared.py b/evaluation/utils/shared.py
index e2102d16b1..f88dc6c974 100644
--- a/evaluation/utils/shared.py
+++ b/evaluation/utils/shared.py
@@ -53,6 +53,7 @@ class EvalMetadata(BaseModel):
     data_split: str | None = None
     details: dict[str, Any] | None = None
     condenser_config: CondenserConfig | None = None
+    instruction_template_name: str | None = None
 
 
 class EvalOutput(BaseModel):
@@ -205,6 +206,7 @@ def make_metadata(
         condenser_config=condenser_config
         if condenser_config
         else NoOpCondenserConfig(),
+        instruction_template_name=os.environ.get('INSTRUCTION_TEMPLATE_NAME'),
     )
     metadata_json = metadata.model_dump_json()
     logger.info(f'Metadata: {metadata_json}')
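
For context, the change wires the `INSTRUCTION_TEMPLATE_NAME` environment variable through `make_metadata` into `EvalMetadata`, where `get_instruction` treats it as an override that takes precedence over the mode-based template defaults. The following is a minimal, standalone sketch of that flow under those assumptions; the `pick_template` helper and the reduced fallback list are illustrative only and do not appear in the actual run_infer.py.

```python
# Illustrative sketch: env-var-driven template override, mirroring the patch.
import os

from pydantic import BaseModel


class EvalMetadata(BaseModel):
    # Same field added to EvalMetadata in the patch; other fields omitted here.
    instruction_template_name: str | None = None


def pick_template(metadata: EvalMetadata, mode: str) -> str:
    # Hypothetical helper: the explicit override wins when set,
    # e.g. INSTRUCTION_TEMPLATE_NAME=swe_custom.j2
    if metadata.instruction_template_name:
        return metadata.instruction_template_name
    # Otherwise fall back to mode-based defaults (simplified from the patch).
    if mode.startswith('swt'):
        return 'swt.j2'
    return 'swe_default.j2'


# As in make_metadata: the env var is read once and stored on the metadata.
metadata = EvalMetadata(
    instruction_template_name=os.environ.get('INSTRUCTION_TEMPLATE_NAME')
)
print(pick_template(metadata, mode='swe'))  # 'swe_default.j2' unless the env var is set
```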