From be62ba6b354582524d8fcb5d4967e0604825a73a Mon Sep 17 00:00:00 2001
From: ASTONE <wutong8023@gmail.com>
Date: Sat, 14 Jun 2025 23:17:18 +1000
Subject: [PATCH] add_versicode (#8221)

---
 evaluation/benchmarks/versicode/README.md     | 103 ++++++
 .../inference_utils/api_code_migration.py     | 134 +++++++
 .../api_test_block_completion.py              | 141 +++++++
 .../versicode/inference_utils/test_block.py   | 118 ++++++
 .../inference_utils/test_migration.py         | 111 ++++++
 .../versicode/metric/compute_ism_pm_score.py  | 345 ++++++++++++++++++
 .../metric/compute_migration_cdc_score.py     | 165 +++++++++
 .../metric/compute_versicode_cdc_score.py     | 175 +++++++++
 .../metric/compute_versicode_em_score.py      | 175 +++++++++
 .../choose_core_line_from_block_versicode.py  | 107 ++++++
 ...oose_core_line_from_migration_versicode.py | 108 ++++++
 .../versicode/output_processing/clear_ans.py  |  36 ++
 .../benchmarks/versicode/requirements.txt     | 146 ++++++++
 13 files changed, 1864 insertions(+)
 create mode 100644 evaluation/benchmarks/versicode/README.md
 create mode 100644 evaluation/benchmarks/versicode/inference_utils/api_code_migration.py
 create mode 100644 evaluation/benchmarks/versicode/inference_utils/api_test_block_completion.py
 create mode 100644 evaluation/benchmarks/versicode/inference_utils/test_block.py
 create mode 100644 evaluation/benchmarks/versicode/inference_utils/test_migration.py
 create mode 100644 evaluation/benchmarks/versicode/metric/compute_ism_pm_score.py
 create mode 100644 evaluation/benchmarks/versicode/metric/compute_migration_cdc_score.py
 create mode 100644 evaluation/benchmarks/versicode/metric/compute_versicode_cdc_score.py
 create mode 100644 evaluation/benchmarks/versicode/metric/compute_versicode_em_score.py
 create mode 100644 evaluation/benchmarks/versicode/output_processing/choose_core_line_from_block_versicode.py
 create mode 100644 evaluation/benchmarks/versicode/output_processing/choose_core_line_from_migration_versicode.py
 create mode 100644 evaluation/benchmarks/versicode/output_processing/clear_ans.py
 create mode 100644 evaluation/benchmarks/versicode/requirements.txt

diff --git a/evaluation/benchmarks/versicode/README.md b/evaluation/benchmarks/versicode/README.md
new file mode 100644
index 0000000000..2bd4fba58f
--- /dev/null
+++ b/evaluation/benchmarks/versicode/README.md
@@ -0,0 +1,103 @@
+# VersiCode benchmark
+
+This project is used to evaluate the performance of the model on VersiCode. It includes:
+
+- data: the test data needed and the model outputs
+- inference_utils: inference scripts for our tasks and models
+- metric: scripts for calculating various metrics
+- output_processing: process the model output to facilitate the calculation of model metrics
+
+# Details
+
+1. **Prepare the environment**
+
+   ```shell
+   #create conda environment
+   conda create -n VersiCode python==3.12
+   
+   #install requirements
+   pip install -r requirements.txt
+   ```
+
+2. **Experiment Data**
+
+    To obtain the experimental data, please visit the Hugging Face link: https://huggingface.co/datasets/AstoneNg/VersiCode.
+    Locate the files `VersiCode_block_completion.json` and `VersiCode_migration.json` under the `experiment_data` directory, and place them in the `data/test_data` directory of this project.
+
+
+3. **Model inference**
+
+   ```shell
+   #cd inference_utils directory
+   cd inference_utils
+   
+   #The script file starting with 'test' is used to test the local model
+   #Script files whose names start with 'api' are used to test models called through an API
+   
+   #block level code completion
+   #Modify the 10th and 12th lines of code to specify the model name and the OpenAI API key
+   python api_test_block_completion.py
+   #Modify the 30th line of code to specify the local model path
+   python test_block.py
+   
+   # code migration (migration order is 'old_to_new')
+   #Modify the 10th and 12th lines of code to specify the model name and the OpenAI API key
+   python api_code_migration.py
+   #Modify the 30th line of code to specify the local model path
+   python test_migration.py
+   ```
+
+4. **Process output**
+   Process the model output: remove redundant content and extract the specified content so that the metrics are easy to compute.
+
+   ```shell
+   #cd output_processing
+   cd output_processing
+   
+   #Extract the content between <start> and <end>
+   #Modify the 8th and 9th lines of code to specify the model and task granularity
+   python clear_ans.py
+   
+   #In the block completion and migration tasks, the cdc@k metric is computed on the core (key) lines, so they must be extracted first.
+   #Modify lines 76 and 79 to specify the data path
+   python choose_core_line_from_block_versicode.py
+   python choose_core_line_from_migration_versicode.py
+   ```
+
+5. **Metric**
+   We have three metrics: pass@k, em@k and cdc@k. Because we cannot automatically build a dynamic evaluation environment, we do not provide pass@k.
+
+   ```shell
+   #cd metric
+   cd metric
+   
+   #Modify lines 137-140 of compute_migration_cdc_score.py (migration task), or lines 143-145 of compute_versicode_cdc_score.py and compute_versicode_em_score.py (block and line completion tasks), to specify the data path and the value of k for the metric
+   python compute_migration_cdc_score.py
+   python compute_versicode_cdc_score.py
+   python compute_versicode_em_score.py
+   
+   #Notes
+   #We found limitations in the ISM@k and PM@k metrics for evaluating code generation, so they are used only as reference in our experiments.
+   #Modify lines 261-265 of compute_ism_pm_score.py to specify the task (block or line completion), the data path and the value of k for the metric
+   python compute_ism_pm_score.py
+   ```
+
+# Citation
+
+```
+@article{versicode,
+  author={Tongtong Wu and Weigang Wu and Xingyu Wang and Kang Xu and Suyu Ma and Bo Jiang and Ping Yang and Zhenchang Xing and Yuan-Fang Li and Gholamreza Haffari},
+  title        = {VersiCode: Towards Version-controllable Code Generation},
+  journal      = {CoRR},
+  volume       = {abs/2406.07411},
+  year         = {2024},
+  url          = {https://arxiv.org/abs/2406.07411},
+}
+```
+
+**Github url**: https://github.com/wutong8023/VersiCode
+
+# Contributor
+
+[Tongtong Wu](https://scholar.google.com/citations?hl=zh-CN&user=u1Qp8lUAAAAJ&view_op=list_works&sortby=pubdate), [Weigang Wu](https://scholar.google.com/citations?hl=zh-CN&user=UneIZo8AAAAJ), [Xingyu Wang](https://scholar.google.com/citations?hl=zh-CN&user=wqPJcxcAAAAJ), [Kang Xu](https://scholar.google.com/citations?hl=zh-CN&user=N1UUDi0AAAAJ), [Suyu Ma](https://scholar.google.com/citations?hl=zh-CN&user=NJHR1ukAAAAJ), [Bo Jiang](https://wutong8023.site/VersiCode/), [Ping Yang](https://scholar.google.com/citations?view_op=list_works&hl=en&hl=en&user=hrogvxoAAAAJ), [Zhenchang Xing](https://scholar.google.com/citations?hl=zh-CN&user=0vCxuH4AAAAJ), [Yuan-Fang Li](https://scholar.google.com/citations?hl=zh-CN&user=wufXO1kAAAAJ), [Gholamreza Haffari](https://scholar.google.com/citations?hl=zh-CN&user=Perjx5EAAAAJ)
+
diff --git a/evaluation/benchmarks/versicode/inference_utils/api_code_migration.py b/evaluation/benchmarks/versicode/inference_utils/api_code_migration.py
new file mode 100644
index 0000000000..c99c9d0385
--- /dev/null
+++ b/evaluation/benchmarks/versicode/inference_utils/api_code_migration.py
@@ -0,0 +1,134 @@
+"""
+GPT performs code migration prediction and truncates overly long prompts
+"""
+import json
+import openai
+from openai import OpenAI
+import os
+import tiktoken
+max_tokens = 127000   # prompt token budget: gpt-3.5 has a 16k context, gpt-4o has 128k
+model_name = ""   # model name sent to the chat completions API
+# Put the API key below; a key already exported in the environment takes precedence and is not overwritten.
+os.environ.setdefault("OPENAI_API_KEY", "")
+client = OpenAI()
+
+def truncate_text(text, max_tokens):
+    """
+    Clip ``text`` to at most ``max_tokens`` tokens of the cl100k_base encoding.
+
+    The token count of the untruncated text is printed before clipping.
+    :param text: text to clip
+    :param max_tokens: maximum number of tokens to keep
+    :return: the text decoded from the kept tokens
+    """
+    codec = tiktoken.get_encoding("cl100k_base")
+    token_ids = codec.encode(text, disallowed_special=())
+    print(len(token_ids))
+    return codec.decode(token_ids[:max_tokens])
+
+def predict(content, model_name):
+    """
+    Sample six completions for ``content`` and return them as one string.
+
+    :param content: prompt sent as a single user message
+    :param model_name: model identifier passed to the chat completions API
+    :return: ``str()`` of the list holding the text of every returned choice
+    """
+    # n=6 asks for six independent samples in a single request.
+    response = client.chat.completions.create(
+        model=model_name,
+        messages=[{"role": "user", "content": content}],
+        frequency_penalty=0.1,
+        max_tokens=128,
+        logit_bias=None,
+        logprobs=None,
+        n=6,
+        presence_penalty=0.0,
+        seed=None,
+        stop=None,
+        stream=False,
+        temperature=0.8,
+        top_p=0.95
+    )
+    # One entry per sampled choice, in the order the API returned them.
+    completions = [choice.message.content for choice in response.choices]
+    final_ans = str(completions)
+    return final_ans
+
+def bulid_prompt(description, old_version, old_code, new_version) -> str:
+    """
+    Build the code-migration prompt sent to the model.
+    :param description: functionality description of the code
+    :param old_version: dependency and old version string
+    :param old_code: code written for the old version
+    :param new_version: dependency and new version string
+    :return: the formatted prompt
+    """
+    prompt = f"""
+    You are now a professional Python programming engineer. I will provide you with a code snippet and a description of its functionality, 
+    including the dependencies and versions used in the code. Then, I will provide the same dependencies but with a specified new version. 
+    Your task is to refactor the code using the methods provided by the specified new version and return the refactored code. 
+    Please note that you only need to return the refactored code and enclose it with <start> and <end>:
+    ###Functionality description of the code
+    {description}
+    ###Dependency and old version
+    {old_version}
+    ###Old version code
+    {old_code}
+    ###Dependency and new version
+    {new_version}
+    ###Refactored new code
+    """
+
+    return prompt
+
+
+json_path = '../data/test_data/VersiCode_migration.json'
+
+# Load the benchmark samples; items that already carry "model_output" are skipped below.
+with open(json_path, 'r', encoding='utf-8')as fr:
+    lodict = json.load(fr)
+data_dict = lodict
+data_list = data_dict
+
+# enumerate() yields the 1-based position directly; list.index() rescanned the list and found the first equal item.
+for idx, data in enumerate(data_list, start=1):
+    if "model_output" in data:
+        print(f"the {idx} has already been predicted, skipping this data!")
+        continue
+    try:
+        print(f"Predicting {idx} ")
+        old_version = data['dependency'] + data['old_version']  # package == x.x.x
+        new_version = data['dependency'] + data['new_version']  # package == x.x.x
+        description = data['description']  # functional description
+        old_code = data['old_code']  # code written for the old version
+
+        instruction = bulid_prompt(description, old_version, old_code, new_version)
+        truncated_text = truncate_text(instruction, max_tokens)
+        prediction = predict(truncated_text, model_name)
+
+        data['model_output'] = prediction
+    except Exception as e:
+        print(f"error：{e}")
+        print("save current data")
+        save_folder_path = os.path.join('../data/result_data/code_migration', model_name)
+        if not os.path.exists(save_folder_path):
+            os.makedirs(save_folder_path)
+        save_json_path = os.path.join(save_folder_path, json_path.split('/')[-1])
+
+        with open(save_json_path, 'w', encoding='utf-8') as fw:
+            json.dump(data_dict, fw, indent=4, ensure_ascii=False)
+        break
+
+
+# Final save; also reached after the error branch above has saved and left the loop.
+save_folder_path = os.path.join('../data/result_data/code_migration', model_name)
+if not os.path.exists(save_folder_path):
+    os.makedirs(save_folder_path)
+save_json_path = os.path.join(save_folder_path, json_path.split('/')[-1])
+
+with open(save_json_path, 'w', encoding='utf-8')as fw:
+    json.dump(data_dict, fw, indent=4, ensure_ascii=False)
+
+
+
diff --git a/evaluation/benchmarks/versicode/inference_utils/api_test_block_completion.py b/evaluation/benchmarks/versicode/inference_utils/api_test_block_completion.py
new file mode 100644
index 0000000000..796672f0b0
--- /dev/null
+++ b/evaluation/benchmarks/versicode/inference_utils/api_test_block_completion.py
@@ -0,0 +1,141 @@
+"""
+GPT performs block level code completion prediction and truncates overly long prompts
+"""
+import json
+import openai
+from openai import OpenAI
+import os
+import tiktoken
+max_tokens = 127000   # prompt token budget: gpt-3.5 has a 16k context, gpt-4o has 128k
+model_name = ""   # model name sent to the chat completions API
+# Put the API key below; a key already exported in the environment takes precedence and is not overwritten.
+os.environ.setdefault("OPENAI_API_KEY", "")
+client = OpenAI()
+
+def truncate_text(text, max_tokens):
+    """
+    Clip ``text`` to at most ``max_tokens`` tokens of the cl100k_base encoding.
+
+    The token count of the untruncated text is printed before clipping.
+    :param text: text to clip
+    :param max_tokens: maximum number of tokens to keep
+    :return: the text decoded from the kept tokens
+    """
+    codec = tiktoken.get_encoding("cl100k_base")
+    token_ids = codec.encode(text, disallowed_special=())
+    print(len(token_ids))
+    return codec.decode(token_ids[:max_tokens])
+
+def predict(content, model_name):
+    """
+    Sample six completions for ``content`` and return them as one string.
+
+    :param content: prompt sent as a single user message
+    :param model_name: model identifier passed to the chat completions API
+    :return: ``str()`` of the list holding the text of every returned choice
+    """
+    # n=6 asks for six independent samples in a single request.
+    response = client.chat.completions.create(
+        model=model_name,
+        messages=[{"role": "user", "content": content}],
+        frequency_penalty=0.1,
+        max_tokens=128,
+        logit_bias=None,
+        logprobs=None,
+        n=6,
+        presence_penalty=0.0,
+        seed=None,
+        stop=None,
+        stream=False,
+        temperature=0.8,
+        top_p=0.95
+    )
+    # One entry per sampled choice, in the order the API returned them.
+    completions = [choice.message.content for choice in response.choices]
+    final_ans = str(completions)
+    return final_ans
+
+def bulid_prompt(version, description) -> str:
+    """
+    Build the block-completion prompt sent to the model.
+
+    The prompt contains one worked example followed by the actual request.
+    :param version: dependency and version string
+    :param description: functional description of the code to generate
+    :return: the formatted prompt
+    """
+    prompt = f'''
+            You are a professional Python engineer, and I will provide functional descriptions and versions of specified dependency packages. 
+            You need to write code in Python to implement this feature based on the functional description and using the dependency package and version I specified. 
+            Please note that you only need to return the code that implements the function, and do not return any other content. 
+            Please use <start> and <end> to enclose the generated code. Here is an example:
+            ###Function Description：
+            The function of this code is to print the results predicted by calling the model using vllm.
+            ###dependeny and version：
+            vllm==0.3.3
+            ###response:
+            <start>
+            for output in outputs:
+                prompt = output.prompt
+                generated_text = output.outputs[0].text
+                print("Prompt,Generated text")
+            <end>
+
+            ###Function Description：
+            {description}
+            ###dependeny and version：
+            {version}
+            ###response:
+
+
+        '''
+    return prompt
+
+
+json_path = '../data/test_data/VersiCode_block_completion.json'
+
+# Load the benchmark samples; items that already carry "model_output" are skipped below.
+with open(json_path, 'r', encoding='utf-8')as fr:
+    lodict = json.load(fr)
+data_dict = lodict
+data_list = data_dict
+
+# enumerate() yields the 1-based position directly; list.index() rescanned the list and found the first equal item.
+for idx, data in enumerate(data_list, start=1):
+    if "model_output" in data:
+        print(f"the {idx} has already been predicted, skipping this data!")
+        continue
+    try:
+        print(f"Predicting {idx} ")
+        version = data['dependency'] + data['version']  # package == x.x.x
+        description = data['description']  # func description
+
+        instruction = bulid_prompt(version, description)
+        truncated_text = truncate_text(instruction, max_tokens)
+        prediction = predict(truncated_text, model_name)
+
+        data['model_output'] = prediction
+    except Exception as e:
+        print(f"error：{e}")
+        print("save current data")
+        save_folder_path = os.path.join('../data/result_data/block_completion', model_name)
+        if not os.path.exists(save_folder_path):
+            os.makedirs(save_folder_path)
+        save_json_path = os.path.join(save_folder_path, json_path.split('/')[-1])
+
+        with open(save_json_path, 'w', encoding='utf-8') as fw:
+            json.dump(data_dict, fw, indent=4, ensure_ascii=False)
+        break
+
+
+# Final save; also reached after the error branch above has saved and left the loop.
+save_folder_path = os.path.join('../data/result_data/block_completion', model_name)
+if not os.path.exists(save_folder_path):
+    os.makedirs(save_folder_path)
+save_json_path = os.path.join(save_folder_path, json_path.split('/')[-1])
+
+with open(save_json_path, 'w', encoding='utf-8')as fw:
+    json.dump(data_dict, fw, indent=4, ensure_ascii=False)
+
+
+
diff --git a/evaluation/benchmarks/versicode/inference_utils/test_block.py b/evaluation/benchmarks/versicode/inference_utils/test_block.py
new file mode 100644
index 0000000000..1d34e55af9
--- /dev/null
+++ b/evaluation/benchmarks/versicode/inference_utils/test_block.py
@@ -0,0 +1,118 @@
+"""
+block completion
+"""
+import copy
+import json
+import os
+from vllm import LLM, SamplingParams
+import tiktoken
+import time
+import gc
+import torch
+from multiprocessing import Process
+
+# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
+
+def truncate_text(text, max_tokens):
+    """
+    Clip ``text`` to at most ``max_tokens`` tokens of the cl100k_base encoding.
+
+    The token count of the untruncated text is printed before clipping.
+    :param text: text to clip
+    :param max_tokens: maximum number of tokens to keep
+    :return: the text decoded from the kept tokens
+    """
+    codec = tiktoken.get_encoding("cl100k_base")
+    token_ids = codec.encode(text, disallowed_special=())
+    print(len(token_ids))
+    return codec.decode(token_ids[:max_tokens])
+# Local model directories to evaluate; each one is passed to run_inference by the loop at the bottom of the file.
+model_list = ['/data2/base models/starcoder2-15b', '/data2/base models/CodeGemma-7B']
+
+def run_inference(model_name, origin_data_list):
+    """
+    Generate six completions per sample with a local vLLM model and save them.
+
+    Results are written to ../data/result_data/block_completion/<model dir name>/.
+    :param model_name: path of the local model directory
+    :param origin_data_list: benchmark samples; deep-copied, never modified
+    :return: None; the results are only written to disk
+    """
+    temp_data_list = copy.deepcopy(origin_data_list)
+    test_list = []
+    for data in temp_data_list:
+        version = data['dependency'] + data['version']  # package == x.x.x
+        description = data['description']  # func description
+        test_list.append(bulid_prompt(version, description))
+
+    sampling_params = SamplingParams(n=6, temperature=0.8, top_p=0.95, max_tokens=64)
+    llm = LLM(model=model_name, tensor_parallel_size=4, gpu_memory_utilization=0.9, swap_space=20)
+
+    outputs = llm.generate(test_list, sampling_params)
+    # generate() returns results in prompt order, so pair them positionally
+    # instead of assuming request_id is a 0-based integer index.
+    for data, output in zip(temp_data_list, outputs):
+        data['model_output'] = str([o.text for o in output.outputs])
+
+    save_folder_path = os.path.join('../data/result_data/block_completion', model_name.split('/')[-1])
+    os.makedirs(save_folder_path, exist_ok=True)
+    save_json_path = os.path.join(save_folder_path, json_path.split('/')[-1])
+
+    with open(save_json_path, 'w', encoding='utf-8') as fw:
+        json.dump(temp_data_list, fw, indent=4, ensure_ascii=False)
+
+    # Collect garbage and empty the CUDA cache once the results are saved.
+    gc.collect()
+    torch.cuda.empty_cache()
+
+
+def bulid_prompt(version, description) -> str:
+    """
+    Build the block-completion prompt sent to the model.
+
+    The prompt contains one worked example followed by the actual request.
+    :param version: dependency and version string
+    :param description: functional description of the code to generate
+    :return: the formatted prompt
+    """
+    prompt = f'''
+            You are a professional Python engineer, and I will provide functional descriptions and versions of specified dependency packages. 
+            You need to write code in Python to implement this feature based on the functional description and using the dependency package and version I specified. 
+            Please note that you only need to return the code that implements the function, and do not return any other content. 
+            Please use <start> and <end> to enclose the generated code. Here is an example:
+            ###Function Description：
+            The function of this code is to print the results predicted by calling the model using vllm.
+            ###dependeny and version：
+            vllm==0.3.3
+            ###response:
+            <start>
+            for output in outputs:
+                prompt = output.prompt
+                generated_text = output.outputs[0].text
+                print("Prompt,Generated text")
+            <end>
+
+            ###Function Description：
+            {description}
+            ###dependeny and version：
+            {version}
+            ###response:
+
+
+        '''
+    return prompt
+
+
+json_path = '../data/test_data/VersiCode_block_completion.json'
+
+# The guard stops child processes from re-running this loop when the module is re-imported (spawn/forkserver).
+if __name__ == '__main__':
+    with open(json_path, 'r', encoding='utf-8') as fr:
+        origin_data_list = json.load(fr)
+    # One process per model, run sequentially, with a 120 s pause between models.
+    for model_name in model_list:
+        process = Process(target=run_inference, args=(model_name, origin_data_list))
+        process.start()
+        process.join()
+        time.sleep(120)
+
diff --git a/evaluation/benchmarks/versicode/inference_utils/test_migration.py b/evaluation/benchmarks/versicode/inference_utils/test_migration.py
new file mode 100644
index 0000000000..b088f69773
--- /dev/null
+++ b/evaluation/benchmarks/versicode/inference_utils/test_migration.py
@@ -0,0 +1,111 @@
+"""
+code migration
+"""
+import copy
+import json
+import os
+from vllm import LLM, SamplingParams
+import tiktoken
+import time
+import gc
+import torch
+from multiprocessing import Process
+
+# os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
+
+def truncate_text(text, max_tokens):
+    """
+    Clip ``text`` to at most ``max_tokens`` tokens of the cl100k_base encoding.
+
+    The token count of the untruncated text is printed before clipping.
+    :param text: text to clip
+    :param max_tokens: maximum number of tokens to keep
+    :return: the text decoded from the kept tokens
+    """
+    codec = tiktoken.get_encoding("cl100k_base")
+    token_ids = codec.encode(text, disallowed_special=())
+    print(len(token_ids))
+    return codec.decode(token_ids[:max_tokens])
+# Local model directories to evaluate; each one is passed to run_inference by the loop at the bottom of the file.
+model_list = ['/data2/base models/starcoder2-15b', '/data2/base models/CodeGemma-7B']
+
+def run_inference(model_name, origin_data_list):
+    """
+    Generate six migrations per sample with a local vLLM model and save them.
+
+    Results are written to ../data/result_data/code_migration/<model dir name>/.
+    :param model_name: path of the local model directory
+    :param origin_data_list: benchmark samples; deep-copied, never modified
+    :return: None; the results are only written to disk
+    """
+    temp_data_list = copy.deepcopy(origin_data_list)
+    test_list = []
+    for data in temp_data_list:
+        old_version = data['dependency'] + data['old_version']  # package == x.x.x
+        new_version = data['dependency'] + data['new_version']  # package == x.x.x
+        description = data['description']  # functional description
+        old_code = data['old_code']  # code written for the old version
+        test_list.append(bulid_prompt(description, old_version, old_code, new_version))
+
+    sampling_params = SamplingParams(n=6, temperature=0.8, top_p=0.95, max_tokens=512)
+    llm = LLM(model=model_name, tensor_parallel_size=4, gpu_memory_utilization=0.6, swap_space=40)
+
+    outputs = llm.generate(test_list, sampling_params)
+    # generate() returns results in prompt order, so pair them positionally
+    # instead of assuming request_id is a 0-based integer index.
+    for data, output in zip(temp_data_list, outputs):
+        data['model_output'] = str([o.text for o in output.outputs])
+
+    save_folder_path = os.path.join('../data/result_data/code_migration', model_name.split('/')[-1])
+    os.makedirs(save_folder_path, exist_ok=True)
+    save_json_path = os.path.join(save_folder_path, json_path.split('/')[-1])
+
+    with open(save_json_path, 'w', encoding='utf-8') as fw:
+        json.dump(temp_data_list, fw, indent=4, ensure_ascii=False)
+
+    # Collect garbage and empty the CUDA cache once the results are saved.
+    gc.collect()
+    torch.cuda.empty_cache()
+
+
+def bulid_prompt(description, old_version, old_code, new_version) -> str:
+    """
+    Build the code-migration prompt sent to the model.
+    :param description: functionality description of the code
+    :param old_version: dependency and old version string
+    :param old_code: code written for the old version
+    :param new_version: dependency and new version string
+    :return: the formatted prompt
+    """
+    prompt = f"""
+    You are now a professional Python programming engineer. I will provide you with a code snippet and a description of its functionality, 
+    including the dependencies and versions used in the code. Then, I will provide the same dependencies but with a specified new version. 
+    Your task is to refactor the code using the methods provided by the specified new version and return the refactored code. 
+    Please note that you only need to return the refactored code and enclose it with <start> and <end>:
+    ###Functionality description of the code
+    {description}
+    ###Dependency and old version
+    {old_version}
+    ###Old version code
+    {old_code}
+    ###Dependency and new version
+    {new_version}
+    ###Refactored new code
+    """
+
+    return prompt
+
+
+json_path = '../data/test_data/VersiCode_migration.json'
+
+# The guard stops child processes from re-running this loop when the module is re-imported (spawn/forkserver).
+if __name__ == '__main__':
+    with open(json_path, 'r', encoding='utf-8') as fr:
+        origin_data_list = json.load(fr)
+    # One process per model, run sequentially, with a 120 s pause between models.
+    for model_name in model_list:
+        process = Process(target=run_inference, args=(model_name, origin_data_list))
+        process.start()
+        process.join()
+        time.sleep(120)
+
diff --git a/evaluation/benchmarks/versicode/metric/compute_ism_pm_score.py b/evaluation/benchmarks/versicode/metric/compute_ism_pm_score.py
new file mode 100644
index 0000000000..b0f79f025b
--- /dev/null
+++ b/evaluation/benchmarks/versicode/metric/compute_ism_pm_score.py
@@ -0,0 +1,345 @@
+"""
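+Evaluate the prediction ability at block level
+1. Check whether the output contains the correct function name
+2. Check whether the output is valid code
+3. Compute ISM and PM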
+"""
+import json
+import tokenize
+import io
+import math
+import ast
+import re
+import os
+
+def is_code_valid(code):
+    """Return True if ``code`` compiles as a Python module, False otherwise."""
+    try:
+        compile(code, '<string>', 'exec')
+        return True
+    except Exception:  # not a bare except: Ctrl-C / SystemExit must still propagate
+        return False
+
+
+def longest_common_prefix_between_lists_with_elements(list1, list2):
+    """
+    Find the pair of strings (one from each list) sharing the longest common prefix.
+    :param list1: first list of strings
+    :param list2: second list of strings
+    :return: (prefix length, (str1, str2)); (0, ()) when no pair shares a prefix
+    """
+    best_length = 0
+    best_pair = ()
+    for left in list1:
+        for right in list2:
+            shared = 0
+            for ch_left, ch_right in zip(left, right):
+                if ch_left != ch_right:
+                    break
+                shared += 1
+            # strict '>' keeps the first pair found when several pairs tie
+            if shared > best_length:
+                best_length = shared
+                best_pair = (left, right)
+
+    return best_length, best_pair
+
+def _collect_identifiers(code:str)->list:
+    """
+    Return the text of every NAME token (identifiers and keywords) in ``code``.
+
+    Source that cannot be encoded or tokenized is returned as its list of raw
+    lines, so callers always receive a real list that can be iterated repeatedly.
+    :param code: source code to analyse
+    :return: list of NAME token strings, or ``code.splitlines()`` on failure
+    """
+    identifiers = []
+    try:
+        # tokenize.tokenize() is a lazy generator: malformed source only
+        # raises while the tokens are being consumed, so the whole loop
+        # has to sit inside the try block.
+        readline = io.BytesIO(code.encode('utf-8')).readline
+        for token in tokenize.tokenize(readline):
+            if token.type == tokenize.NAME:
+                identifiers.append(token.string)
+    except Exception:
+        # Fall back to a list of lines. Handing back the failed token
+        # generator instead would look like an empty sequence to callers,
+        # because a generator that has raised yields nothing more.
+        # Names collected before the failure are discarded on purpose.
+        return code.splitlines()
+    return identifiers
+
+
+def get_token(ans_code:str, output_code:str):
+    """
+    Lexically analyse two code strings and return their identifier lists.
+
+    Both inputs are processed independently; a string that cannot be tokenized
+    is represented by the list of its raw lines instead.
+
+    Example:
+        >>> get_token("a = b", "foo(")
+        (['a', 'b'], ['foo('])
+
+    :param ans_code: reference code
+    :param output_code: model-generated code
+    :return: tuple ``(identifiers_ans, identifiers_output)`` of two lists of str
+    """
+    identifiers_ans = _collect_identifiers(ans_code)
+    identifiers_output = _collect_identifiers(output_code)
+
+    return identifiers_ans, identifiers_output
+
+
+def get_token_per_line(code: str):
+    """
+    Lexically analyse the code line by line and record the identifiers of each line.
+    :param code: code string
+    :return: list with one identifier list per line of ``code``
+    """
+    identifiers_per_line = []
+    # Every line is tokenized in isolation, so lines that are not complete
+    # statements on their own may fail and fall back to a split on spaces.
+    for line in code.split('\n'):
+        tokens = tokenize.tokenize(io.BytesIO(line.encode('utf-8')).readline)
+        identifiers = []
+        try:
+            for token in tokens:
+                if token.type == tokenize.NAME:
+                    identifiers.append(token.string)
+        except Exception:  # not a bare except: Ctrl-C / SystemExit must propagate
+            identifiers = line.split(' ')
+        identifiers_per_line.append(identifiers)
+
+    return identifiers_per_line
+
+
+
+def get_ISM(answer_code:str, model_output_list:list, asnwer_name:str)->list:
+    """
+    Compute the ISM score of every model output.
+
+    An output scores 0 unless it contains ``asnwer_name`` as a whole word and
+    compiles. Otherwise its score is the longest common prefix between any
+    identifier of the answer and any identifier of the output, divided by the
+    length of the longer of the two matched identifiers.
+    :param answer_code: reference code
+    :param model_output_list: generated code strings
+    :param asnwer_name: name that a scoring output must contain
+    :return: scores sorted in descending order
+    """
+    name_pattern = rf'\b{re.escape(asnwer_name)}\b'
+    score_list = []
+    for code in model_output_list:
+        if '```python' in code:
+            # drop the markdown fences before validating the code
+            code = code.replace('```python', '').replace('```', '')
+        if not re.search(name_pattern, code) or is_code_valid(code) == False:
+            score_list.append(0)
+            continue
+
+        identifiers_ans, identifiers_output = get_token(answer_code, code)
+        max_len, elements = longest_common_prefix_between_lists_with_elements(identifiers_ans, identifiers_output)
+        # elements holds the matched identifier pair; the longer one is the denominator
+        if max_len == 0:
+            score_list.append(0)
+        else:
+            score_list.append(max_len / max(len(elements[0]), len(elements[1])))
+
+    return sorted(score_list, reverse=True)
+
+def get_ISM_without_verification(answer_code:str, model_output_list:list, asnwer_name:str)->list:
+    """
+    Compute ISM scores without checking that the outputs compile.
+
+    An output scores 0 unless ``asnwer_name`` occurs in it as a substring.
+    Otherwise its score is the longest common prefix between any identifier of
+    the answer and any identifier of the output, divided by the length of the
+    longer of the two matched identifiers.
+    :param answer_code: reference code
+    :param model_output_list: generated code strings
+    :param asnwer_name: name that a scoring output must contain
+    :return: scores sorted in descending order
+    """
+    score_list = []
+    for code in model_output_list:
+        # plain substring test: no word-boundary matching, no compile check
+        if asnwer_name not in code:
+            score_list.append(0)
+            continue
+
+        identifiers_ans, identifiers_output = get_token(answer_code, code)
+        max_len, elements = longest_common_prefix_between_lists_with_elements(identifiers_ans, identifiers_output)
+        if max_len == 0:
+            score_list.append(0)
+            continue
+        # elements holds the matched identifier pair; normalise by the longer one
+        longer = max(len(elements[0]), len(elements[1]))
+        score_list.append(max_len / longer)
+
+    return sorted(score_list, reverse=True)
+
+def longest_common_prefix_with_lengths(list1, list2):
+    """
+    Find the pair of sub-lists (one from each 2-D list) sharing the longest common prefix.
+    :param list1: first 2-D list
+    :param list2: second 2-D list
+    :return: (prefix length, length of the matched sub-list of list1, length of the matched sub-list of list2)
+    """
+    best_length = 0
+    best_len1 = 0
+    best_len2 = 0
+    for row1 in list1:
+        for row2 in list2:
+            shared = 0
+            for item1, item2 in zip(row1, row2):
+                if item1 != item2:
+                    break
+                shared += 1
+            # strict '>' keeps the first pair found when several pairs tie
+            if shared > best_length:
+                best_length = shared
+                best_len1 = len(row1)
+                best_len2 = len(row2)
+
+    return best_length, best_len1, best_len2
+
+
+def get_PM(answer_code:str, model_output_list:list, asnwer_name:str)->list:
+    """
+    Compute the PM score of every model output.
+
+    An output scores 0 unless it contains ``asnwer_name`` as a whole word and
+    compiles. Otherwise answer and output are tokenized line by line, the pair
+    of lines sharing the longest identifier prefix is located, and the score is
+    that prefix length divided by the identifier count of the longer line.
+    :param answer_code: reference code
+    :param model_output_list: generated code strings
+    :param asnwer_name: name that a scoring output must contain
+    :return: scores sorted in descending order
+    """
+    name_pattern = rf'\b{re.escape(asnwer_name)}\b'
+    score_list = []
+    for code in model_output_list:
+        if '```python' in code:
+            code = code.replace('```python', '').replace('```', '')
+        if not re.search(name_pattern, code) or is_code_valid(code) == False:
+            score_list.append(0)
+            continue
+
+        ans_list = get_token_per_line(answer_code)
+        output_token_list = get_token_per_line(code)
+        max_len, len1, len2 = longest_common_prefix_with_lengths(ans_list, output_token_list)
+        base_element_len = max(len1, len2)
+        # both lengths are 0 when no pair of lines shares a prefix
+        if base_element_len == 0:
+            score_list.append(0)
+        else:
+            score_list.append(max_len / base_element_len)
+
+    return sorted(score_list, reverse=True)
+
+def get_score(score_list:list, k):
+    """
+    Compute score@n,k: the expected best score among k of the n samples.
+
+    :param score_list: the n scores, sorted in descending order
+    :param k: number of samples drawn without replacement, 1 <= k <= n
+    :return: sum over i of C(n-i, k-1) * score_list[i-1], divided by C(n, k)
+    :raises ValueError: if k is outside [1, n] (previously a ZeroDivisionError for k > n)
+    """
+    n = len(score_list)
+    if not 1 <= k <= n:
+        raise ValueError(f"k must satisfy 1 <= k <= {n}, got {k}")
+
+    # the i-th best score is the maximum of C(n-i, k-1) of the C(n, k) possible k-subsets
+    total = sum(math.comb(n - i, k - 1) * score_list[i - 1] for i in range(1, n - k + 2))
+    return total / math.comb(n, k)
+
+
+# ---------------------------------------------------------------------------
+# Script section: print the average ISM@k and PM@k of every model.
+#
+# Each <folder_path>/<model>/<json_name> file is read as a JSON list whose items
+# provide 'model_output_clear', 'code' and 'core_token'.
+# ---------------------------------------------------------------------------
+
+k = 1
+task = 'block'  # block or line
+json_name = f"Versicode_{task}_completion.json"
+# NOTE(review): the output_processing scripts use 'VersiCode_block_completion.json'
+# (capital C); confirm which spelling the result files really have.
+
+folder_path = f'../data/result_data/{task}_completion'
+model_list = os.listdir(folder_path)
+
+for model in model_list:
+    model_json_path = os.path.join(folder_path, model, json_name)
+    with open(model_json_path, 'r', encoding='utf-8') as fr:
+        data_list = json.load(fr)
+
+    data_len = len(data_list)
+    if data_len == 0:
+        # nothing to average; the divisions below would raise ZeroDivisionError
+        print(f"{model}, {task} completion task: empty result file, skipped")
+        continue
+
+    sum_ISM = 0
+    sum_PM = 0
+
+    for data in data_list:
+        # 'model_output_clear' holds the repr of a list of strings.
+        # SECURITY: eval() executes arbitrary expressions, so only run this script
+        # on result files you generated yourself; ast.literal_eval() would be the
+        # safe way to parse the list.
+        # Only the first generated sample is scored.
+        first_outputs = eval(data['model_output_clear'])[:1]
+
+        # remove markdown code fences before scoring
+        model_output_list = [
+            output.replace('```python', '').replace('```', '')
+            for output in first_outputs
+        ]
+
+        # Reference for the completion tasks. For code editing use
+        # data['new_code'] / data['new_name'] instead, and for editing
+        # new -> old use data['old_code'] / data['old_name'].
+        answer_code = data['code']
+        answer_name = data['core_token']
+
+        ISM_score_list = get_ISM(answer_code, model_output_list, answer_name)
+        PM_score_list = get_PM(answer_code, model_output_list, answer_name)
+
+        # reduce each score list to its @k value
+        ISM_score = get_score(ISM_score_list, k)
+        PM_score = get_score(PM_score_list, k)
+
+        sum_ISM += ISM_score
+        sum_PM += PM_score
+
+    print(f"{model}, {task} completion task, ISM@{k} score: {sum_ISM/data_len}")
+    print(f"{model}, {task} completion task, PM@{k} score: {sum_PM/data_len}")
+
+
+
+# def get_token(ans_code:str, output_code:str):
+#     """
+#     Lexically analyze the code, split it into identifiers, and return the two identifier lists
+#     :param ans_code:
+#     :param output_code:
+#     :return:
+#     """
+#     tokens_ans = tokenize.tokenize(io.BytesIO(ans_code.encode('utf-8')).readline)
+#     tokens_output = tokenize.tokenize(io.BytesIO(output_code.encode('utf-8')).readline)
+#     identifiers_ans = []
+#     identifiers_output = []
+#     for token in tokens_ans:
+#         if token.type == tokenize.NAME:
+#             identifiers_ans.append(token.string)
+#
+#     for to in tokens_output:
+#         if to.type == tokenize.NAME:
+#             identifiers_output.append(to.string)
+#
+#     return identifiers_ans, identifiers_output
\ No newline at end of file
diff --git a/evaluation/benchmarks/versicode/metric/compute_migration_cdc_score.py b/evaluation/benchmarks/versicode/metric/compute_migration_cdc_score.py
new file mode 100644
index 0000000000..836470c6a3
--- /dev/null
+++ b/evaluation/benchmarks/versicode/metric/compute_migration_cdc_score.py
@@ -0,0 +1,165 @@
+"""
+Calculate the cdc score for migration
+"""
+import os
+import json
+import math
+import re
+import warnings
+# warnings.filterwarnings("ignore", category=SyntaxWarning)
+
+def is_correct_parameter_count(function_name, correct_code, test_code):
+    """
+    Check whether ``test_code`` calls ``function_name`` with as many arguments as ``correct_code``.
+
+    Only the first ``function_name(...)`` occurrence in each string is inspected; the argument
+    text runs up to the first ``)`` and is split on commas, so nested calls and commas inside
+    literals are not parsed precisely.
+
+    :param function_name: name of the call to look for; matched literally
+    :param correct_code: reference code line
+    :param test_code: generated code line to check
+    :return: True if the argument counts agree. When ``test_code`` has no such call, True
+        only if the reference has no arguments and ``function_name`` occurs in ``test_code``.
+    """
+    # re.escape: names such as 'np.array' must not be interpreted as regex syntax
+    pattern = rf'{re.escape(function_name)}\((.*?)\)'
+
+    def count_args(match):
+        """Return the number of non-blank comma-separated pieces of the argument text."""
+        return len([p for p in match.group(1).split(',') if p.strip()])
+
+    correct_match = re.search(pattern, correct_code)
+    # a reference without such a call counts as zero expected arguments
+    expected_count = count_args(correct_match) if correct_match else 0
+
+    test_match = re.search(pattern, test_code)
+    if test_match:
+        return count_args(test_match) == expected_count
+    # no parenthesised call in the output: accept a bare mention only if nothing was expected
+    return expected_count == 0 and function_name in test_code
+
+def check_keyword_parameters(function_name, correct_code, test_code):
+    """
+    Check that keyword arguments used in ``correct_code`` are also passed by keyword in ``test_code``.
+
+    Only the first ``function_name(...)`` occurrence in each string is inspected; the argument
+    text runs up to the first ``)`` and is split on commas.
+
+    :param function_name: name of the call to look for; matched literally
+    :param correct_code: reference code line
+    :param test_code: generated code line to check
+    :return: False if either string lacks a ``function_name(...)`` call or if a keyword
+        argument of the reference has no keyword counterpart in ``test_code``; True otherwise
+    """
+    # re.escape: names such as 'np.array' must not be interpreted as regex syntax
+    pattern = rf'{re.escape(function_name)}\((.*?)\)'
+    correct_match = re.search(pattern, correct_code)
+    test_match = re.search(pattern, test_code)
+    if not correct_match or not test_match:
+        return False
+
+    def split_args(match):
+        """Return the stripped, non-blank comma-separated pieces of the argument text."""
+        return [p.strip() for p in match.group(1).split(',') if p.strip()]
+
+    test_param_list = split_args(test_match)
+    for correct_param in split_args(correct_match):
+        if '=' not in correct_param:
+            continue  # positional arguments of the reference are not constrained
+        param_name = correct_param.split('=')[0].strip()
+        # NOTE: substring test, so e.g. 'dim' is also satisfied by 'keepdim=...'
+        if not any(param_name in test_param and '=' in test_param for test_param in test_param_list):
+            return False
+
+    return True
+
+def with_correct(answer_code:str, model_output:str)->bool:
+    """
+    Check that the answer and the model output agree on starting with ``with``.
+
+    The test is a plain ``str.startswith('with')`` prefix check on both strings.
+
+    :param answer_code: reference code line
+    :param model_output: generated code line
+    :return: True if both strings start with ``with`` or neither does, else False
+    """
+    answer_is_with = answer_code.startswith('with')
+    output_is_with = model_output.startswith('with')
+    # the structures agree exactly when the two flags are equal
+    return answer_is_with == output_is_with
+
+def compute_block_score_k(answer:str, model_output:list, k:int, model_filled_code, core_line_in_core_block, core_line_in_output_clear):
+    """
+    Compute cdc@k for one sample. An output is correct only if it contains ``answer`` as a
+    whole word, passes ``is_code_valid`` and its core line agrees with the reference core line
+    on argument count, ``with`` usage and keyword arguments (em uses only the first condition).
+    """
+    c, n = 0, len(model_output)
+    for index, code in enumerate(model_output):
+        if not (re.search(rf'\b{re.escape(answer)}\b', code) and is_code_valid(model_filled_code[index])):
+            continue
+        ref, out = core_line_in_core_block, core_line_in_output_clear[index]
+        if is_correct_parameter_count(answer, ref, out) and with_correct(ref, out) and check_keyword_parameters(answer, ref, out):
+            c += 1
+    if c == 0:
+        return 0.0  # none correct; the n - c < k shortcut used to return 1.0 here when n < k
+    return 1.0 if n - c < k else 1 - math.comb(n - c, k) / math.comb(n, k)
+
+
+def is_code_valid(code):
+    """Return True if ``code`` compiles as a Python module with ``compile``, else False."""
+    try:
+        compile(code, '<string>', 'exec')
+        return True
+    except Exception:  # not a bare except: KeyboardInterrupt/SystemExit must propagate
+        return False
+
+def compute_score_k(answer:str, model_output:list, k:int):
+    """
+    Compute score@k where an output is correct if it contains ``answer`` as a whole word
+    and passes ``is_code_valid``. Markdown fences are stripped when '```python' is present.
+
+    :return: probability that k outputs drawn without replacement include a correct one
+    """
+    c, n = 0, len(model_output)
+    for output in model_output:
+        if '```python' in output:
+            output = output.replace('```python', '').replace('```', '')
+        if re.search(rf'\b{re.escape(answer)}\b', output) and is_code_valid(output):
+            c += 1
+    if c == 0:
+        return 0.0  # none correct; the n - c < k shortcut used to return 1.0 here when n < k
+    if n - c < k:
+        return 1.0  # every k-subset contains a correct output
+    return 1 - math.comb(n - c, k) / math.comb(n, k)
+
+import ast  # script-local: parses the stringified output lists below
+k = 1  #cdc@k
+json_name = 'VersiCode_migration.json'
+task = 'migration'
+folder_path = f'../data/result_data/code_migration'
+
+model_list = os.listdir(folder_path)
+for model in model_list:
+    model_json_path = os.path.join(folder_path, model, json_name)
+    with open(model_json_path, 'r', encoding='utf-8') as fr:
+        data_list = json.load(fr)
+
+    score_list = []
+    for data in data_list:
+        answer = data['new_name']  # old -> new
+        model_output = data['model_output_clear']  # old -> new
+        if isinstance(model_output, str):
+            # clear_ans.py stores str(list); unparsed, single characters would be scored
+            model_output = ast.literal_eval(model_output)
+        model_filled_code = model_output  # the generated block itself is compile-checked
+        core_line_in_core_block = data['core_line_in_code']  # old -> new
+        core_line_in_output_clear = data['core_line_in_output_clear']  # old -> new
+        score_list.append(compute_block_score_k(answer, model_output, k, model_filled_code, core_line_in_core_block, core_line_in_output_clear))
+
+    if not score_list:
+        print(f"{model}, {task} task: empty result file, skipped")
+        continue
+    final_score = sum(score_list)/len(score_list)
+    print(f"{model}, {task} task, cdc@{k} score: {final_score}")
diff --git a/evaluation/benchmarks/versicode/metric/compute_versicode_cdc_score.py b/evaluation/benchmarks/versicode/metric/compute_versicode_cdc_score.py
new file mode 100644
index 0000000000..81c3cc4dfa
--- /dev/null
+++ b/evaluation/benchmarks/versicode/metric/compute_versicode_cdc_score.py
@@ -0,0 +1,175 @@
+"""
+Calculate the cdc score for line and block
+"""
+import os
+import json
+import math
+import re
+import warnings
+# warnings.filterwarnings("ignore", category=SyntaxWarning)
+
+def is_code_valid(code):
+    """Return True if ``code`` compiles as a Python module with ``compile``, else False."""
+    try:
+        compile(code, '<string>', 'exec')
+        return True
+    except Exception:  # not a bare except: KeyboardInterrupt/SystemExit must propagate
+        return False
+
+def is_correct_parameter_count(function_name, correct_code, test_code):
+    """
+    Check whether ``test_code`` calls ``function_name`` with as many arguments as ``correct_code``.
+
+    Only the first ``function_name(...)`` occurrence in each string is inspected; the argument
+    text runs up to the first ``)`` and is split on commas, so nested calls and commas inside
+    literals are not parsed precisely.
+
+    :param function_name: name of the call to look for; matched literally
+    :param correct_code: reference code line
+    :param test_code: generated code line to check
+    :return: True if the argument counts agree. When ``test_code`` has no such call, True
+        only if the reference has no arguments and ``function_name`` occurs in ``test_code``.
+    """
+    # re.escape: names such as 'np.array' must not be interpreted as regex syntax
+    pattern = rf'{re.escape(function_name)}\((.*?)\)'
+
+    def count_args(match):
+        """Return the number of non-blank comma-separated pieces of the argument text."""
+        return len([p for p in match.group(1).split(',') if p.strip()])
+
+    correct_match = re.search(pattern, correct_code)
+    # a reference without such a call counts as zero expected arguments
+    expected_count = count_args(correct_match) if correct_match else 0
+
+    test_match = re.search(pattern, test_code)
+    if test_match:
+        return count_args(test_match) == expected_count
+    # no parenthesised call in the output: accept a bare mention only if nothing was expected
+    return expected_count == 0 and function_name in test_code
+
+def check_keyword_parameters(function_name, correct_code, test_code):
+    """
+    Check that keyword arguments used in ``correct_code`` are also passed by keyword in ``test_code``.
+
+    Only the first ``function_name(...)`` occurrence in each string is inspected; the argument
+    text runs up to the first ``)`` and is split on commas.
+
+    :param function_name: name of the call to look for; matched literally
+    :param correct_code: reference code line
+    :param test_code: generated code line to check
+    :return: False if either string lacks a ``function_name(...)`` call or if a keyword
+        argument of the reference has no keyword counterpart in ``test_code``; True otherwise
+    """
+    # re.escape: names such as 'np.array' must not be interpreted as regex syntax
+    pattern = rf'{re.escape(function_name)}\((.*?)\)'
+    correct_match = re.search(pattern, correct_code)
+    test_match = re.search(pattern, test_code)
+    if not correct_match or not test_match:
+        return False
+
+    def split_args(match):
+        """Return the stripped, non-blank comma-separated pieces of the argument text."""
+        return [p.strip() for p in match.group(1).split(',') if p.strip()]
+
+    test_param_list = split_args(test_match)
+    for correct_param in split_args(correct_match):
+        if '=' not in correct_param:
+            continue  # positional arguments of the reference are not constrained
+        param_name = correct_param.split('=')[0].strip()
+        # NOTE: substring test, so e.g. 'dim' is also satisfied by 'keepdim=...'
+        if not any(param_name in test_param and '=' in test_param for test_param in test_param_list):
+            return False
+
+    return True
+
+def with_correct(answer_code:str, model_output:str)->bool:
+    """
+    Check that the answer and the model output agree on starting with ``with``.
+
+    The test is a plain ``str.startswith('with')`` prefix check on both strings.
+
+    :param answer_code: reference code line
+    :param model_output: generated code line
+    :return: True if both strings start with ``with`` or neither does, else False
+    """
+    answer_is_with = answer_code.startswith('with')
+    output_is_with = model_output.startswith('with')
+    # the structures agree exactly when the two flags are equal
+    return answer_is_with == output_is_with
+
+def compute_line_score_k(answer:str, model_output:list, k:int, model_filled_code, core_line):
+    """Compute cdc@k for line completion: an output is correct if it contains ``answer`` as a whole
+    word, its filled code passes ``is_code_valid`` and it matches ``core_line`` on argument count,
+    ``with`` usage and keyword arguments."""
+    c, n = 0, len(model_output)
+    for index, code in enumerate(model_output):
+        if not (re.search(rf'\b{re.escape(answer)}\b', code) and is_code_valid(model_filled_code[index])):
+            continue
+        if is_correct_parameter_count(answer, core_line, code) and with_correct(core_line, code) and check_keyword_parameters(answer, core_line, code):
+            c += 1
+    if c == 0:
+        return 0.0  # none correct; the n - c < k shortcut used to return 1.0 here when n < k
+    return 1.0 if n - c < k else 1 - math.comb(n - c, k) / math.comb(n, k)
+
+def compute_block_score_k(answer:str, model_output:list, k:int, model_filled_code, core_line_in_core_block, core_line_in_output_clear):
+    """Compute cdc@k for block completion: an output is correct if it contains ``answer`` as a whole word, passes
+    ``is_code_valid`` and its core line matches the reference on argument count, ``with`` usage and keyword arguments."""
+    c, n = 0, len(model_output)
+    for index, code in enumerate(model_output):
+        if not (re.search(rf'\b{re.escape(answer)}\b', code) and is_code_valid(model_filled_code[index])):
+            continue
+        ref, out = core_line_in_core_block, core_line_in_output_clear[index]
+        if is_correct_parameter_count(answer, ref, out) and with_correct(ref, out) and check_keyword_parameters(answer, ref, out):
+            c += 1
+    if c == 0:
+        return 0.0  # none correct; the n - c < k shortcut used to return 1.0 here when n < k
+    return 1.0 if n - c < k else 1 - math.comb(n - c, k) / math.comb(n, k)
+
+def compute_score_k(answer:str, model_output:list, k:int):
+    """
+    Compute score@k where an output is correct if it contains ``answer`` as a whole word
+    and passes ``is_code_valid``.
+
+    :return: probability that k outputs drawn without replacement include a correct one
+    """
+    n = len(model_output)
+    c = sum(1 for code in model_output if re.search(rf'\b{re.escape(answer)}\b', code) and is_code_valid(code))
+    if c == 0:
+        return 0.0  # none correct; the n - c < k shortcut used to return 1.0 here when n < k
+    if n - c < k:
+        return 1.0  # every k-subset contains a correct output
+    return 1 - math.comb(n - c, k) / math.comb(n, k)
+
+k = 3   #cdc@k
+task = 'block' # line or block
+json_name = f"Versicode_{task}_completion.json"
+
+folder_path = f'../data/result_data/{task}_completion'
+model_list = os.listdir(folder_path)
+
+for model in model_list:
+    model_json_path = os.path.join(folder_path, model, json_name)
+    with open(model_json_path, 'r', encoding='utf-8') as fr:
+        data_list = json.load(fr)
+
+    if not data_list:
+        # nothing to average; sum()/len() below would raise ZeroDivisionError
+        print(f"{model}, {task} completion task: empty result file, skipped")
+        continue
+
+    score_list = []
+    for data in data_list:
+        answer = data['core_token']
+        core_line = data['core_line']
+        # SECURITY: eval() executes arbitrary expressions; only run on result files you
+        # generated yourself (ast.literal_eval() would be the safe way to parse the list)
+        model_output = eval(data['model_output_clear'])
+        if task == 'line':
+            model_filled_code = [data['masked_code'].replace('<mask>', i) for i in model_output]
+            score_list.append(compute_line_score_k(answer, model_output, k, model_filled_code, core_line))
+        else:
+            core_line_in_output_clear = data['core_line_in_output_clear']
+            score_list.append(compute_block_score_k(answer, model_output, k, model_output, core_line, core_line_in_output_clear))
+
+    final_score = sum(score_list)/len(score_list)
+    print(f"{model}, {task} completion task, cdc@{k} score: {final_score}")
diff --git a/evaluation/benchmarks/versicode/metric/compute_versicode_em_score.py b/evaluation/benchmarks/versicode/metric/compute_versicode_em_score.py
new file mode 100644
index 0000000000..bb3a363fad
--- /dev/null
+++ b/evaluation/benchmarks/versicode/metric/compute_versicode_em_score.py
@@ -0,0 +1,175 @@
+"""
+Calculate the em score for line and block
+"""
+import os
+import json
+import math
+import re
+import warnings
+# warnings.filterwarnings("ignore", category=SyntaxWarning)
+
+def is_code_valid(code):
+    """Return True if ``code`` compiles as a Python module with ``compile``, else False."""
+    try:
+        compile(code, '<string>', 'exec')
+        return True
+    except Exception:  # not a bare except: KeyboardInterrupt/SystemExit must propagate
+        return False
+
+def is_correct_parameter_count(function_name, correct_code, test_code):
+    """
+    Check whether ``test_code`` calls ``function_name`` with as many arguments as ``correct_code``.
+
+    Only the first ``function_name(...)`` occurrence in each string is inspected; the argument
+    text runs up to the first ``)`` and is split on commas, so nested calls and commas inside
+    literals are not parsed precisely.
+
+    :param function_name: name of the call to look for; matched literally
+    :param correct_code: reference code line
+    :param test_code: generated code line to check
+    :return: True if the argument counts agree. When ``test_code`` has no such call, True
+        only if the reference has no arguments and ``function_name`` occurs in ``test_code``.
+    """
+    # re.escape: names such as 'np.array' must not be interpreted as regex syntax
+    pattern = rf'{re.escape(function_name)}\((.*?)\)'
+
+    def count_args(match):
+        """Return the number of non-blank comma-separated pieces of the argument text."""
+        return len([p for p in match.group(1).split(',') if p.strip()])
+
+    correct_match = re.search(pattern, correct_code)
+    # a reference without such a call counts as zero expected arguments
+    expected_count = count_args(correct_match) if correct_match else 0
+
+    test_match = re.search(pattern, test_code)
+    if test_match:
+        return count_args(test_match) == expected_count
+    # no parenthesised call in the output: accept a bare mention only if nothing was expected
+    return expected_count == 0 and function_name in test_code
+
+def check_keyword_parameters(function_name, correct_code, test_code):
+    """
+    Check that keyword arguments used in ``correct_code`` are also passed by keyword in ``test_code``.
+
+    Only the first ``function_name(...)`` occurrence in each string is inspected; the argument
+    text runs up to the first ``)`` and is split on commas.
+
+    :param function_name: name of the call to look for; matched literally
+    :param correct_code: reference code line
+    :param test_code: generated code line to check
+    :return: False if either string lacks a ``function_name(...)`` call or if a keyword
+        argument of the reference has no keyword counterpart in ``test_code``; True otherwise
+    """
+    # re.escape: names such as 'np.array' must not be interpreted as regex syntax
+    pattern = rf'{re.escape(function_name)}\((.*?)\)'
+    correct_match = re.search(pattern, correct_code)
+    test_match = re.search(pattern, test_code)
+    if not correct_match or not test_match:
+        return False
+
+    def split_args(match):
+        """Return the stripped, non-blank comma-separated pieces of the argument text."""
+        return [p.strip() for p in match.group(1).split(',') if p.strip()]
+
+    test_param_list = split_args(test_match)
+    for correct_param in split_args(correct_match):
+        if '=' not in correct_param:
+            continue  # positional arguments of the reference are not constrained
+        param_name = correct_param.split('=')[0].strip()
+        # NOTE: substring test, so e.g. 'dim' is also satisfied by 'keepdim=...'
+        if not any(param_name in test_param and '=' in test_param for test_param in test_param_list):
+            return False
+
+    return True
+
+def with_correct(answer_code:str, model_output:str)->bool:
+    """
+    Check that the answer and the model output agree on starting with ``with``.
+
+    The test is a plain ``str.startswith('with')`` prefix check on both strings.
+
+    :param answer_code: reference code line
+    :param model_output: generated code line
+    :return: True if both strings start with ``with`` or neither does, else False
+    """
+    answer_is_with = answer_code.startswith('with')
+    output_is_with = model_output.startswith('with')
+    # the structures agree exactly when the two flags are equal
+    return answer_is_with == output_is_with
+
+def compute_line_score_k(answer:str, model_output:list, k:int, model_filled_code, core_line):
+    """
+    Compute em@k for line completion: an output is correct if it contains ``answer`` as a
+    whole word. ``model_filled_code`` and ``core_line`` are accepted but not used.
+    """
+    name_re = re.compile(rf'\b{re.escape(answer)}\b')
+    n = len(model_output)
+    c = sum(1 for code in model_output if name_re.search(code))
+    if c == 0:
+        return 0.0  # none correct; the n - c < k shortcut used to return 1.0 here when n < k
+    if n - c < k:
+        return 1.0  # every k-subset contains a correct output
+    return 1 - math.comb(n - c, k) / math.comb(n, k)
+
+def compute_block_score_k(answer:str, model_output:list, k:int, model_filled_code, core_line_in_core_block, core_line_in_output_clear):
+    """
+    Compute em@k for block completion: an output is correct if it contains ``answer`` as a
+    whole word. The three trailing parameters are accepted but not used.
+    """
+    name_re = re.compile(rf'\b{re.escape(answer)}\b')
+    n = len(model_output)
+    c = sum(1 for code in model_output if name_re.search(code))
+    if c == 0:
+        return 0.0  # none correct; the n - c < k shortcut used to return 1.0 here when n < k
+    if n - c < k:
+        return 1.0  # every k-subset contains a correct output
+    return 1 - math.comb(n - c, k) / math.comb(n, k)
+
+def compute_score_k(answer:str, model_output:list, k:int):
+    """
+    Compute score@k where an output is correct if it contains ``answer`` as a whole word
+    and passes ``is_code_valid``.
+
+    :return: probability that k outputs drawn without replacement include a correct one
+    """
+    n = len(model_output)
+    c = sum(1 for code in model_output if re.search(rf'\b{re.escape(answer)}\b', code) and is_code_valid(code))
+    if c == 0:
+        return 0.0  # none correct; the n - c < k shortcut used to return 1.0 here when n < k
+    if n - c < k:
+        return 1.0  # every k-subset contains a correct output
+    return 1 - math.comb(n - c, k) / math.comb(n, k)
+
+k = 3   #em@k
+task = 'block' # line or block
+json_name = f"Versicode_{task}_completion.json"
+
+folder_path = f'../data/result_data/{task}_completion'
+model_list = os.listdir(folder_path)
+
+for model in model_list:
+    model_json_path = os.path.join(folder_path, model, json_name)
+    with open(model_json_path, 'r', encoding='utf-8') as fr:
+        data_list = json.load(fr)
+
+    if not data_list:
+        # nothing to average; sum()/len() below would raise ZeroDivisionError
+        print(f"{model}, {task} completion task: empty result file, skipped")
+        continue
+
+    score_list = []
+    for data in data_list:
+        answer = data['core_token']
+        core_line = data['core_line']
+        # SECURITY: eval() executes arbitrary expressions; only run on result files you
+        # generated yourself (ast.literal_eval() would be the safe way to parse the list)
+        model_output = eval(data['model_output_clear'])
+        if task == 'line':
+            model_filled_code = [data['masked_code'].replace('<mask>', i) for i in model_output]
+            score_list.append(compute_line_score_k(answer, model_output, k, model_filled_code, core_line))
+        else:
+            core_line_in_output_clear = data['core_line_in_output_clear']
+            score_list.append(compute_block_score_k(answer, model_output, k, model_output, core_line, core_line_in_output_clear))
+
+    final_score = sum(score_list)/len(score_list)
+    print(f"{model}, {task} completion task, em@{k} score: {final_score}")
diff --git a/evaluation/benchmarks/versicode/output_processing/choose_core_line_from_block_versicode.py b/evaluation/benchmarks/versicode/output_processing/choose_core_line_from_block_versicode.py
new file mode 100644
index 0000000000..78632625a5
--- /dev/null
+++ b/evaluation/benchmarks/versicode/output_processing/choose_core_line_from_block_versicode.py
@@ -0,0 +1,107 @@
+"""
+Find the core line (the line that uses the core token) in the reference code and in each model output of the block-completion results
+"""
+import os
+import re
+import json
+import random
+
+def process_line_mask(code_snippet, core_token):
+    """
+    Pick one code line that uses ``core_token`` and replace it with ``<line_mask>``.
+
+    Lines are skipped when they start with ``#``, contain a triple-quoted string that opens
+    and closes on the same line, belong to a multi-line triple-quoted string (including its
+    opening and closing lines), or define ``task_function``. A remaining line is a candidate
+    if ``core_token`` occurs in it as a whole word not followed by optional whitespace and
+    ``=``. One candidate is chosen with ``random.choice``.
+
+    :param code_snippet: source code with newline-separated lines
+    :param core_token: token to look for; a falsy value yields ``(None, None)``
+    :return: ``(masked_code, masked_line)``: the snippet with the chosen line replaced by
+        ``<line_mask>`` (indentation kept) and the stripped original line, or
+        ``(None, None)`` when no line qualifies
+    """
+    if not core_token:
+        return None, None
+
+    one_line_string = re.compile(r"'''(.*?)'''|\"\"\"(.*?)\"\"\"")
+    lines = code_snippet.split("\n")
+    candidates = []
+    inside_string = False
+
+    for idx, text in enumerate(lines):
+        has_triple_quote = '"""' in text or "'''" in text
+        closed_on_line = one_line_string.search(text) is not None
+
+        if inside_string:
+            # a triple quote that is not part of a one-line string closes the block
+            if has_triple_quote and not closed_on_line:
+                inside_string = False
+            continue
+        if text.strip().startswith("#") or closed_on_line:
+            continue
+        if has_triple_quote:
+            # an unmatched triple quote opens a multi-line string
+            inside_string = True
+            continue
+        if re.search(r'\bdef\s+task_function\b', text):
+            continue
+        if re.search(r'\b{}\b(?!\s*=)'.format(re.escape(core_token)), text):
+            candidates.append(idx)
+
+    if not candidates:
+        return None, None
+
+    chosen = random.choice(candidates)
+    original = lines[chosen]
+    indent = re.match(r'^\s*', original).group(0)
+    lines[chosen] = indent + "<line_mask>"
+    return '\n'.join(lines), original.strip()
+
+
+def load_json(file_path):
+    """Open ``file_path`` as UTF-8 text and return its parsed JSON content."""
+    with open(file_path, 'r', encoding='utf-8') as source:
+        return json.load(source)
+
+
+def save_json(file_path, data):
+    """Write ``data`` to ``file_path`` as UTF-8 JSON, 4-space indented, non-ASCII kept as is."""
+    with open(file_path, 'w', encoding='utf-8') as target:
+        json.dump(data, target, indent=4, ensure_ascii=False)
+
+
+if __name__ == "__main__":
+    # Annotate each model's block-completion result file, in place, with the core line of the
+    # reference code and of every generated output ("N/A" where no core line is found).
+    model_list = os.listdir('../data/result_data/block_completion')
+    for model in model_list:
+
+        input_json_file = f'../data/result_data/block_completion/{model}/VersiCode_block_completion.json'
+        output_json_file = input_json_file  # the annotated data overwrites the input file
+        data = load_json(input_json_file)
+
+        for item in data:
+            core_token = item['core_token']
+
+            # process_line_mask returns (masked_code, masked_line); only the line is kept.
+            # When several lines qualify it picks one at random.
+            reference_line = process_line_mask(item['code'], core_token)[1]
+            item['core_line_in_code'] = reference_line if reference_line else "N/A"
+
+            # 'model_output_clear' holds the repr of a list of strings.
+            # NOTE(security): eval() executes arbitrary expressions; only run this on result
+            # files you generated yourself.
+            generated_outputs = eval(item['model_output_clear'])
+
+            # one entry per generated output, in the same order
+            item['core_line_in_output_clear'] = [
+                process_line_mask(entry, core_token)[1] or "N/A"
+                for entry in generated_outputs
+            ]
+
+        # write the annotations back
+        save_json(output_json_file, data)
+        print("Done!")
+
diff --git a/evaluation/benchmarks/versicode/output_processing/choose_core_line_from_migration_versicode.py b/evaluation/benchmarks/versicode/output_processing/choose_core_line_from_migration_versicode.py
new file mode 100644
index 0000000000..bc730fed86
--- /dev/null
+++ b/evaluation/benchmarks/versicode/output_processing/choose_core_line_from_migration_versicode.py
@@ -0,0 +1,108 @@
+"""
+Find the core line (the line that uses the core token) in the old code and in each model output of the code-migration results
+"""
+import os
+import re
+import json
+import random
+
+def process_line_mask(code_snippet, core_token):
+    """
+    Pick one code line that uses ``core_token`` and replace it with ``<line_mask>``.
+
+    Lines are skipped when they start with ``#``, contain a triple-quoted string that opens
+    and closes on the same line, belong to a multi-line triple-quoted string (including its
+    opening and closing lines), or define ``task_function``. A remaining line is a candidate
+    if ``core_token`` occurs in it as a whole word not followed by optional whitespace and
+    ``=``. One candidate is chosen with ``random.choice``.
+
+    :param code_snippet: source code with newline-separated lines
+    :param core_token: token to look for; a falsy value yields ``(None, None)``
+    :return: ``(masked_code, masked_line)``: the snippet with the chosen line replaced by
+        ``<line_mask>`` (indentation kept) and the stripped original line, or
+        ``(None, None)`` when no line qualifies
+    """
+    if not core_token:
+        return None, None
+
+    one_line_string = re.compile(r"'''(.*?)'''|\"\"\"(.*?)\"\"\"")
+    lines = code_snippet.split("\n")
+    candidates = []
+    inside_string = False
+
+    for idx, text in enumerate(lines):
+        has_triple_quote = '"""' in text or "'''" in text
+        closed_on_line = one_line_string.search(text) is not None
+
+        if inside_string:
+            # a triple quote that is not part of a one-line string closes the block
+            if has_triple_quote and not closed_on_line:
+                inside_string = False
+            continue
+        if text.strip().startswith("#") or closed_on_line:
+            continue
+        if has_triple_quote:
+            # an unmatched triple quote opens a multi-line string
+            inside_string = True
+            continue
+        if re.search(r'\bdef\s+task_function\b', text):
+            continue
+        if re.search(r'\b{}\b(?!\s*=)'.format(re.escape(core_token)), text):
+            candidates.append(idx)
+
+    if not candidates:
+        return None, None
+
+    chosen = random.choice(candidates)
+    original = lines[chosen]
+    indent = re.match(r'^\s*', original).group(0)
+    lines[chosen] = indent + "<line_mask>"
+    return '\n'.join(lines), original.strip()
+
+
+def load_json(file_path):
+    """Open ``file_path`` as UTF-8 text and return its parsed JSON content."""
+    with open(file_path, 'r', encoding='utf-8') as source:
+        return json.load(source)
+
+
+def save_json(file_path, data):
+    """Write ``data`` to ``file_path`` as UTF-8 JSON, 4-space indented, non-ASCII kept as is."""
+    with open(file_path, 'w', encoding='utf-8') as target:
+        json.dump(data, target, indent=4, ensure_ascii=False)
+
+
+if __name__ == "__main__":
+    # Annotate each model's migration result file, in place, with the core line of the old
+    # code and of every generated output ("N/A" where no core line is found).
+    model_list = os.listdir('../data/result_data/code_migration')
+    for model in model_list:
+
+        input_json_file = f'../data/result_data/code_migration/{model}/VersiCode_migration.json'
+        output_json_file = input_json_file  # the annotated data overwrites the input file
+        data = load_json(input_json_file)
+
+        for item in data:
+            # the reference line is searched in the old code using the old name
+            # NOTE(review): compute_migration_cdc_score.py checks 'core_line_in_code' against
+            # 'new_name'; confirm that the old code/name is the intended reference here.
+            old_name = item['old_name']
+            reference_line = process_line_mask(item['old_code'], old_name)[1]
+            item['core_line_in_code'] = reference_line if reference_line else "N/A"
+
+            # 'model_output_clear' holds the repr of a list of strings.
+            # NOTE(security): eval() executes arbitrary expressions; only run this on result
+            # files you generated yourself.
+            raw_outputs = item['model_output_clear']
+            new_name = item['new_name']  # generated outputs are searched for the new name
+
+            # one entry per generated output, in the same order
+            item['core_line_in_output_clear'] = [
+                process_line_mask(entry, new_name)[1] or "N/A"
+                for entry in eval(raw_outputs)
+            ]
+
+        # write the annotations back
+        save_json(output_json_file, data)
+        print("Done!")
+
diff --git a/evaluation/benchmarks/versicode/output_processing/clear_ans.py b/evaluation/benchmarks/versicode/output_processing/clear_ans.py
new file mode 100644
index 0000000000..b6d72c1ac0
--- /dev/null
+++ b/evaluation/benchmarks/versicode/output_processing/clear_ans.py
@@ -0,0 +1,36 @@
+"""
+Extract the answer between the <start> and <end> tags generated by the model during inference
+"""
+
+import json
+import os
+
+model_name = ''
+task = 'block_completion'
+
+result_path = f'../data/result_data/{task}/{model_name}/VersiCode_block_completion.json'    #Modify the file according to the task format
+START_TAG, END_TAG = "<start>", "<end>"
+
+with open(result_path, 'r', encoding='utf-8') as fr:
+    data_list = json.load(fr)
+
+for data in data_list:
+    cleaned = []
+    # 'model_output' holds the repr of a list of strings.
+    # SECURITY: eval() executes arbitrary expressions; only run on files you generated yourself.
+    for output in eval(data['model_output']):
+        start_index = output.find(START_TAG)
+        # look for the end tag only after the start tag: an earlier '<end>' used to
+        # produce an empty answer instead of the tagged content / "no_answer"
+        end_index = output.find(END_TAG, start_index + len(START_TAG)) if start_index != -1 else -1
+        if end_index != -1:
+            content = output[start_index + len(START_TAG):end_index].replace('```python', '').replace('```', '')
+        else:
+            content = "no_answer"
+        cleaned.append(content)
+
+    # stored as a string; the downstream scripts parse it back
+    data['model_output_clear'] = str(cleaned)
+
+with open(result_path, 'w', encoding='utf-8') as fw:
+    json.dump(data_list, fw, indent=4, ensure_ascii=False)
\ No newline at end of file
diff --git a/evaluation/benchmarks/versicode/requirements.txt b/evaluation/benchmarks/versicode/requirements.txt
new file mode 100644
index 0000000000..02ba5f6fb7
--- /dev/null
+++ b/evaluation/benchmarks/versicode/requirements.txt
@@ -0,0 +1,146 @@
+aiohappyeyeballs==2.6.1
+aiohttp==3.11.18
+aiosignal==1.3.2
+airportsdata==20250224
+annotated-types==0.7.0
+anyio==4.9.0
+astor==0.8.1
+attrs==25.3.0
+blake3==1.0.4
+cachetools==5.5.2
+certifi==2025.1.31
+charset-normalizer==3.4.1
+click==8.1.8
+cloudpickle==3.1.1
+compressed-tensors==0.9.3
+cupy-cuda12x==13.4.1
+Deprecated==1.2.18
+depyf==0.18.0
+dill==0.4.0
+diskcache==5.6.3
+distro==1.9.0
+dnspython==2.7.0
+einops==0.8.1
+email_validator==2.2.0
+fastapi==0.115.12
+fastapi-cli==0.0.7
+fastrlock==0.8.3
+filelock==3.18.0
+frozenlist==1.6.0
+fsspec==2025.3.2
+gguf==0.16.2
+googleapis-common-protos==1.70.0
+grpcio==1.71.0
+h11==0.14.0
+hf-xet==1.0.3
+httpcore==1.0.8
+httptools==0.6.4
+httpx==0.28.1
+huggingface-hub==0.30.2
+idna==3.10
+importlib_metadata==8.0.0
+interegular==0.3.3
+Jinja2==3.1.6
+jiter==0.9.0
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+lark==1.2.2
+llguidance==0.7.16
+llvmlite==0.44.0
+lm-format-enforcer==0.10.11
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+mdurl==0.1.2
+mistral_common==1.5.4
+mpmath==1.3.0
+msgpack==1.1.0
+msgspec==0.19.0
+multidict==6.4.3
+nest-asyncio==1.6.0
+networkx==3.4.2
+ninja==1.11.1.4
+numba==0.61.2
+numpy==2.2.5
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-cusparselt-cu12==0.6.2
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+openai==1.75.0
+opencv-python-headless==4.11.0.86
+opentelemetry-api==1.26.0
+opentelemetry-exporter-otlp==1.26.0
+opentelemetry-exporter-otlp-proto-common==1.26.0
+opentelemetry-exporter-otlp-proto-grpc==1.26.0
+opentelemetry-exporter-otlp-proto-http==1.26.0
+opentelemetry-proto==1.26.0
+opentelemetry-sdk==1.26.0
+opentelemetry-semantic-conventions==0.47b0
+opentelemetry-semantic-conventions-ai==0.4.3
+outlines==0.1.11
+outlines_core==0.1.26
+packaging==25.0
+partial-json-parser==0.2.1.1.post5
+pillow==11.2.1
+prometheus-fastapi-instrumentator==7.1.0
+prometheus_client==0.21.1
+propcache==0.3.1
+protobuf==4.25.6
+psutil==7.0.0
+py-cpuinfo==9.0.0
+pycountry==24.6.1
+pydantic==2.11.3
+pydantic_core==2.33.1
+Pygments==2.19.1
+python-dotenv==1.1.0
+python-json-logger==3.3.0
+python-multipart==0.0.20
+PyYAML==6.0.2
+pyzmq==26.4.0
+ray==2.43.0
+referencing==0.36.2
+regex==2024.11.6
+requests==2.32.3
+rich==14.0.0
+rich-toolkit==0.14.1
+rpds-py==0.24.0
+safetensors==0.5.3
+scipy==1.15.2
+sentencepiece==0.2.0
+setuptools==75.8.0
+shellingham==1.5.4
+six==1.17.0
+sniffio==1.3.1
+starlette==0.46.2
+sympy==1.13.1
+tiktoken==0.9.0
+tokenizers==0.21.1
+torch==2.6.0
+torchaudio==2.6.0
+torchvision==0.21.0
+tqdm==4.67.1
+transformers==4.51.3
+triton==3.2.0
+typer==0.15.2
+typing-inspection==0.4.0
+typing_extensions==4.13.2
+urllib3==2.4.0
+uvicorn==0.34.2
+uvloop==0.21.0
+vllm==0.8.4
+watchfiles==1.0.5
+websockets==15.0.1
+wheel==0.45.1
+wrapt==1.17.2
+xformers==0.0.29.post2
+xgrammar==0.1.18
+yarl==1.20.0
+zipp==3.21.0