From be62ba6b354582524d8fcb5d4967e0604825a73a Mon Sep 17 00:00:00 2001 From: ASTONE Date: Sat, 14 Jun 2025 23:17:18 +1000 Subject: [PATCH] add_versicode (#8221) --- evaluation/benchmarks/versicode/README.md | 103 ++++++ .../inference_utils/api_code_migration.py | 134 +++++++ .../api_test_block_completion.py | 141 +++++++ .../versicode/inference_utils/test_block.py | 118 ++++++ .../inference_utils/test_migration.py | 111 ++++++ .../versicode/metric/compute_ism_pm_score.py | 345 ++++++++++++++++++ .../metric/compute_migration_cdc_score.py | 165 +++++++++ .../metric/compute_versicode_cdc_score.py | 175 +++++++++ .../metric/compute_versicode_em_score.py | 175 +++++++++ .../choose_core_line_from_block_versicode.py | 107 ++++++ ...oose_core_line_from_migration_versicode.py | 108 ++++++ .../versicode/output_processing/clear_ans.py | 36 ++ .../benchmarks/versicode/requirements.txt | 146 ++++++++ 13 files changed, 1864 insertions(+) create mode 100644 evaluation/benchmarks/versicode/README.md create mode 100644 evaluation/benchmarks/versicode/inference_utils/api_code_migration.py create mode 100644 evaluation/benchmarks/versicode/inference_utils/api_test_block_completion.py create mode 100644 evaluation/benchmarks/versicode/inference_utils/test_block.py create mode 100644 evaluation/benchmarks/versicode/inference_utils/test_migration.py create mode 100644 evaluation/benchmarks/versicode/metric/compute_ism_pm_score.py create mode 100644 evaluation/benchmarks/versicode/metric/compute_migration_cdc_score.py create mode 100644 evaluation/benchmarks/versicode/metric/compute_versicode_cdc_score.py create mode 100644 evaluation/benchmarks/versicode/metric/compute_versicode_em_score.py create mode 100644 evaluation/benchmarks/versicode/output_processing/choose_core_line_from_block_versicode.py create mode 100644 evaluation/benchmarks/versicode/output_processing/choose_core_line_from_migration_versicode.py create mode 100644 
evaluation/benchmarks/versicode/output_processing/clear_ans.py create mode 100644 evaluation/benchmarks/versicode/requirements.txt diff --git a/evaluation/benchmarks/versicode/README.md b/evaluation/benchmarks/versicode/README.md new file mode 100644 index 0000000000..2bd4fba58f --- /dev/null +++ b/evaluation/benchmarks/versicode/README.md @@ -0,0 +1,103 @@ +# VersiCode benchmark + +This project is used to evaluate the performance of the model on VersiCode. It includes: + +- data: the test data needed and the model outputs +- inference_utils: inference scripts for ours tasks and models +- metric: scripts for calculating various metric +- output_processing: process the model output to facilitate the calculation of model metrics + +# Details + +1. **Prepare the environment** + + ```shell + #create conda environment + conda create -n VersiCode python==3.12 + + #install requirements + pip install -r requirements.txt + ``` + +2. **Experiment Data** + + To obtain the experimental data, please visit the Hugging Face link: https://huggingface.co/datasets/AstoneNg/VersiCode. + Locate the files `VersiCode_block_completion.json` and `VersiCode_migration.json` under the `experiment_data` directory, and place them in the `/data/test_data directory` of this project. + + +3. 
**Model inference** + + ```shell + #cd inference_utils directory + cd inference_utils + + #Script files whose names start with 'test' run a local model + #Script files whose names start with 'api' call a model through an API + + #block level code completion + #Modify the 10th and 12th lines of code to specify the base URL and model name + python api_test_block_completion.py + #Modify the 30th line of code to specify the local model path + python test_block.py + + # code migration (migration order is 'old_to_new') + #Modify the 10th and 12th lines of code to specify the base URL and model name + python api_code_migration.py + #Modify the 30th line of code to specify the local model path + python test_migration.py + ``` + +4. **Process output** + Process the model output: remove redundant content and extract the specified content so that the metrics are easy to compute. + + ```shell + #cd output_processing + cd output_processing + + #Extract the content between the start and end markers that the prompt asks the model to emit + #Modify the 8th and 9th lines of code to specify the model and task granularity + python clear_ans.py + + #In the block completion task and the migration task, the cdc@k metric is computed on the core (key) lines, so extract them first + #Modify lines 76 and 79 to specify the data path + python choose_core_line_from_block_versicode.py + python choose_core_line_from_migration_versicode.py + ``` + +5. **Metric** + We have three metrics: pass@k, em@k and cdc@k. Because we cannot automatically build a dynamic evaluation environment, pass@k is not provided. 
+ + ```shell + #cd metric + cd metric + + #Modify lines 137-140 in migration task (compute_migration_cdc_score.py) or 143-145 in block and line completion task (compute_versicode_cdc_score.py and compute_versicode_em_score.py) of the code to specify the data path and calculate the k-value of the metric + python compute_migration_cdc_score.py + python compute_versicode_cdc_score.py + python compute_versicode_em_score.py + + #Notes + #We found limitations in the ISM@k and PM@k metrics for evaluating code generation, so they are used only as reference in our experiments. + #Modify lines 261-265 in block and line completion task of the code to specify the data path and calculate the k-value of the metric + python compute_ism_pm_score.py + ``` + +# Citation + +``` +@article{versicode, + author={Tongtong Wu and Weigang Wu and Xingyu Wang and Kang Xu and Suyu Ma and Bo Jiang and Ping Yang and Zhenchang Xing and Yuan-Fang Li and Gholamreza Haffari}, + title = {VersiCode: Towards Version-controllable Code Generation}, + journal = {CoRR}, + volume = {abs/2406.07411}, + year = {2024}, + url = {https://arxiv.org/abs/2406.07411}, +} +``` + +**Github url**: https://github.com/wutong8023/VersiCode + +# Contributor + +[Tongtong Wu](https://scholar.google.com/citations?hl=zh-CN&user=u1Qp8lUAAAAJ&view_op=list_works&sortby=pubdate), [Weigang Wu](https://scholar.google.com/citations?hl=zh-CN&user=UneIZo8AAAAJ), [Xingyu Wang](https://scholar.google.com/citations?hl=zh-CN&user=wqPJcxcAAAAJ), [Kang Xu](https://scholar.google.com/citations?hl=zh-CN&user=N1UUDi0AAAAJ), [Suyu Ma](https://scholar.google.com/citations?hl=zh-CN&user=NJHR1ukAAAAJ), [Bo Jiang](https://wutong8023.site/VersiCode/), [Ping Yang](https://scholar.google.com/citations?view_op=list_works&hl=en&hl=en&user=hrogvxoAAAAJ), [Zhenchang Xing](https://scholar.google.com/citations?hl=zh-CN&user=0vCxuH4AAAAJ), [Yuan-Fang Li](https://scholar.google.com/citations?hl=zh-CN&user=wufXO1kAAAAJ), [Gholamreza 
def predict(content, model_name):
    """Ask the chat-completions endpoint for six samples and return them as text.

    Uses the module-level ``client``.

    :param content: prompt text sent as a single user message
    :param model_name: model identifier forwarded to the API
    :return: ``str()`` of the list holding the message content of every choice
    """
    user_message = {
        "role": "user",
        "content": content
    }
    # Sampling configuration, forwarded unchanged to the API call.
    request_options = dict(
        frequency_penalty=0.1,
        max_tokens=128,
        logit_bias=None,
        logprobs=None,
        n=6,
        presence_penalty=0.0,
        seed=None,
        stop=None,
        stream=False,
        temperature=0.8,
        top_p=0.95,
    )
    response = client.chat.completions.create(
        model=model_name,
        messages=[user_message],
        **request_options
    )
    # Callers store the samples as the string form of a Python list.
    return str([choice.message.content for choice in response.choices])
json_path = '../data/test_data/VersiCode_migration.json'

# Load the migration benchmark records (a JSON list of dicts).
with open(json_path, 'r', encoding='utf-8') as fr:
    lodict = json.load(fr)
data_dict = lodict
data_list = data_dict

# enumerate() gives the 1-based position directly. list.index() rescans the
# list for every record and returns the first *equal* record, so duplicate
# records would be reported under the wrong position.
for position, data in enumerate(data_list, start=1):
    if "model_output" in data:
        print(f"the {position} has already been predicted, skipping this data!")
        continue
    try:
        print(f"Predicting {position} ")
        # 'dependency' and the version field are concatenated as stored
        # (original note: "package == x.x.x").
        old_version = data['dependency'] + data['old_version']
        new_version = data['dependency'] + data['new_version']
        description = data['description']  # functional description
        old_code = data['old_code']  # old-version code to be migrated

        instruction = bulid_prompt(description, old_version, old_code, new_version)
        truncated_text = truncate_text(instruction, max_tokens)
        prediction = predict(truncated_text, model_name)

        data['model_output'] = prediction
    except Exception as e:
        # Best effort: stop at the first failure and fall through to the single
        # save below, so the predictions gathered so far are still written.
        print(f"error:{e}")
        print("save current data")
        break


# Written once, both after a complete run and after an aborted one.
save_folder_path = os.path.join('../data/result_data/code_migration', model_name)
os.makedirs(save_folder_path, exist_ok=True)
save_json_path = os.path.join(save_folder_path, os.path.basename(json_path))

with open(save_json_path, 'w', encoding='utf-8') as fw:
    json.dump(data_dict, fw, indent=4, ensure_ascii=False)
specified dependency packages. + You need to write code in Python to implement this feature based on the functional description and using the dependency package and version I specified. + Please note that you only need to return the code that implements the function, and do not return any other content. + Please use and to enclose the generated code. Here is an example: + ###Function Description: + The function of this code is to print the results predicted by calling the model using vllm. + ###dependeny and version: + vllm==0.3.3 + ###response: + + for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print("Prompt,Generated text") + + + ###Function Description: + {description} + ###dependeny and version: + {version} + ###response: + + + ''' + return prompt + + +json_path = '../data/test_data/VersiCode_block_completion.json' + + +with open(json_path, 'r', encoding='utf-8')as fr: + lodict = json.load(fr) +data_dict = lodict +data_list = data_dict + + +for data in data_list: + if "model_output" in data: + print(f"the {data_list.index(data) + 1} has already been predicted, skipping this data!") + continue + try: + print(f"Predicting {data_list.index(data) + 1} ") + version = data['dependency'] + data['version'] # package == x.x.x + description = data['description'] # func description + + instruction = bulid_prompt(version, description) + truncated_text = truncate_text(instruction, max_tokens) + prediction = predict(truncated_text, model_name) + + data['model_output'] = prediction + except Exception as e: + print(f"error:{e}") + print("save current data") + save_folder_path = os.path.join('../data/result_data/block_completion', model_name) + if not os.path.exists(save_folder_path): + os.makedirs(save_folder_path) + save_json_path = os.path.join(save_folder_path, json_path.split('/')[-1]) + + with open(save_json_path, 'w', encoding='utf-8') as fw: + json.dump(data_dict, fw, indent=4, ensure_ascii=False) + break + + + 
def run_inference(model_name, origin_data_list):
    """Generate block-completion samples with a local vLLM model and save them.

    Reads the module-level ``json_path`` to name the result file.

    :param model_name: model path handed to ``vllm.LLM``; its last '/'-separated
        component names the result folder
    :param origin_data_list: benchmark records, each providing 'dependency',
        'version' and 'description'; deep-copied, so the caller's list is untouched
    :return: None. The records, each extended with 'model_output' (the string
        form of the list of sampled texts), are written as JSON under
        ``../data/result_data/block_completion/<model folder>/``.
    """
    temp_data_list = copy.deepcopy(origin_data_list)
    test_list = [
        # 'dependency' + 'version' are concatenated as stored
        # (original note: "package == x.x.x").
        bulid_prompt(data['dependency'] + data['version'], data['description'])
        for data in temp_data_list
    ]

    sampling_params = SamplingParams(n=6, temperature=0.8, top_p=0.95, max_tokens=64)
    llm = LLM(model=model_name, tensor_parallel_size=4, gpu_memory_utilization=0.9, swap_space=20)

    outputs = llm.generate(test_list, sampling_params)
    # generate() returns one result per prompt, in prompt order, so records and
    # results are paired by position instead of parsing output.request_id,
    # which ties the script to vLLM's internal request numbering.
    for data, output in zip(temp_data_list, outputs):
        data['model_output'] = str([sample.text for sample in output.outputs])

    save_folder_path = os.path.join('../data/result_data/block_completion', model_name.split('/')[-1])
    os.makedirs(save_folder_path, exist_ok=True)

    save_json_path = os.path.join(save_folder_path, json_path.split('/')[-1])

    with open(save_json_path, 'w', encoding='utf-8') as fw:
        json.dump(temp_data_list, fw, indent=4, ensure_ascii=False)

    # Free host and GPU memory before the process exits.
    gc.collect()
    torch.cuda.empty_cache()
# Kept at module level: run_inference() reads this name in the child process.
json_path = '../data/test_data/VersiCode_block_completion.json'

# Guard the driver code. With the "spawn" or "forkserver" start methods the
# child process imports this module again; unguarded, it would reload the
# data and try to start further processes.
if __name__ == '__main__':
    with open(json_path, 'r', encoding='utf-8') as fr:
        lodict = json.load(fr)

    origin_data_list = lodict

    # Each model runs in its own process, one after another, with a pause
    # between runs.
    for model_name in model_list:
        process = Process(target=run_inference, args=(model_name, origin_data_list))
        process.start()
        process.join()
        time.sleep(120)
data['old_code'] # mask后的代码 + + instruction = bulid_prompt(description, old_version, old_code, new_version) + test_list.append(instruction) + + sampling_params = SamplingParams(n=6, temperature=0.8, top_p=0.95, max_tokens=512) + llm = LLM(model=model_name, tensor_parallel_size=4, gpu_memory_utilization=0.6, swap_space=40) + + outputs = llm.generate(test_list, sampling_params) + for output in outputs: + requests_id = int(output.request_id) + temp_ans_list = [] + output_list = output.outputs + for o in output_list: + text = o.text + temp_ans_list.append(text) + + temp_data_list[requests_id]['model_output'] = str(temp_ans_list) + + save_folder_path = os.path.join('../data/result_data/code_migration', model_name.split('/')[-1]) + if not os.path.exists(save_folder_path): + os.makedirs(save_folder_path) + + save_json_path = os.path.join(save_folder_path, json_path.split('/')[-1]) + + with open(save_json_path, 'w', encoding='utf-8') as fw: + json.dump(temp_data_list, fw, indent=4, ensure_ascii=False) + + gc.collect() + torch.cuda.empty_cache() + + +def bulid_prompt(description, old_version, old_code, new_version) -> str: + """ + build prompt + :param version: + :param description: + :param masked_code: + :param options: + :return: + """ + prompt = f""" + You are now a professional Python programming engineer. I will provide you with a code snippet and a description of its functionality, + including the dependencies and versions used in the code. Then, I will provide the same dependencies but with a specified new version. + Your task is to refactor the code using the methods provided by the specified new version and return the refactored code. 
def longest_common_prefix_between_lists_with_elements(list1, list2):
    """Find the pair of strings, one per list, that shares the longest common prefix.

    :param list1: strings to compare
    :param list2: strings to compare against
    :return: ``(length, (str1, str2))`` for the first pair, scanning ``list1``
        in its order and ``list2`` within it, that reaches the maximum prefix
        length; ``(0, ())`` when no pair shares a non-empty prefix or a list
        is empty
    """
    max_prefix_length = 0
    max_prefix_elements = ()
    for str1 in list1:
        for str2 in list2:
            # commonprefix compares character by character, which is exactly
            # the hand-written loop this replaces.
            prefix_length = len(os.path.commonprefix([str1, str2]))
            # Strict '>' keeps the earliest pair when several pairs tie.
            if prefix_length > max_prefix_length:
                max_prefix_length = prefix_length
                max_prefix_elements = (str1, str2)
    return max_prefix_length, max_prefix_elements
def get_token_per_line(code: str) -> list:
    """Tokenize *code* one physical line at a time and collect the NAME tokens.

    :param code: source text; split on '\\n'
    :return: one list per line holding the strings of that line's NAME tokens
        (keywords are NAME tokens too). When a line cannot be tokenized on its
        own, its entry is ``line.split(' ')`` instead.
    """
    identifiers_per_line = []

    for line in code.split('\n'):
        try:
            # The tokenize() call itself is inside the try: depending on the
            # Python version, encoding detection runs eagerly and may raise.
            tokens = tokenize.tokenize(io.BytesIO(line.encode('utf-8')).readline)
            identifiers = [token.string for token in tokens if token.type == tokenize.NAME]
        except (tokenize.TokenError, SyntaxError, ValueError):
            # A single line is often not a complete statement (open bracket,
            # unterminated string, ...). Fall back to a plain split and drop
            # any partial token list. Only tokenizer-related errors are
            # caught, so KeyboardInterrupt/SystemExit still propagate.
            identifiers = line.split(' ')
        identifiers_per_line.append(identifiers)

    return identifiers_per_line
def get_ISM_without_verification(answer_code:str, model_output_list:list, asnwer_name:str)->list:
    """Score each model output with ISM, skipping the code-validity check.

    An output that does not contain ``asnwer_name`` as a substring scores 0.
    Otherwise the score is the longest common prefix length between any token
    of the answer and any token of the output, divided by the length of the
    longer token of that pair.

    :param answer_code: reference code
    :param model_output_list: generated code strings
    :param asnwer_name: name that must occur in an output for it to be scored
    :return: the scores sorted from highest to lowest
    """
    scores = []
    for candidate in model_output_list:
        ratio = 0
        if asnwer_name in candidate:
            reference_tokens, candidate_tokens = get_token(answer_code, candidate)
            shared, pair = longest_common_prefix_between_lists_with_elements(reference_tokens, candidate_tokens)
            if shared != 0:
                ratio = shared / max(len(pair[0]), len(pair[1]))
        scores.append(ratio)

    scores.sort(reverse=True)
    return scores
def get_score(score_list:list, k):
    """Compute score@k from n per-sample scores.

    Each score at 1-based rank ``i`` is weighted by ``C(n - i, k - 1)``, the
    number of k-subsets whose lowest-ranked index is ``i``, and the total is
    divided by ``C(n, k)``. This equals the expected best score of k samples
    only when ``score_list`` is sorted from highest to lowest, which is how
    ``get_ISM`` and ``get_PM`` return their lists.

    :param score_list: per-sample scores, expected in descending order
    :param k: number of samples drawn, ``1 <= k <= len(score_list)``
    :return: the weighted average described above
    :raises ZeroDivisionError: if ``k`` exceeds ``len(score_list)`` (``C(n, k)`` is 0)
    :raises ValueError: if ``k`` is smaller than 1 (raised by ``math.comb``)
    """
    n = len(score_list)
    # Named 'weighted_total' so the builtin sum() is not shadowed.
    weighted_total = 0
    # Ranks above n - k + 1 cannot be the best element of any k-subset.
    for rank in range(1, n - k + 2):
        weighted_total += math.comb(n - rank, k - 1) * score_list[rank - 1]

    return weighted_total / math.comb(n, k)
temp_out = temp_out.replace('```', '') + temp_list.append(temp_out) + model_output_list = temp_list + answer_code = data['code'] + answer_name = data['core_token'] + # + # answer_code = data['new_code'] #code editing + # answer_name = data['new_name'] #code editing + + # answer_code = data['old_code'] # code editing new to old + # answer_name = data['old_name'] # code editing new to old + # + ISM_score_list = get_ISM(answer_code, model_output_list, answer_name) + # ISM_score_without_verification_list = get_ISM_without_verification(answer_code, model_output_list, answer_name) #新增 + PM_score_list = get_PM(answer_code, model_output_list, answer_name) + + # if not ISM_score_without_verification_list == ISM_score_list:#新增 + # for s in ISM_score_list:#新增 + # if s != ISM_score_without_verification_list[ISM_score_list.index(s)]:#新增 + # print('元数据如下')#新增 + # print(data)#新增 + # print('答案如下')#新增 + # print(model_output_list[ISM_score_list.index(s)])#新增 + + # flag = int(input('输入1继续,0退出'))#新增 + # if flag == 1: + # continue + + + ISM_score = get_score(ISM_score_list, k) + PM_score = get_score(PM_score_list, k) + + sum_ISM += ISM_score + sum_PM += PM_score + # print(f"ISM分数:{ISM_score}") + # print(f"PM分数:{PM_score}") + + print(f"{model}, {task} completion task, ISM@{k} score: {sum_ISM/data_len}") + print(f"{model}, {task} completion task, PM@{k} score: {sum_PM/data_len}") + + + +# def get_token(ans_code:str, output_code:str): +# """ +# 对代码进行词法分析,分解成标识符,返回两个标识符列表 +# :param ans_code: +# :param output_code: +# :return: +# """ +# tokens_ans = tokenize.tokenize(io.BytesIO(ans_code.encode('utf-8')).readline) +# tokens_output = tokenize.tokenize(io.BytesIO(output_code.encode('utf-8')).readline) +# identifiers_ans = [] +# identifiers_output = [] +# for token in tokens_ans: +# if token.type == tokenize.NAME: +# identifiers_ans.append(token.string) +# +# for to in tokens_output: +# if to.type == tokenize.NAME: +# identifiers_output.append(to.string) +# +# return identifiers_ans, 
def is_correct_parameter_count(function_name, correct_code, test_code):
    """Check that *test_code* calls *function_name* with as many arguments as *correct_code*.

    The first textual occurrence of ``function_name(...)`` is inspected in
    each string. Arguments are counted by splitting the text up to the first
    ')' on commas, so nested calls or commas inside literals are not
    understood.

    :param function_name: literal name of the API, possibly dotted
    :param correct_code: reference line containing the call
    :param test_code: generated line to check
    :return: True when both calls have the same number of non-empty
        arguments. When ``test_code`` contains no call, True only if the
        reference has no arguments and the name still occurs in ``test_code``.
    """
    # The name is literal text, not a pattern: escape it so that '.' in a
    # dotted name does not match an arbitrary character.
    call_pattern = re.compile(rf'{re.escape(function_name)}\((.*?)\)')

    def count_arguments(argument_text):
        # Empty pieces (e.g. from "f()" or a trailing comma) are not arguments.
        return len([piece for piece in argument_text.split(',') if piece.strip()])

    correct_match = call_pattern.search(correct_code)
    # No call in the reference means zero arguments are expected.
    expected_count = count_arguments(correct_match.group(1)) if correct_match else 0

    test_match = call_pattern.search(test_code)
    if test_match:
        return count_arguments(test_match.group(1)) == expected_count

    # No parentheses in the tested code: accept a bare reference to the name
    # only when no arguments were expected.
    return expected_count == 0 and function_name in test_code
def compute_block_score_k(answer:str, model_output:list, k:int, model_filled_code, core_line_in_core_block, core_line_in_output_clear):
    """
    Compute the cdc@k score of one sample with the unbiased pass@k estimator.

    A generated output counts as correct only when all five checks pass: the
    core token occurs as a whole word, the filled code compiles, the argument
    count matches the reference, the ``with`` structure matches, and keyword
    arguments are used like in the reference. (The EM metric only needs the
    first check.)

    :param answer: core token that must appear in a correct output
    :param model_output: list of generated code strings
    :param k: the k of cdc@k
    :param model_filled_code: code to syntax-check, indexed like ``model_output``
    :param core_line_in_core_block: reference line containing the core token
    :param core_line_in_output_clear: extracted core line per output, indexed
        like ``model_output``
    :return: score in [0, 1]
    """
    token_pattern = re.compile(rf'\b{re.escape(answer)}\b')
    n = len(model_output)
    c = 0
    for index, code in enumerate(model_output):
        if (
            token_pattern.search(code)
            and is_code_valid(model_filled_code[index])
            and is_correct_parameter_count(answer, core_line_in_core_block, core_line_in_output_clear[index])
            and with_correct(core_line_in_core_block, core_line_in_output_clear[index])
            and check_keyword_parameters(answer, core_line_in_core_block, core_line_in_output_clear[index])
        ):
            c += 1

    # Without this guard, fewer than k outputs (including none at all) with
    # zero correct ones satisfied ``n - c < k`` and were scored as perfect.
    if c == 0:
        return 0.0
    # Every k-subset must contain at least one correct output.
    if n - c < k:
        return 1.0

    return 1 - math.comb(n - c, k) / math.comb(n, k)
# Driver: print the cdc@k score of every model found under ``folder_path``.
import ast

k = 1 #cdc@k
json_name = 'VersiCode_migration.json'
task = 'migration'
folder_path = f'../data/result_data/code_migration'

model_list = os.listdir(folder_path)
for model in model_list:
    model_json_path = os.path.join(folder_path, model, json_name)
    with open(model_json_path, 'r', encoding='utf-8')as fr:
        data_list = json.load(fr)

    score_list = []
    for data in data_list:
        answer = data['new_name']  # old -> new

        model_output = data['model_output_clear']  # old -> new
        # The other scripts of this benchmark treat ``model_output_clear`` as
        # a stringified list. Iterating such a string directly would score
        # single characters, so parse it first; real lists pass through.
        if isinstance(model_output, str):
            model_output = ast.literal_eval(model_output)

        # For migration the cleaned output is already the complete code.
        model_filled_code = model_output
        # NOTE(review): the reference line is read from 'core_line_in_code'
        # while the answer is 'new_name' — confirm that this field holds the
        # line of the *new* code.
        core_line_in_core_block = data['core_line_in_code']  # old -> new
        core_line_in_output_clear = data['core_line_in_output_clear']  # old -> new

        score_list.append(compute_block_score_k(answer, model_output, k, model_filled_code, core_line_in_core_block, core_line_in_output_clear))

    final_score = sum(score_list)/len(score_list)
    print(f"{model}, {task} task, cdc@{k} score: {final_score}")
def check_keyword_parameters(function_name, correct_code, test_code):
    """
    Check that every argument passed by keyword in the reference call is also
    passed by keyword in the tested call.

    Arguments are taken from the text between ``function_name(`` and the first
    following ``)``, split on commas; nested calls are not understood.

    :param function_name: name of the API whose call is inspected (matched literally)
    :param correct_code: reference line containing the expected call
    :param test_code: model-generated line to check
    :return: True when both lines contain a ``function_name(...)`` call and
        each keyword name used in the reference is used as a keyword in the
        tested call; False otherwise, including when either call is missing.
    """
    # Escape the name so characters such as '.' are matched literally.
    pattern = rf'{re.escape(function_name)}\((.*?)\)'

    correct_match = re.search(pattern, correct_code)
    if correct_match is None:
        return False
    test_match = re.search(pattern, test_code)
    if test_match is None:
        return False

    correct_param_list = [p.strip() for p in correct_match.group(1).split(',') if p.strip()]
    test_param_list = [p.strip() for p in test_match.group(1).split(',') if p.strip()]

    # Compare keyword names exactly; a substring test would let ``x`` be
    # satisfied by an unrelated keyword such as ``max=1``.
    test_keywords = {p.split('=', 1)[0].strip() for p in test_param_list if '=' in p}

    for correct_param in correct_param_list:
        if '=' in correct_param:  # only keyword arguments of the reference matter
            param_name = correct_param.split('=', 1)[0].strip()
            if param_name not in test_keywords:
                return False

    return True
def compute_line_score_k(answer:str, model_output:list, k:int, model_filled_code, core_line):
    """
    Compute the cdc@k score of one line-completion sample with the unbiased
    pass@k estimator.

    A generated line counts as correct when the core token occurs as a whole
    word, the code with the line filled in compiles, and the argument count,
    ``with`` structure and keyword-argument usage match the reference line.

    :param answer: core token that must appear in a correct output
    :param model_output: list of generated lines
    :param k: the k of cdc@k
    :param model_filled_code: code to syntax-check, indexed like ``model_output``
    :param core_line: reference line containing the core token
    :return: score in [0, 1]
    """
    token_pattern = re.compile(rf'\b{re.escape(answer)}\b')
    n = len(model_output)
    c = 0
    for index, code in enumerate(model_output):
        if (
            token_pattern.search(code)
            and is_code_valid(model_filled_code[index]) == True
            and is_correct_parameter_count(answer, core_line, code)
            and with_correct(core_line, code)
            and check_keyword_parameters(answer, core_line, code)
        ):
            c += 1

    # Without this guard, fewer than k outputs (including none at all) with
    # zero correct ones satisfied ``n - c < k`` and were scored as perfect.
    if c == 0:
        return 0.0
    # Every k-subset must contain at least one correct output.
    if n - c < k:
        return 1.0

    return 1 - math.comb(n - c, k) / math.comb(n, k)
def is_code_valid(code):
    """
    Tell whether ``code`` compiles as a Python module.

    :param code: source text to check
    :return: True if ``compile`` accepts the text in ``exec`` mode, False if
        it raises (syntax error, wrong argument type, ...)
    """
    try:
        compile(code, '', 'exec')
    except Exception:
        # Best-effort check on arbitrary model output: any compile failure
        # means "invalid". Unlike a bare ``except:``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        return False
    return True
def check_keyword_parameters(function_name, correct_code, test_code):
    """
    Check that every argument passed by keyword in the reference call is also
    passed by keyword in the tested call.

    Arguments are taken from the text between ``function_name(`` and the first
    following ``)``, split on commas; nested calls are not understood.

    :param function_name: name of the API whose call is inspected (matched literally)
    :param correct_code: reference line containing the expected call
    :param test_code: model-generated line to check
    :return: True when both lines contain a ``function_name(...)`` call and
        each keyword name used in the reference is used as a keyword in the
        tested call; False otherwise, including when either call is missing.
    """
    # Escape the name so characters such as '.' are matched literally.
    pattern = rf'{re.escape(function_name)}\((.*?)\)'

    correct_match = re.search(pattern, correct_code)
    test_match = re.search(pattern, test_code)
    if correct_match is None or test_match is None:
        return False

    def split_arguments(match):
        """Return the stripped, non-empty comma-separated pieces of a call."""
        return [piece.strip() for piece in match.group(1).split(',') if piece.strip()]

    # Keyword names actually used by the tested call. Names are compared
    # exactly; a substring test would accept ``max=1`` for keyword ``x``.
    used_keywords = {
        piece.split('=', 1)[0].strip()
        for piece in split_arguments(test_match)
        if '=' in piece
    }
    required_keywords = [
        piece.split('=', 1)[0].strip()
        for piece in split_arguments(correct_match)
        if '=' in piece  # only keyword arguments of the reference matter
    ]
    return all(name in used_keywords for name in required_keywords)
def compute_block_score_k(answer:str, model_output:list, k:int, model_filled_code, core_line_in_core_block, core_line_in_output_clear):
    """
    Compute the em@k score of one block-completion sample with the unbiased
    pass@k estimator.

    An output counts as correct when the core token occurs in it as a whole
    word. The last three parameters are not used by this metric.

    :param answer: core token that must appear in a correct output
    :param model_output: list of generated code strings
    :param k: the k of em@k
    :param model_filled_code: unused
    :param core_line_in_core_block: unused
    :param core_line_in_output_clear: unused
    :return: score in [0, 1]
    """
    token_pattern = re.compile(rf'\b{re.escape(answer)}\b')
    n = len(model_output)
    c = sum(1 for code in model_output if token_pattern.search(code))

    # Without this guard, fewer than k outputs (including none at all) with
    # zero correct ones satisfied ``n - c < k`` and were scored as perfect.
    if c == 0:
        return 0.0
    # Every k-subset must contain at least one correct output.
    if n - c < k:
        return 1.0

    return 1 - math.comb(n - c, k) / math.comb(n, k)
data in data_list: + answer = data['core_token'] + model_output = eval(data['model_output_clear']) + model_filled_code = eval(data['model_output_clear']) + core_line = data['core_line'] + core_line_in_output_clear = data['core_line_in_output_clear'] + score_list.append(compute_block_score_k(answer, model_output, k, model_filled_code, core_line, core_line_in_output_clear)) + + final_score = sum(score_list)/len(score_list) + print(f"{model}, {task} completion task, em@{k} score: {final_score}") diff --git a/evaluation/benchmarks/versicode/output_processing/choose_core_line_from_block_versicode.py b/evaluation/benchmarks/versicode/output_processing/choose_core_line_from_block_versicode.py new file mode 100644 index 0000000000..78632625a5 --- /dev/null +++ b/evaluation/benchmarks/versicode/output_processing/choose_core_line_from_block_versicode.py @@ -0,0 +1,107 @@ +""" +Find the line of code generated by the model using the block in the version code +""" +import os +import re +import json +import random + +def process_line_mask(code_snippet, core_token): + if not core_token: + + return None, None + + + replaced_lines = {} + lines = code_snippet.split("\n") + + + in_multi_line_comment = False + + + for i, line in enumerate(lines): + if in_multi_line_comment: + + if ('"""' in line or "'''" in line) and not re.findall(r"'''(.*?)'''|\"\"\"(.*?)\"\"\"", line): + in_multi_line_comment = False + continue + elif line.strip().startswith("#"): + + continue + elif re.findall(r"'''(.*?)'''|\"\"\"(.*?)\"\"\"", line): + + continue + elif ('"""' in line or "'''" in line) and not re.findall(r"'''(.*?)'''|\"\"\"(.*?)\"\"\"", line): + + in_multi_line_comment = True + continue + else: + + if re.search(r'\bdef\s+task_function\b', line): + continue + + + if re.search(r'\b{}\b(?!\s*=)'.format(re.escape(core_token)), line): + + replaced_lines.update({i: line}) + + if replaced_lines: + random_line_location = random.choice(list(replaced_lines.keys())) + + masked_line = 
def load_json(file_path):
    """Read the UTF-8 JSON file at ``file_path`` and return the parsed content."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)
def process_line_mask(code_snippet, core_token):
    """
    Pick one line of ``code_snippet`` that uses ``core_token`` and blank it out.

    Comment lines, one-line and multi-line triple-quoted strings, and the
    ``def task_function`` header are never candidates. A line qualifies when
    the token occurs as a whole word that is not directly followed by ``=``.
    When several lines qualify one of them is chosen at random.

    :param code_snippet: source text to scan
    :param core_token: token to look for; a falsy value short-circuits
    :return: ``(masked_code, masked_line)`` where ``masked_code`` is the
        snippet with the chosen line reduced to its leading whitespace and
        ``masked_line`` is that line stripped; ``(None, None)`` when there is
        no token or no qualifying line
    """
    if not core_token:
        return None, None

    one_line_docstring = re.compile(r"'''(.*?)'''|\"\"\"(.*?)\"\"\"")
    token_usage = re.compile(r'\b{}\b(?!\s*=)'.format(re.escape(core_token)))

    lines = code_snippet.split("\n")
    candidates = []
    inside_block_string = False

    for position, text in enumerate(lines):
        has_triple_quote = '"""' in text or "'''" in text
        closed_on_same_line = bool(one_line_docstring.findall(text))

        if inside_block_string:
            # A lone triple quote ends the multi-line string; the line itself
            # is skipped either way.
            if has_triple_quote and not closed_on_same_line:
                inside_block_string = False
            continue
        if text.strip().startswith("#"):
            continue
        if closed_on_same_line:
            continue
        if has_triple_quote:
            # Opening quote without a closing one on the same line.
            inside_block_string = True
            continue
        if re.search(r'\bdef\s+task_function\b', text):
            continue
        if token_usage.search(text):
            candidates.append(position)

    if not candidates:
        return None, None

    chosen = random.choice(candidates)
    original_line = lines[chosen]
    indent = re.match(r'^\s*', original_line).group(0)
    lines[chosen] = indent + ""

    return '\n'.join(lines), original_line.strip()
"""
Extract the answer part of every raw model output produced during inference
and store the cleaned outputs under ``model_output_clear`` in the result file.
"""

import json
import os

# NOTE(review): empty here, so the path below contains an empty directory
# component; presumably this has to be set to a model folder name — confirm.
model_name = ''
task = 'block_completion'

result_path = f'../data/result_data/{task}/{model_name}/VersiCode_block_completion.json' #Modify the file according to the task format


with open(result_path, 'r', encoding='utf-8')as fr:
    lodict = json.load(fr)
data_dict = lodict
data_list = data_dict

for data in data_list:
    temp_list = []
    # NOTE(review): eval() runs on text read from the result file; the value
    # is expected to be a stringified list of outputs — consider a literal
    # parser if the file can contain anything else.
    model_output_list = eval(data['model_output'])
    for output in model_output_list:

        # NOTE(review): both marker literals are empty strings here, so this
        # condition is true for every string and both indices are 0, which
        # makes ``content`` always ''. Presumably start/end delimiter tags
        # were lost from these literals — confirm against the inference prompt.
        if "" in output and "" in output:
            start_index = output.find("") + len("")
            end_index = output.find("")
            # Keep the text between the markers and drop Markdown code fences.
            content = output[start_index:end_index].replace('```python', '').replace('```', '')
        else:
            content = "no_answer"

        temp_list.append(content)

    # Stored as the string form of the list, not as a JSON array.
    data['model_output_clear'] = str(temp_list)

# The cleaned data overwrites the input file in place.
with open(result_path, 'w', encoding='utf-8')as fw:
    json.dump(data_dict, fw, indent=4, ensure_ascii=False)
0000000000..02ba5f6fb7 --- /dev/null +++ b/evaluation/benchmarks/versicode/requirements.txt @@ -0,0 +1,146 @@ +aiohappyeyeballs==2.6.1 +aiohttp==3.11.18 +aiosignal==1.3.2 +airportsdata==20250224 +annotated-types==0.7.0 +anyio==4.9.0 +astor==0.8.1 +attrs==25.3.0 +blake3==1.0.4 +cachetools==5.5.2 +certifi==2025.1.31 +charset-normalizer==3.4.1 +click==8.1.8 +cloudpickle==3.1.1 +compressed-tensors==0.9.3 +cupy-cuda12x==13.4.1 +Deprecated==1.2.18 +depyf==0.18.0 +dill==0.4.0 +diskcache==5.6.3 +distro==1.9.0 +dnspython==2.7.0 +einops==0.8.1 +email_validator==2.2.0 +fastapi==0.115.12 +fastapi-cli==0.0.7 +fastrlock==0.8.3 +filelock==3.18.0 +frozenlist==1.6.0 +fsspec==2025.3.2 +gguf==0.16.2 +googleapis-common-protos==1.70.0 +grpcio==1.71.0 +h11==0.14.0 +hf-xet==1.0.3 +httpcore==1.0.8 +httptools==0.6.4 +httpx==0.28.1 +huggingface-hub==0.30.2 +idna==3.10 +importlib_metadata==8.0.0 +interegular==0.3.3 +Jinja2==3.1.6 +jiter==0.9.0 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +lark==1.2.2 +llguidance==0.7.16 +llvmlite==0.44.0 +lm-format-enforcer==0.10.11 +markdown-it-py==3.0.0 +MarkupSafe==3.0.2 +mdurl==0.1.2 +mistral_common==1.5.4 +mpmath==1.3.0 +msgpack==1.1.0 +msgspec==0.19.0 +multidict==6.4.3 +nest-asyncio==1.6.0 +networkx==3.4.2 +ninja==1.11.1.4 +numba==0.61.2 +numpy==2.2.5 +nvidia-cublas-cu12==12.4.5.8 +nvidia-cuda-cupti-cu12==12.4.127 +nvidia-cuda-nvrtc-cu12==12.4.127 +nvidia-cuda-runtime-cu12==12.4.127 +nvidia-cudnn-cu12==9.1.0.70 +nvidia-cufft-cu12==11.2.1.3 +nvidia-curand-cu12==10.3.5.147 +nvidia-cusolver-cu12==11.6.1.9 +nvidia-cusparse-cu12==12.3.1.170 +nvidia-cusparselt-cu12==0.6.2 +nvidia-nccl-cu12==2.21.5 +nvidia-nvjitlink-cu12==12.4.127 +nvidia-nvtx-cu12==12.4.127 +openai==1.75.0 +opencv-python-headless==4.11.0.86 +opentelemetry-api==1.26.0 +opentelemetry-exporter-otlp==1.26.0 +opentelemetry-exporter-otlp-proto-common==1.26.0 +opentelemetry-exporter-otlp-proto-grpc==1.26.0 +opentelemetry-exporter-otlp-proto-http==1.26.0 +opentelemetry-proto==1.26.0 
+opentelemetry-sdk==1.26.0 +opentelemetry-semantic-conventions==0.47b0 +opentelemetry-semantic-conventions-ai==0.4.3 +outlines==0.1.11 +outlines_core==0.1.26 +packaging==25.0 +partial-json-parser==0.2.1.1.post5 +pillow==11.2.1 +prometheus-fastapi-instrumentator==7.1.0 +prometheus_client==0.21.1 +propcache==0.3.1 +protobuf==4.25.6 +psutil==7.0.0 +py-cpuinfo==9.0.0 +pycountry==24.6.1 +pydantic==2.11.3 +pydantic_core==2.33.1 +Pygments==2.19.1 +python-dotenv==1.1.0 +python-json-logger==3.3.0 +python-multipart==0.0.20 +PyYAML==6.0.2 +pyzmq==26.4.0 +ray==2.43.0 +referencing==0.36.2 +regex==2024.11.6 +requests==2.32.3 +rich==14.0.0 +rich-toolkit==0.14.1 +rpds-py==0.24.0 +safetensors==0.5.3 +scipy==1.15.2 +sentencepiece==0.2.0 +setuptools==75.8.0 +shellingham==1.5.4 +six==1.17.0 +sniffio==1.3.1 +starlette==0.46.2 +sympy==1.13.1 +tiktoken==0.9.0 +tokenizers==0.21.1 +torch==2.6.0 +torchaudio==2.6.0 +torchvision==0.21.0 +tqdm==4.67.1 +transformers==4.51.3 +triton==3.2.0 +typer==0.15.2 +typing-inspection==0.4.0 +typing_extensions==4.13.2 +urllib3==2.4.0 +uvicorn==0.34.2 +uvloop==0.21.0 +vllm==0.8.4 +watchfiles==1.0.5 +websockets==15.0.1 +wheel==0.45.1 +wrapt==1.17.2 +xformers==0.0.29.post2 +xgrammar==0.1.18 +yarl==1.20.0 +zipp==3.21.0