import pandas as pd
import nltk
import jieba
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
# Download the NLTK 'punkt' tokenizer data.
# NOTE(review): punkt is only required for nltk's own word/sentence
# tokenizers; tokenization in this script is done by jieba, so this
# download may be unnecessary — confirm before removing.
nltk.download('punkt')
def preprocess_text(text):
    """Tokenize Chinese text into a list of words with jieba.

    Args:
        text: The text to tokenize. Non-string values (e.g. the float
            NaN that pandas yields for empty Excel cells) are treated
            as empty text.

    Returns:
        list[str]: Word tokens from jieba's exact-mode cut.
    """
    # Guard: jieba.lcut raises on non-string input, and read_excel
    # produces NaN (a float) for blank cells.
    if not isinstance(text, str):
        return []
    return jieba.lcut(text)
def calculate_bleu_scores(reference, candidate):
    """Compute BLEU-1 through BLEU-4 for a candidate against one reference.

    Args:
        reference: Reference text (tokenized internally with jieba).
        candidate: Candidate text to score.

    Returns:
        dict[str, float]: Keys 'BLEU-1'..'BLEU-4' mapped to the
        smoothed sentence-level BLEU score of that order.
    """
    reference_tokens = preprocess_text(reference)
    candidate_tokens = preprocess_text(candidate)
    # method1 smoothing avoids zero scores when higher-order n-grams
    # have no matches (common for short sentences).
    smoothing_function = SmoothingFunction().method1
    bleu_scores = {}
    for n in range(1, 5):
        # Uniform weights over the first n n-gram orders. Using exact
        # 1/n fixes the original BLEU-3 weights (0.33, 0.33, 0.33),
        # which summed to 0.99 instead of 1.
        weights = tuple([1.0 / n] * n + [0.0] * (4 - n))
        bleu_scores[f'BLEU-{n}'] = sentence_bleu(
            [reference_tokens],
            candidate_tokens,
            weights=weights,
            smoothing_function=smoothing_function,
        )
    return bleu_scores
def main(input_path='读取文件路径', output_path='保存路径'):
    """Score every candidate column against the reference column.

    Reads an Excel sheet containing a '参考答案' (reference answer)
    column and the candidate columns 'A', 'B', 'C', 'AB', 'AC', 'BC',
    computes BLEU-1..4 for each row/candidate pair, appends the scores
    as new columns named 'BLEU-<n>_<candidate>', and writes the result
    to ``output_path``.

    Args:
        input_path: Path of the source .xlsx file. The default is a
            placeholder ('读取文件路径' = "input file path") kept for
            backward compatibility — pass a real path.
        output_path: Path of the .xlsx file to write. The default is
            likewise a placeholder ('保存路径' = "save path").
    """
    df = pd.read_excel(input_path)
    references = df['参考答案']
    candidate_columns = ['A', 'B', 'C', 'AB', 'AC', 'BC']
    for candidate in candidate_columns:
        # Per-row score lists for this candidate column, one list per
        # BLEU order.
        scores = {f'BLEU-{n}': [] for n in range(1, 5)}
        for ref, cand in zip(references, df[candidate]):
            bleu_scores = calculate_bleu_scores(ref, cand)
            for key, values in scores.items():
                values.append(bleu_scores[key])
        # Append one score column per BLEU order, suffixed with the
        # candidate column name (e.g. 'BLEU-1_AB').
        for key, values in scores.items():
            df[f'{key}_{candidate}'] = values
    df.to_excel(output_path, index=False)
# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
# NOTE(review): the original paste ended with web-scrape artifacts
# (blog post dates / view counts: 06-10 834, 08-10 449, 10-22) that
# were not Python; preserved here as a comment so the file parses.