import pandas as pd
import nltk
import jieba
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
# Download the NLTK 'punkt' tokenizer data.
# NOTE(review): punkt is only required for nltk's own word/sentence
# tokenizers; tokenization in this script is done by jieba, so this
# download may be unnecessary — confirm before removing.
nltk.download('punkt')
def preprocess_text(text):
    """Tokenize Chinese text into a list of words with jieba.

    Args:
        text: The text to tokenize. Non-string values (e.g. the float
            NaN that pandas yields for empty Excel cells) are treated
            as empty text.

    Returns:
        list[str]: Word tokens from jieba's exact-mode cut.
    """
    # Guard: jieba.lcut raises on non-string input, and read_excel
    # produces NaN (a float) for blank cells.
    if not isinstance(text, str):
        return []
    return jieba.lcut(text)
def calculate_bleu_scores(reference, candidate):
    """Compute BLEU-1 through BLEU-4 for a candidate against one reference.

    Args:
        reference: Reference text (tokenized internally with jieba).
        candidate: Candidate text to score.

    Returns:
        dict[str, float]: Keys 'BLEU-1'..'BLEU-4' mapped to the
        smoothed sentence-level BLEU score of that order.
    """
    reference_tokens = preprocess_text(reference)
    candidate_tokens = preprocess_text(candidate)
    # method1 smoothing avoids zero scores when higher-order n-grams
    # have no matches (common for short sentences).
    smoothing_function = SmoothingFunction().method1
    bleu_scores = {}
    for n in range(1, 5):
        # Uniform weights over the first n n-gram orders. Using exact
        # 1/n fixes the original BLEU-3 weights (0.33, 0.33, 0.33),
        # which summed to 0.99 instead of 1.
        weights = tuple([1.0 / n] * n + [0.0] * (4 - n))
        bleu_scores[f'BLEU-{n}'] = sentence_bleu(
            [reference_tokens],
            candidate_tokens,
            weights=weights,
            smoothing_function=smoothing_function,
        )
    return bleu_scores
def main(input_path='读取文件路径', output_path='保存路径'):
    """Score every candidate column against the reference column.

    Reads an Excel sheet containing a '参考答案' (reference answer)
    column and the candidate columns 'A', 'B', 'C', 'AB', 'AC', 'BC',
    computes BLEU-1..4 for each row/candidate pair, appends the scores
    as new columns named 'BLEU-<n>_<candidate>', and writes the result
    to ``output_path``.

    Args:
        input_path: Path of the source .xlsx file. The default is a
            placeholder ('读取文件路径' = "input file path") kept for
            backward compatibility — pass a real path.
        output_path: Path of the .xlsx file to write. The default is
            likewise a placeholder ('保存路径' = "save path").
    """
    df = pd.read_excel(input_path)
    references = df['参考答案']
    candidate_columns = ['A', 'B', 'C', 'AB', 'AC', 'BC']
    for candidate in candidate_columns:
        # Per-row score lists for this candidate column, one list per
        # BLEU order.
        scores = {f'BLEU-{n}': [] for n in range(1, 5)}
        for ref, cand in zip(references, df[candidate]):
            bleu_scores = calculate_bleu_scores(ref, cand)
            for key, values in scores.items():
                values.append(bleu_scores[key])
        # Append one score column per BLEU order, suffixed with the
        # candidate column name (e.g. 'BLEU-1_AB').
        for key, values in scores.items():
            df[f'{key}_{candidate}'] = values
    df.to_excel(output_path, index=False)
# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
# NOTE(review): the original paste ended with web-scrape artifacts
# (blog post dates / view counts: 06-10 834, 08-10 449, 10-22) that
# were not Python; preserved here as a comment so the file parses.