【python】读取xlsx文件,并测试BLEU-1,BLEU-2,BLEU-3,BLEU-4

import pandas as pd
import nltk
import jieba
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Download the NLTK 'punkt' data package.
# NOTE(review): tokenization in this file goes through jieba, so this download
# may be unnecessary — confirm before removing.
nltk.download('punkt')

def preprocess_text(text):
    """Segment text into a list of word tokens using jieba.

    Args:
        text: The text to segment. ``None`` and NaN (how pandas represents an
            empty spreadsheet cell) are treated as empty text. Values that are
            neither ``str`` nor ``bytes`` (e.g. numeric cells) are converted
            with ``str()`` before segmentation.

    Returns:
        list: The tokens returned by ``jieba.lcut``, or an empty list when the
        input is missing or empty.
    """
    # NaN is the only float that compares unequal to itself, so this detects
    # missing cells without needing an extra import.
    if text is None or (isinstance(text, float) and text != text):
        return []

    # str and bytes are passed through unchanged; anything else is stringified
    # so that jieba does not fail on non-text scalars.
    if not isinstance(text, (str, bytes)):
        text = str(text)

    if not text:
        return []

    return jieba.lcut(text)

def calculate_bleu_scores(reference, candidate):
    """Compute sentence-level BLEU-1 through BLEU-4 for one text pair.

    Both texts are tokenized with ``preprocess_text`` and scored with NLTK's
    ``sentence_bleu`` against a single reference, using
    ``SmoothingFunction().method1`` as the smoothing function.

    Args:
        reference: The reference (ground-truth) text.
        candidate: The candidate text to evaluate.

    Returns:
        dict: Maps ``'BLEU-1'``, ``'BLEU-2'``, ``'BLEU-3'`` and ``'BLEU-4'``
        to the corresponding score.
    """
    reference_tokens = preprocess_text(reference)
    candidate_tokens = preprocess_text(candidate)

    smoothing_function = SmoothingFunction().method1

    # Uniform weights over the first N n-gram orders. Exact thirds are used
    # for BLEU-3 so that the weights sum to 1 (0.33 * 3 == 0.99 would bias
    # the geometric mean).
    ngram_weights = {
        'BLEU-1': (1, 0, 0, 0),
        'BLEU-2': (0.5, 0.5, 0, 0),
        'BLEU-3': (1 / 3, 1 / 3, 1 / 3, 0),
        'BLEU-4': (0.25, 0.25, 0.25, 0.25),
    }

    return {
        name: sentence_bleu(
            [reference_tokens],
            candidate_tokens,
            weights=weights,
            smoothing_function=smoothing_function,
        )
        for name, weights in ngram_weights.items()
    }

def main(
    input_path='读取文件路径',
    output_path='保存路径',
    reference_column='参考答案',
    candidate_columns=('A', 'B', 'C', 'AB', 'AC', 'BC'),
):
    """Score candidate columns of a spreadsheet against a reference column.

    Reads the workbook at ``input_path``, computes BLEU-1 to BLEU-4 for every
    row of every candidate column against the reference column, appends the
    scores as new columns named ``'<metric>_<candidate column>'`` (for example
    ``'BLEU-1_A'``), and writes the result to ``output_path``.

    Args:
        input_path: Path of the xlsx file to read. The default is a
            placeholder and must be replaced with a real path.
        output_path: Path of the xlsx file to write. The default is a
            placeholder; NOTE(review): pandas appears to choose the writer
            from the file extension, so this should end in ``.xlsx``.
        reference_column: Name of the column holding the reference text.
        candidate_columns: Names of the columns holding candidate texts.

    Raises:
        KeyError: If the reference column or any candidate column is missing
            from the workbook.
    """
    df = pd.read_excel(input_path)

    # Fail before doing any scoring work if the sheet layout is not as expected.
    missing = [
        column
        for column in (reference_column, *candidate_columns)
        if column not in df.columns
    ]
    if missing:
        raise KeyError(f"Columns not found in {input_path!r}: {missing}")

    references = df[reference_column]
    metric_names = ('BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4')

    for candidate in candidate_columns:
        # One score dict per row, then fanned out into one new column per metric.
        row_scores = [
            calculate_bleu_scores(ref, cand)
            for ref, cand in zip(references, df[candidate])
        ]
        for metric in metric_names:
            df[f'{metric}_{candidate}'] = [scores[metric] for scores in row_scores]

    # Save the augmented table to a new workbook, without the index column.
    df.to_excel(output_path, index=False)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

  • 4
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值