NLP：Python计算句子概率（最大似然估计法，加一平滑，困惑度，随机句子生成）

海绵大星星

已于 2022-07-05 10:56:56 修改

阅读量1.8k

点赞数

分类专栏： Python NLP 文章标签：自然语言处理 python 人工智能

于 2022-07-05 10:50:43 首次发布

本文链接：https://blog.csdn.net/qq_43552032/article/details/125615273

版权

Python 同时被 2 个专栏收录

7 篇文章 0 订阅

订阅专栏

NLP

3 篇文章 1 订阅

订阅专栏

def unigram_MLE(train_corpus, test_corpus):
    """Estimate the unigram maximum-likelihood probability of the test corpus.

    The probability is the product of the per-token relative frequencies,
    P(w1 w2 w3) = p(w1) * p(w2) * p(w3), where each p(w) is the count of w in
    the training tokens divided by the number of training tokens.

    Both corpora are run through ``preprocess`` and then ``sub_unknown``
    (per the original comment, this replaces unknown words) before counting.

    Side effect: prints the training and test token lists.

    Returns:
        The product of the relative frequencies (the int ``1`` when the test
        token list is empty).
    """
    # Tokenise both corpora, then substitute unknown words.
    train_tokens, test_tokens = sub_unknown(
        preprocess(train_corpus), preprocess(test_corpus)
    )
    print(train_tokens)
    print(test_tokens)

    # Unigram counts of the training set.
    counts = unigrams(train_tokens)

    probability = 1
    for token in test_tokens:
        relative_frequency = counts[token] / len(train_tokens)
        probability = probability * relative_frequency
    return probability


def bigram_MLE(corpus, before, after):
    """Estimate the maximum-likelihood probability of one bigram.

    Uses P(wn | wn-1) = C(wn-1 wn) / C(wn-1), with counts taken from the
    preprocessed corpus. Both words are lower-cased before lookup.

    Side effect: prints the bigram count and the count of the first word
    when both are found.

    Args:
        corpus: Corpus passed to ``preprocess``.
        before: The leading (conditioning) word.
        after: The following word.

    Returns:
        The bigram probability, or ``None`` when either the leading word or
        the bigram is not found in the corpus counts.
    """
    tokens = preprocess(corpus)
    unigram_counts = unigrams(tokens)
    bigram_counts = bigrams(tokens)

    # Guard clauses: bail out as soon as a required count is missing.
    context_key = before.lower()
    if unigram_counts.get(context_key) is None:
        return None
    pair_key = context_key + " " + after.lower()
    if bigram_counts.get(pair_key) is None:
        return None

    pair_count = bigram_counts[pair_key]
    context_count = unigram_counts[context_key]
    print("count of '" + before + ' ' + after + "':" + str(pair_count))
    print("count of '" + before + "':" + str(context_count))
    return pair_count / context_count


def bigram_Laplace(train_corpus, test_corpus):
    """Return the add-one (Laplace) smoothed bigram log probability of the test set.

    For every bigram of the test set:

        P_Laplace(wn | wn-1) = (C(wn-1 wn) + 1) / (C(wn-1) + V)

    where the counts come from the training set and V is the number of
    distinct unigrams in the training set. Log probabilities are summed
    (ln(MN) = ln(M) + ln(N)); apply ``math.exp`` to the result to recover the
    raw probability.

    Args:
        train_corpus: Training corpus, passed to ``preprocess``.
        test_corpus: Test corpus, passed to ``preprocess``.

    Returns:
        The natural-log probability of the test set (``0`` when the test set
        yields no bigrams).
    """
    train_tokens = preprocess(train_corpus)
    test_tokens = preprocess(test_corpus)
    # Substitute unknown words.
    train_tokens, test_tokens = sub_unknown(train_tokens, test_tokens)

    # Counts from the training set; bigrams of the test set.
    train_unigram = unigrams(train_tokens)
    train_bigram = bigrams(train_tokens)
    test_bigram = bigrams(test_tokens)

    # V: number of distinct word types in the training set. Added to every
    # denominator to balance the +1 added to every numerator.
    vocab_size = len(train_unigram)

    log_prob = 0
    # The bigram mapping is used as bigram -> count elsewhere in this block,
    # so a bigram occurring k times in the test set must contribute k times;
    # iterating the keys alone would count each distinct bigram only once.
    for bigram, occurrences in test_bigram.items():
        before, after = bigram.split()
        before = before.lower()
        # Add-one smoothing keeps an unseen bigram from zeroing the whole
        # estimate.
        pair_count = train_bigram.get(before + " " + after.lower(), 0) + 1
        # An unseen context word counts as 0 instead of raising KeyError.
        context_count = train_unigram.get(before, 0) + vocab_size
        log_prob += occurrences * math.log(pair_count / context_count)
    return log_prob


def unigram_PP(train_corpus, test_corpus):
    """Return the unigram perplexity of the test corpus.

    PP(W) = P(w1 w2 ... wN) ** -(1 / N), where P is the unigram MLE
    probability returned by ``unigram_MLE`` (p(w1) * p(w2) * ...) and N is
    the number of tokens ``preprocess`` yields for the test corpus.
    """
    token_total = len(preprocess(test_corpus))
    sequence_probability = unigram_MLE(train_corpus, test_corpus)
    exponent = -(1 / token_total)
    return math.pow(sequence_probability, exponent)


def bigram_PP(train_corpus, test_corpus):
    """Return the bigram perplexity of the test corpus under add-one smoothing.

    A larger perplexity means a smaller test-set probability, i.e. a worse
    model.

        PP(W) = P(w1 w2 ... wN) ** -(1 / N)

    where N is the number of tokens ``preprocess`` yields for the test
    corpus and P is the probability whose logarithm ``bigram_Laplace``
    returns.

    Raises:
        ZeroDivisionError: If the test corpus yields no tokens.
    """
    test_tokens = preprocess(test_corpus)
    log_prob = bigram_Laplace(train_corpus, test_corpus)  # log probability of the test set
    # Stay in log space: exp(log_prob) underflows to 0.0 for longer test
    # sets, and raising 0.0 to a negative power fails with a math domain
    # error. exp(-log_prob / N) is the same quantity without the underflow.
    return math.exp(-log_prob / len(test_tokens))

def bigramToCfd(bigrams):
    """Group bigram counts by their first word.

    Takes a mapping such as ``{'<s> poems': 1, 'poems by': 1}`` and returns a
    dictionary such as ``{'i': [('am', 3), ('was', 2)]}``: each first word
    maps to a list of ``(second_word, count)`` tuples, in the order the
    bigrams appear in the input.
    """
    grouped = {}
    for pair, count in bigrams.items():
        head, tail = pair.split()
        # Create the list on first sight of this head word, then append.
        grouped.setdefault(head, []).append((tail, count))
    return grouped


def generate_sentence(cfdist, word, num=10):
    """Generate a random sentence from a conditional frequency distribution.

    Args:
        cfdist: Conditional frequency distribution (e.g. a
            ConditionalFreqDist); ``cfdist[w]`` must provide ``N()`` and
            ``most_common(k)``.
        word: The starting word.
        num: Maximum number of words in the sentence (default 10).

    Returns:
        The generated sentence, each word followed by a single space, or
        ``None`` (after printing a hint) when the starting word has no
        samples. The sentence is shorter than ``num`` words if a word with
        no successors is reached.
    """
    # A condition with zero samples cannot start a sentence.
    if cfdist[word].N() == 0:
        print("Please change a begin word.")
        return None

    words = []
    for _ in range(num):
        words.append(word)
        # Always taking the most frequent successor tends to get stuck in
        # local loops (e.g. "And the night , And the night , ..."), so pick
        # randomly among the three most frequent successors instead.
        candidates = cfdist[word].most_common(3)
        if not candidates:
            # Dead end: random.choice would raise IndexError on an empty
            # list, so stop and return what has been generated so far.
            break
        word = random.choice(candidates)[0]
    return "".join(token + " " for token in words)


def random_sentence(cfd_dict, word, num=10):
    """Generate a random sentence from a bigram distribution dictionary.

    Args:
        cfd_dict: Mapping of a word to a list of ``(next_word, count)``
            tuples, as produced by ``bigramToCfd``.
        word: The starting word; it is lower-cased before any lookup.
        num: Maximum number of words in the sentence (default 10).

    Returns:
        The generated sentence, each word followed by a single space, or
        ``None`` (after printing a hint) when the starting word has no
        successors in ``cfd_dict``. The sentence is shorter than ``num``
        words if a word whose only successor is the end marker is reached.
    """
    # Lower-case first so the start-word check uses the same key as the
    # lookups in the loop below.
    word = word.lower()
    if not cfd_dict.get(word):
        print("Please change a begin word.")
        return None

    words = []
    for _ in range(num):
        words.append(word)
        # Take the three most frequent successors and pick one at random.
        ranked = sorted(cfd_dict.get(word) or (), key=lambda pair: pair[1], reverse=True)
        # Never continue with the end marker: no bigram starts with "</s>".
        # Filtering it out up front (instead of redrawing until something
        # else comes up) cannot loop forever when it is the only candidate.
        candidates = [successor for successor, _count in ranked[:3] if successor != "</s>"]
        if not candidates:
            break
        word = random.choice(candidates)
    return "".join(token + " " for token in words)