NLP: Computing Sentence Probability in Python (Maximum Likelihood Estimation, Add-One Smoothing, Perplexity, Random Sentence Generation)
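
The functions below use the math and random modules and four helpers (preprocess, sub_unknown, unigrams, bigrams) that are not shown. Here is a minimal sketch of those helpers, assuming lowercase whitespace tokenization with <s>/</s> sentence markers, an <unk> placeholder for rare and unseen words, and bigram counts that skip sentence-final </s>; these are plausible stand-ins rather than the original implementations.

import math
import random
from collections import Counter


def preprocess(corpus):
    """Lowercase each line and wrap it in <s> ... </s> sentence markers."""
    tokens = []
    for line in corpus.strip().splitlines():
        tokens += ["<s>"] + line.lower().split() + ["</s>"]
    return tokens


def unigrams(tokens):
    """Map each word to its count, e.g. {'<s>': 2, 'i': 2}."""
    return dict(Counter(tokens))


def bigrams(tokens):
    """Map each adjacent pair 'w1 w2' to its count, skipping pairs that
    would start at a sentence-final </s> marker."""
    pairs = (a + " " + b for a, b in zip(tokens, tokens[1:]) if a != "</s>")
    return dict(Counter(pairs))


def sub_unknown(train_tokens, test_tokens):
    """Replace training words seen only once, and test words missing from
    the resulting vocabulary, with the <unk> placeholder."""
    counts = Counter(train_tokens)
    train_tokens = [t if counts[t] > 1 or t in ("<s>", "</s>") else "<unk>"
                    for t in train_tokens]
    vocab = set(train_tokens)
    test_tokens = [t if t in vocab else "<unk>" for t in test_tokens]
    return train_tokens, test_tokens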

def unigram_MLE(train_corpus, test_corpus):
    """Estimate the unigram probability of the test set by maximum
    likelihood: simply multiply the relative frequency of each word,
    P(w1 w2 w3) = p(w1) * p(w2) * p(w3), and return the product."""
    train_tokens = preprocess(train_corpus)
    test_tokens = preprocess(test_corpus)
    # Replace unknown words
    train_tokens, test_tokens = sub_unknown(train_tokens, test_tokens)
    print(train_tokens)
    print(test_tokens)
    # Unigram model of the training set
    train_unigram = unigrams(train_tokens)

    Prob_MLE = 1
    for word in test_tokens:
        Prob_MLE *= train_unigram[word] / len(train_tokens)
    return Prob_MLE
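
As a quick check with the sketch helpers above (a hypothetical toy corpus, not from the original post): "cat" and "dog" each occur once in training, so sub_unknown maps both to <unk>, and each of the four test tokens then has probability 2/10.

train = "the cat sat\nthe dog sat"  # 10 training tokens incl. <s>/</s>
test = "the cat"                    # becomes <s> the <unk> </s>
print(unigram_MLE(train, test))     # (2/10) ** 4 = 0.0016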


def bigram_MLE(corpus, before, after):
    """Estimate a bigram probability by maximum likelihood. Takes a corpus,
    the first word, and the following word; returns the probability of the
    bigram (or None if the bigram never occurs in the corpus)."""
    # P(w_n | w_{n-1}) = C(w_{n-1} w_n) / C(w_{n-1})
    tokens = preprocess(corpus)
    unigram = unigrams(tokens)
    bigram = bigrams(tokens)
    before, after = before.lower(), after.lower()
    key = before + " " + after
    if unigram.get(before) is None or bigram.get(key) is None:
        return None
    Counts_together = bigram[key]
    Counts_before = unigram[before]
    print("count of '" + before + " " + after + "': " + str(Counts_together))
    print("count of '" + before + "': " + str(Counts_before))
    return Counts_together / Counts_before
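
For instance, with the assumed preprocess above (again a hypothetical toy corpus):

corpus = "i am happy\ni am sad"
print(bigram_MLE(corpus, "I", "am"))    # C('i am') / C('i') = 2/2 = 1.0
print(bigram_MLE(corpus, "am", "the"))  # unseen bigram -> None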


def bigram_Laplace(train_corpus, test_corpus):
    """Estimate bigram probabilities by maximum likelihood with add-one
    (Laplace) smoothing. Takes a training corpus and a test corpus; returns
    the log probability of the test set."""
    # P_Laplace(w_n | w_{n-1}) = (C(w_{n-1} w_n) + 1) / (C(w_{n-1}) + V),
    # where V is the number of distinct word types in the training set.
    # Logs turn products into sums: ln(MN) = ln(M) + ln(N).
    # To recover the raw probability from the log probability, apply exp().
    train_tokens = preprocess(train_corpus)
    test_tokens = preprocess(test_corpus)
    # Replace unknown words
    train_tokens, test_tokens = sub_unknown(train_tokens, test_tokens)
    # Unigrams and bigrams of the training set
    train_unigram = unigrams(train_tokens)
    train_bigram = bigrams(train_tokens)
    # Bigrams of the test set, with their occurrence counts
    test_bigram = bigrams(test_tokens)
    Prob_log = 0
    for bigram, count in test_bigram.items():
        before, after = bigram.split()
        # Add-one smoothing keeps a single unseen bigram from zeroing out
        # the probability estimate for the whole test set.
        Counts_together = train_bigram.get(bigram, 0) + 1
        # The denominator adds the vocabulary size to offset the inflated
        # numerators (every cell of the sparse count matrix, present or
        # absent, gains 1).
        Counts_before = train_unigram[before] + len(train_unigram)
        # A bigram that occurs `count` times in the test set contributes
        # its log probability that many times.
        Prob_log += count * math.log(Counts_together / Counts_before)
    return Prob_log
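
Worked numbers under the sketch helpers (hypothetical toy data): after sub_unknown the vocabulary is {<s>, i, am, <unk>, </s>}, so V = 5, and each of the four test bigrams has smoothed probability (2 + 1) / (2 + 5) = 3/7.

train = "i am happy\ni am sad"
test = "i am happy"            # becomes <s> i am <unk> </s>
logp = bigram_Laplace(train, test)
print(logp, math.exp(logp))    # 4 * ln(3/7) ≈ -3.389, (3/7) ** 4 ≈ 0.0337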


def unigram_PP(train_corpus, test_corpus):
    # PP(W) = P(w1 w2 ... wN) ** (-1/N)
    # P(w1 w2 w3) = p(w1) * p(w2) * p(w3)
    test_tokens = preprocess(test_corpus)
    Prob_MLE = unigram_MLE(train_corpus, test_corpus)
    return math.pow(Prob_MLE, -(1 / len(test_tokens)))
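
Continuing the unigram toy example: every test token has probability 0.2 and N = 4, so the perplexity collapses to 1/0.2.

train = "the cat sat\nthe dog sat"
test = "the cat"
print(unigram_PP(train, test))  # 0.0016 ** (-1/4) = 5.0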


def bigram_PP(train_corpus, test_corpus):
    # Perplexity: the higher the perplexity, the lower the test-set
    # probability and the worse the model.
    # PP(W) = P(w1 w2 ... wN) ** (-1/N), where N is the test-set length.
    test_tokens = preprocess(test_corpus)
    Prob_log = bigram_Laplace(train_corpus, test_corpus)  # log probability of the test set
    # Equivalent to math.exp(Prob_log) ** (-1/N), but staying in log space
    # avoids underflowing to 0 on long test sets.
    return math.exp(-Prob_log / len(test_tokens))
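
On the smoothed-bigram toy data the test set has N = 5 tokens but only 4 bigrams, so the perplexity is (3/7) ** (-4/5) rather than a clean 7/3 (assuming the sketch helpers above).

train = "i am happy\ni am sad"
test = "i am happy"
print(bigram_PP(train, test))  # exp(3.389 / 5) ≈ 1.97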

def bigramToCfd(bigrams):
    """Turn a dict like {'<s> poems': 1, 'poems by': 1} into a dict of the
    form {'i': [('am', 3), ('was', 2)]}: each first word maps to a list of
    (successor, count) pairs."""
    cfd_dict = {}
    for bigram, num in bigrams.items():
        before, after = bigram.split()
        if before not in cfd_dict:
            cfd_dict[before] = [(after, num)]  # start the list for this condition
        else:
            cfd_dict[before].append((after, num))  # add another sample under this condition
    return cfd_dict
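
A small usage example (hypothetical counts):

cfd = bigramToCfd({"i am": 3, "i was": 2, "am happy": 1})
print(cfd)  # {'i': [('am', 3), ('was', 2)], 'am': [('happy', 1)]}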


def generate_sentence(cfdist, word, num=10):
    """Given an NLTK ConditionalFreqDist, a start word, and a sentence length
    (default 10 words), return a randomly generated sentence (string)."""
    sentence = ""
    # If no samples exist under this condition, the word cannot start a sentence.
    if cfdist[word].N() == 0:
        print("Please change the begin word.")
        return None

    for i in range(num):
        sentence = sentence + word + " "
        # Always taking the most frequent successor tends to fall into local
        # loops, e.g. "life , And the night , And the night , And the".
        # word = cfdist[word].max()
        # Instead, take the three most frequent successors under this
        # condition and pick one at random; most_common() returns
        # (word, count) tuples, so index 0 extracts the word.
        word = random.choice(cfdist[word].most_common(3))[0]
    return sentence
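
A usage sketch, assuming NLTK is installed; nltk.bigrams plus ConditionalFreqDist builds the cfdist this function expects, and the output varies from run to run.

import nltk

tokens = preprocess("i am happy\ni am sad\ni was there")
cfdist = nltk.ConditionalFreqDist(nltk.bigrams(tokens))
print(generate_sentence(cfdist, "i", num=5))  # e.g. "i am sad </s> <s> "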


def random_sentence(cfd_dict, word, num=10):
    """Given a bigram distribution dict, a start word, and a sentence length
    (default 10 words), return a randomly generated sentence (string)."""
    sentence = ""
    word = word.lower()
    if word not in cfd_dict:
        print("Please change the begin word.")
        return None
    for i in range(num):
        sentence = sentence + word + " "
        # Take the three most frequent successors under this condition and
        # pick one at random; each entry is a (word, count) tuple, so index 0
        # extracts the word.
        temp_list = sorted(cfd_dict[word], key=lambda x: x[1], reverse=True)
        # Skip the end-of-sentence marker, since no bigram starts with </s>;
        # stop early if it is the only available successor (re-sampling in a
        # loop, as the original did, could spin forever in that case).
        candidates = [pair for pair in temp_list[:3] if pair[0] != "</s>"]
        if not candidates:
            break
        word = random.choice(candidates)[0]
    return sentence
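
And the same idea with the plain-dict version, built from the sketch bigrams (hypothetical toy corpus); generation may stop early when a word's only successor is </s>.

tokens = preprocess("i am happy\ni am sad\ni was there")
cfd = bigramToCfd(bigrams(tokens))
print(random_sentence(cfd, "<s>", num=8))  # e.g. "<s> i am happy "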