def unigram_MLE(train_corpus, test_corpus):
    """Estimate the unigram probability of a test corpus by maximum likelihood.

    Both corpora are run through ``preprocess`` and then ``sub_unknown``
    (unknown-word replacement); the resulting token lists are printed.
    The relative training frequency of every test token is then multiplied
    together: P(w1 w2 w3) = p(w1) * p(w2) * p(w3).

    Returns:
        The product of the per-token frequencies (the int ``1`` when the
        test corpus yields no tokens).
    """
    # Replace unknown words in both token lists.
    train_words, test_words = sub_unknown(
        preprocess(train_corpus), preprocess(test_corpus)
    )
    print(train_words)
    print(test_words)

    # Unigram counts of the training set.
    counts = unigrams(train_words)
    total = len(train_words)

    probability = 1
    for token in test_words:
        probability *= counts[token] / total
    return probability
def bigram_MLE(corpus, before, after):
    """Estimate P(after | before) from a corpus by maximum likelihood.

    Uses P(w_n | w_{n-1}) = C(w_{n-1} w_n) / C(w_{n-1}). Both words are
    lower-cased before being looked up, and the two counts are printed.

    Args:
        corpus: Raw corpus handed to ``preprocess``.
        before: The preceding word.
        after: The following word.

    Returns:
        The count ratio, or ``None`` when the preceding word or the word
        pair is not found in the corpus counts.
    """
    tokens = preprocess(corpus)
    word_counts = unigrams(tokens)
    pair_counts = bigrams(tokens)

    first = before.lower()
    first_count = word_counts.get(first)
    if first_count is None:
        return None

    pair_count = pair_counts.get(first + " " + after.lower())
    if pair_count is None:
        return None

    print(f"count of '{before} {after}':{pair_count}")
    print(f"count of '{before}':{first_count}")
    return pair_count / first_count
def bigram_Laplace(train_corpus, test_corpus):
    """Return the add-one (Laplace) smoothed bigram log-probability of a test corpus.

    Each bigram is scored with
        P_Laplace(w_n | w_{n-1}) = (C(w_{n-1} w_n) + 1) / (C(w_{n-1}) + V)
    where the counts come from the training corpus and V is the number of
    distinct training words. Natural logs are summed instead of multiplying
    probabilities (ln(MN) = ln M + ln N); apply ``math.exp`` to the result to
    recover the raw probability.

    Every test bigram contributes once per occurrence in the test corpus, so
    repeated bigrams are weighted by their test-set count.

    Args:
        train_corpus: Raw training corpus handed to ``preprocess``.
        test_corpus: Raw test corpus handed to ``preprocess``.

    Returns:
        The summed log-probability (the int ``0`` when the test corpus
        yields no bigrams).
    """
    train_tokens = preprocess(train_corpus)
    test_tokens = preprocess(test_corpus)
    # Replace unknown words in both token lists.
    train_tokens, test_tokens = sub_unknown(train_tokens, test_tokens)

    # Training-set counts; the test set only needs its bigram counts.
    train_unigram = unigrams(train_tokens)
    train_bigram = bigrams(train_tokens)
    test_bigram = bigrams(test_tokens)

    # V: adding it to the denominator balances the +1 added to every
    # possible bigram cell of the (sparse) count matrix.
    vocab_size = len(train_unigram)

    log_prob = 0
    for bigram, occurrences in test_bigram.items():
        before, after = bigram.lower().split()
        # Add-one smoothing keeps an unseen bigram from zeroing the product.
        pair_count = train_bigram.get(before + " " + after, 0) + 1
        # A context word never seen in training counts as 0 rather than
        # raising KeyError; smoothing still yields a valid probability 1/V.
        context_count = train_unigram.get(before, 0) + vocab_size
        log_prob += occurrences * math.log(pair_count / context_count)
    return log_prob
def unigram_PP(train_corpus, test_corpus):
    """Return the unigram perplexity of a test corpus.

    PP(W) = P(w1 w2 ... wN) ** (-1/N), where P is the product returned by
    ``unigram_MLE`` (P(w1 w2 w3) = p(w1) * p(w2) * p(w3)) and N is the number
    of tokens ``preprocess`` yields for the test corpus.
    """
    token_count = len(preprocess(test_corpus))
    sequence_prob = unigram_MLE(train_corpus, test_corpus)
    exponent = -(1 / token_count)
    return math.pow(sequence_prob, exponent)
def bigram_PP(train_corpus, test_corpus):
    """Return the Laplace-smoothed bigram perplexity of a test corpus.

    PP(W) = P(w1 w2 ... wN) ** (-1/N), where N is the number of tokens
    ``preprocess`` yields for the test corpus. A larger perplexity means the
    model assigns the test set a lower probability, i.e. fits it worse.

    Args:
        train_corpus: Raw training corpus.
        test_corpus: Raw test corpus.

    Returns:
        The perplexity as a float.

    Raises:
        ZeroDivisionError: If the test corpus yields no tokens.
    """
    test_tokens = preprocess(test_corpus)
    log_prob = bigram_Laplace(train_corpus, test_corpus)  # ln P(test set)
    # exp(-ln P / N) equals P ** (-1/N) but never materialises P itself.
    # For long test sets exp(ln P) underflows to 0.0, and raising 0.0 to a
    # negative power makes math.pow fail with ValueError.
    return math.exp(-log_prob / len(test_tokens))
def bigramToCfd(bigrams):
    """Group bigram counts by their first word.

    Args:
        bigrams: Dict mapping a space-separated word pair to its count,
            e.g. ``{'<s> poems': 1, 'poems by': 1}``.

    Returns:
        Dict mapping each first word to a list of ``(second_word, count)``
        tuples in input order, e.g. ``{'i': [('am', 3), ('was', 2)]}``.
    """
    grouped = {}
    for pair, count in bigrams.items():
        first, second = pair.split()
        # Start the follower list on first sight, then add this sample.
        grouped.setdefault(first, []).append((second, count))
    return grouped
def generate_sentence(cfdist, word, num=10):
    """Generate a random sentence from a conditional frequency distribution.

    Starting from ``word``, each next word is drawn at random from the three
    most frequent followers of the current word. Always taking the single
    most frequent follower tends to get stuck in local loops such as
    "And the night , And the night , ...", hence the random pick.

    Args:
        cfdist: Conditional frequency distribution (the original docstring
            names ``ConditionalFreqDist``); ``cfdist[w]`` must provide
            ``N()`` and ``most_common(k)``.
        word: Starting word.
        num: Maximum number of words to emit (default 10).

    Returns:
        The sentence, each word followed by a single space, or ``None``
        (after printing a hint) when the starting word has no followers.
        The sentence is shorter than ``num`` words if generation reaches a
        word that has no followers.
    """
    # A condition with zero samples cannot serve as a starting word.
    if cfdist[word].N() == 0:
        print("Please change a begin word.")
        return None

    words = []
    for _ in range(num):
        words.append(word)
        candidates = cfdist[word].most_common(3)
        if not candidates:
            # Dead end: random.choice would raise IndexError on an empty
            # list, so stop with what has been generated so far.
            break
        # Each candidate is a (word, count) tuple; keep only the word.
        word = random.choice(candidates)[0]
    return "".join(w + " " for w in words)
def random_sentence(cfd_dict, word, num=10):
    """Generate a random sentence from a bigram follower dictionary.

    Starting from ``word`` (lower-cased), each next word is drawn at random
    from the three most frequent followers of the current word, skipping the
    end marker ``</s>`` because no entry starts with it.

    Args:
        cfd_dict: Dict mapping a word to a list of ``(follower, count)``
            tuples, e.g. ``{'i': [('am', 3), ('was', 2)]}``.
        word: Starting word; matched case-insensitively.
        num: Maximum number of words to emit (default 10).

    Returns:
        The sentence, each word followed by a single space, or ``None``
        (after printing a hint) when the starting word has no followers.
        The sentence is shorter than ``num`` words if generation reaches a
        word whose top followers are all ``</s>`` or that has no entry.
    """
    # Normalise before the lookup so the check and the loop use the same key.
    word = word.lower()
    if not cfd_dict.get(word):
        print("Please change a begin word.")
        return None

    words = []
    for _ in range(num):
        words.append(word)
        followers = cfd_dict.get(word)
        if not followers:
            break
        top_three = sorted(followers, key=lambda item: item[1], reverse=True)[:3]
        # Dropping "</s>" up front gives the same distribution as redrawing
        # on "</s>", but cannot spin forever when it is the only option.
        candidates = [follower for follower, _ in top_three if follower != "</s>"]
        if not candidates:
            break
        word = random.choice(candidates)
    return "".join(w + " " for w in words)
NLP:Python计算句子概率(最大似然估计法,加一平滑,困惑度,随机句子生成)
于 2022-07-05 10:50:43 首次发布