作业要求 : 读入中文语料库(了不起的盖茨比)gaicibi.txt,对文本进行预处理操作,包括分词,换行,删除空格,符号等。统计语料库中的每个词的unigram,bigram,trigram的概率,以及测试句子在unigram,bigramm,trigram下的概率。
导入库
import re
from collections import defaultdict
import jieba
读取语料库
with open('gaicibi.txt', 'r', encoding='utf-8') as file:
text = file.read()
# print(text)
文本预处理
- 删除多余空白符 :!。?等中文符号分隔句子,加入句子的开始和结束标志
- 使用jieba库完成分词,并统计词表
# 去除非中文字符
pattern = r'[^\u4e00-\u9fa5\s]'
text = re.sub(pattern, '', text)
# 将多个空白符统一用一个换行符替换
pattern = r'\s+'
text = re.sub(pattern, '\n', text)
# 以换行符作为句子分割单位,句子用jieba.lcut分词,前后加入开始<sos>和结束标志<eos>
sentences = re.split(r"\n",text)
cutted_words = []
for sentence in sentences:
sentence = jieba.lcut(sentence)
sentence.insert(0,"<BOS>")
sentence.append("<EOS>")
cutted_words.append(sentence)
# print(cutted_words)
# 词表统计
words = {}
for sentence in sentences:
for item in sentence:
words[item] = words.get(item,0) + 1
print(len(words))
N-grams语言模型
计算n元词组的出现次数
single_word = {}
double_word = {}
trible_word = {}
for line in cutted_words:
for i in range(2,len(line)):
# 三元
item = line[i] + line[i - 1] + line[i - 2]
trible_word[item] = trible_word.get(item,0) + 1
for i in range(1, len(line)):
# 二元
item = line[i] + line[i - 1]
double_word[item] = double_word.get(item,0) + 1
for s in line:
# 一元
single_word[s] = single_word.get(s,0) + 1
# print(single_word)
# print(double_word)
# print(trible_word)
定义一个函数计算句子的n-gram-probability;进行加1平滑
def ngram_prob(sentence):
sentence = jieba.lcut(sentence)
sentence.insert(0,"<BOS>")
sentence.append("<EOS>")
n = len(sentence)
single_prob = 1
for i in range(n):
item = sentence[i]
single_prob *= single_word.get(item,1) / (2 * len(single_word))
double_prob = 1
for i in range(1,n):
item = sentence[i - 1] + sentence[i]
double_prob *= double_word.get(item,1) / (2 * len(double_word))
trible_prob = 1
for i in range(2,n):
item = sentence[i - 2] + sentence[i - 1] + sentence[i]
trible_prob *= trible_word.get(item,1) / (2 * len(trible_word))
return single_prob,double_prob,trible_prob
sentences = ["我是盖茨比","我喜欢丽莎","盖茨比的名字是什么"]
for sentence in sentences:
print(ngram_prob(sentence))
输出:
(7.061443970243689e-09, 1.416595693431641e-23, 1.0684673188093855e-20)
(6.48782640384386e-11, 3.461782115895349e-20, 1.0509230854345353e-15)
(2.3836563959058e-13, 1.7571325467189906e-34, 1.1044393314992436e-30)
基于2-gram 模型生成句子
def model_2gram(prefix):
# 基于二元模型预测生成的句子
prefix = jieba.lcut(prefix)
if len(prefix) >= 1:
bigram_maxprob = {}
else:
print("输入文本太短")
return ""
for word in words.keys():
item=prefix[-1] + word
bigram_maxprob[word] = double_word.get(item,1) / len(double_word) * 2
next = (sorted(bigram_maxprob.items(),key = lambda item:item[1],reverse = True))[0][0]
return next
prefix ="我喜欢"
out = ""
while len(out)<1 or out != '<EOS>':
out = model_2gram(prefix)
prefix = prefix+out
if len(prefix) > 100:
break
print(prefix)
输出:
我喜欢我了过见去里了过见去里了过见去里了过见去里了过见去里了过见去里了过见去里了过见去里了过见去里了过见去里了过见去里了过见去里了过见去里了过见去里了过见去里了过见去里了过见去里了过见去里了过见去里了过
# 基于bigram语言模型,用ppl(困惑度)评估生成文本的质量
import math
# 计算句子的二元模型概率
def ngram_prob(sentence):
sentence = jieba.lcut(sentence)
sentence.insert(0, "<BOS>")
sentence.append("<EOS>")
n = len(sentence)
double_prob = 1
for i in range(1, n):
item = sentence[i - 1] + sentence[i]
double_prob *= double_word.get(item, 1) / (2 * len(double_word))
return double_prob
total_log_prob = 0
num_tokens = 0
for sentence in sentences:
prob = ngram_prob(sentence)
# 计算句子中的标记数(不包括开始和结束标志)
num_tokens += len(jieba.lcut(sentence)) - 2
# 计算句子的对数概率并累加
total_log_prob += math.log(prob)
# 计算困惑度
ppl = math.pow(2, - total_log_prob / num_tokens)
print(ppl)
输出:
34040791.757951364