# 请同学们使用免费的中文分词语料库,如人民日报语料库PKU,使用语料库中的常见词编写一个句子,使用二元语法(即每个词只与和它相邻的前一个词有关)在语料库中对句子中的词进行词频统计,输出句子的出现概率。
def load_counts(path):
    """Load a count table that was saved as the ``str()`` of a dict.

    Args:
        path: Path of a UTF-8 text file containing a Python dict literal.

    Returns:
        The dict parsed from the file.

    Raises:
        ValueError / SyntaxError: If the file holds anything other than a
            plain Python literal.
    """
    import ast  # local import: this script has no top-level import section

    with open(path, 'r', encoding='utf-8') as f:
        # literal_eval accepts only literals, so a tampered count file cannot
        # execute arbitrary code the way eval() would.
        return ast.literal_eval(f.read())


def bigram_probability(prev_word, word, bigram_counts, unigram_counts):
    """Return the estimate of P(word | prev_word) used by this script.

    A bigram present in ``bigram_counts`` gets count(prev, word) / count(prev).
    A bigram that is absent gets the fallback 1 / (count(prev) + 1).

    Args:
        prev_word: The preceding token.
        word: The current token.
        bigram_counts: Mapping from ``'prev-word'`` keys to counts.
        unigram_counts: Mapping from tokens to counts.

    Raises:
        KeyError: If ``prev_word`` is not in ``unigram_counts``.
    """
    # Looked up first so that an unknown previous word fails with one clear
    # KeyError instead of an exception raised inside an exception handler.
    prev_count = unigram_counts[prev_word]
    pair_count = bigram_counts.get('{}-{}'.format(prev_word, word))
    if pair_count is None:
        return 1 / (prev_count + 1)  # smoothing for unseen bigrams
    return pair_count / prev_count


def sentence_probability(words, bigram_counts, unigram_counts):
    """Print each bigram term of a sentence and return their product.

    Args:
        words: Token list, expected to start with 'BOS' and end with 'EOS'.
        bigram_counts: Mapping from ``'prev-word'`` keys to counts.
        unigram_counts: Mapping from tokens to counts.

    Returns:
        The product of the per-bigram probabilities (1.0 when there are
        fewer than two tokens).
    """
    probability = 1.0  # deliberately not named `sum`, which would hide the builtin
    for prev_word, word in zip(words, words[1:]):
        term = bigram_probability(prev_word, word, bigram_counts, unigram_counts)
        print('{}-{}'.format(prev_word, word), term)
        probability *= term
    return probability


if __name__ == '__main__':
    tmp_dict = load_counts(r'D:\A学习\自然语言处理\实验\2\training_pair.txt')
    tmp_dict_single = load_counts(r'D:\A学习\自然语言处理\实验\2\training_one.txt')
    sentences = ['BOS 世界 人民 大 团结 万岁 EOS', 'BOS 中文 的 工具 EOS']
    for sentence in sentences:
        print(sentence_probability(sentence.split(), tmp_dict, tmp_dict_single))
        print()
# Tokens treated as sentence terminators. ':' is listed twice, exactly as in
# the original list; duplicates are harmless in a set.
punctuation = {'、', '——', ':', '。', '!', '“', '”', '?', '《', '》', '(', ')', '’', ':', ','}


def mark_boundaries(tokens, marks):
    """Return a copy of ``tokens`` with punctuation turned into boundaries.

    Every token found in ``marks`` becomes 'EOS'. A 'BOS' is inserted right
    after it unless the punctuation is the last token of the line (the next
    line's 'BOS' is added by the caller in that case).
    """
    marked = []
    last = len(tokens) - 1
    for position, token in enumerate(tokens):
        if token in marks:
            marked.append('EOS')
            if position != last:
                marked.append('BOS')
        else:
            marked.append(token)
    return marked


def count_ngrams(lines, marks=None):
    """Count bigrams and unigrams over whitespace-segmented corpus lines.

    Sentences are delimited by the tokens in ``marks``; a sentence may span
    several lines when a line does not end with one of them.

    Args:
        lines: Iterable of strings, each a whitespace-separated token sequence.
        marks: Set of sentence-ending tokens; defaults to the module-level
            ``punctuation`` set.

    Returns:
        A ``(bigram_counts, unigram_counts)`` tuple of plain dicts. Bigram
        keys have the form ``'prev-next'``; no bigram starts with 'EOS'.
    """
    if marks is None:
        marks = punctuation
    pair_counts = {}
    word_counts = {}
    starts_sentence = True  # did the previous line end with punctuation?
    carried = 'EOS'         # last token of the previous line
    for line in lines:
        tokens = line.split()
        if not tokens:
            # A blank line has no last token to inspect; skip it instead of
            # failing with IndexError.
            continue
        if starts_sentence:
            tokens.insert(0, 'BOS')
        starts_sentence = tokens[-1] in marks
        tokens = mark_boundaries(tokens, marks)
        # If the previous line stopped mid-sentence, prepend its last token so
        # the bigram that straddles the line break is not lost.
        has_carry = carried != 'EOS'
        if has_carry:
            tokens.insert(0, carried)
        for left, right in zip(tokens, tokens[1:]):
            if left != 'EOS':
                key = '{}-{}'.format(left, right)
                pair_counts[key] = pair_counts.get(key, 0) + 1
        # The carried token was already counted as a unigram on its own line;
        # counting it again would inflate the denominator of its bigrams.
        for token in (tokens[1:] if has_carry else tokens):
            word_counts[token] = word_counts.get(token, 0) + 1
        carried = tokens[-1]
    return pair_counts, word_counts


if __name__ == '__main__':
    # NOTE(review): no encoding is passed, so the platform default is used.
    # Presumably the corpus file matches the local default encoding - confirm
    # before pinning one here.
    with open(r'D:\A学习\自然语言处理\实验\2\pku_training.txt') as f:
        dic, dic_single = count_ngrams(f)
    print('sum:', sum(dic.values()))
    # Written as plain dict literals so the scoring script can parse them back.
    with open(r'D:\A学习\自然语言处理\实验\2\training_pair.txt', 'w', encoding='utf-8') as f:
        f.write(str(dic))
    with open(r'D:\A学习\自然语言处理\实验\2\training_one.txt', 'w', encoding='utf-8') as f:
        f.write(str(dic_single))