Table of Contents
I. Introduction to Statistical Language Models
1. The language modeling task
2. Language model examples
3. Language model formulation
3.1 Bayes' formula
3.2 Probability basics
3.3 Definition of a language model
Worked example
3.4 Language model summary
3.5 Methods for improving language models
4. Language model usage example
In[1]:
from collections import Counter
import numpy as np

"""Corpus"""
corpus = '''她的菜很好 她的菜很香 她的他很好 他的菜很香 他的她很好
很香的菜 很好的她 很菜的他 她的好 菜的香 他的菜 她很好 他很菜 菜很好'''.split()

"""Corpus preprocessing"""
counter = Counter()  # word (character) frequency counts
for sentence in corpus:
    for word in sentence:
        counter[word] += 1
counter = counter.most_common()
lec = len(counter)
word2id = {counter[i][0]: i for i in range(lec)}
id2word = {i: w for w, i in word2id.items()}
print(word2id)
print(id2word)
Output:
{'的': 0, '很': 1, '菜': 2, '她': 3, '好': 4, '他': 5, '香': 6}
{0: '的', 1: '很', 2: '菜', 3: '她', 4: '好', 5: '他', 6: '香'}
In[2]:
"""N-gram建模训练"""
unigram = np.array([i[1] for i in counter]) / sum(i[1] for i in counter)
print(unigram)
bigram = np.zeros((lec, lec)) # + 1e-8
for sentence in corpus:
sentence = [word2id[w] for w in sentence]
print(sentence)
for i in range(1, len(sentence)):
bigram[[sentence[i - 1]], [sentence[i]]] += 1 # 对应词表位置词频加一(第一个词是3并且第二个词是0的情况词频加一)
for i in range(lec):
bigram[i] /= bigram[i].sum() # 对词频归一化,变为概率
print(bigram)
Output:
[0.2 0.2 0.16363636 0.12727273 0.12727273 0.10909091
0.07272727]
[3, 0, 2, 1, 4]
[3, 0, 2, 1, 6]
[3, 0, 5, 1, 4]
[5, 0, 2, 1, 6]
[5, 0, 3, 1, 4]
[1, 6, 0, 2]
[1, 4, 0, 3]
[1, 2, 0, 5]
[3, 0, 4]
[2, 0, 6]
[5, 0, 2]
[3, 1, 4]
[5, 1, 2]
[2, 1, 4]
[[0. 0. 0.45454545 0.18181818 0.09090909 0.18181818
0.09090909]
[0. 0. 0.18181818 0. 0.54545455 0.
0.27272727]
[0.33333333 0.66666667 0. 0. 0. 0.
0. ]
[0.66666667 0.33333333 0. 0. 0. 0.
0. ]
[1. 0. 0. 0. 0. 0.
0. ]
[0.6 0.4 0. 0. 0. 0.
0. ]
[1. 0. 0. 0. 0. 0.
0. ]]
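Entry (i, j) of the matrix above is P(word j | word i). A single entry can be read out like this (a small illustrative snippet I added, not part of the original notebook):

print(bigram[word2id['她'], word2id['的']])  # ≈ 0.6667, i.e. P('的' | '她'), matching the 她 row above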
In[3]:
"""句子概率"""
def prob(sentence):
s = [word2id[w] for w in sentence]
les = len(s)
if les < 1:
return 0
p = unigram[s[0]]
if les < 2:
return p
for i in range(1, les):
p *= bigram[s[i - 1], s[i]]
return p
print('很好的菜', prob('很好的菜'))
print('菜很好的', prob('菜很好的'))
print('菜好的很', prob('菜好的很'))
Output:
很好的菜 0.04958677685950413
菜很好的 0.05950413223140495
菜好的很 0.0
In[4]:
"""排列组合"""
def permutation_and_combination(ls_ori, ls_all=None):
ls_all = ls_all or [[]]
le = len(ls_ori)
if le == 1:
ls_all[-1].append(ls_ori[0])
ls_all.append(ls_all[-1][: -2])
return ls_all
for i in range(le):
ls, lsi = ls_ori[:i] + ls_ori[i + 1:], ls_ori[i]
ls_all[-1].append(lsi)
ls_all = permutation_and_combination(ls, ls_all)
if ls_all[-1]:
ls_all[-1].pop()
else:
ls_all.pop()
return ls_all
print('123排列组合', permutation_and_combination([1, 2, 3]))
"""给定词组,返回最大概率组合的句子"""
def max_prob(words):
pc = permutation_and_combination(words) # 生成排列组合
p, w = max((prob(s), s) for s in pc)
return p, ''.join(w)
print(*max_prob(list('香很的菜')))
print(*max_prob(list('好很的他菜')))
print(*max_prob(list('好很的的她菜')))
Output:
123排列组合 [[1, 2, 3], [1, 3, 2], [2, 1, 3], [2, 3, 1], [3, 1, 2], [3, 2, 1]]
0.029752066115702476 菜很香的
0.01081893313298272 菜很好的他
0.014024542950162781 她的菜很好的
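One of the improvement methods listed under 3.5 is smoothing: the unsmoothed bigram model assigns probability 0 to any sentence containing an unseen bigram (e.g. '菜好的很' above). Below is a minimal add-one (Laplace) smoothing sketch of my own, not part of the original notebook; it reuses corpus, word2id, unigram and lec from In[1]/In[2].

"""Add-one (Laplace) smoothing -- an illustrative sketch"""
bigram_counts = np.zeros((lec, lec))
for sentence in corpus:
    s = [word2id[w] for w in sentence]
    for i in range(1, len(s)):
        bigram_counts[s[i - 1], s[i]] += 1
# add 1 to every count, then renormalize each row (the denominator grows by the vocabulary size lec)
bigram_smooth = (bigram_counts + 1) / (bigram_counts.sum(axis=1, keepdims=True) + lec)

def prob_smooth(sentence):
    s = [word2id[w] for w in sentence]
    p = unigram[s[0]]
    for i in range(1, len(s)):
        p *= bigram_smooth[s[i - 1], s[i]]
    return p

print('菜好的很', prob_smooth('菜好的很'))  # small, but no longer exactly 0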
II. Evaluating Language Models
1. Information theory: entropy
2. Relative entropy (KL divergence)
3. Cross-entropy
4. Perplexity (a small sketch follows after this list)
(The perplexity material is skipped for now; revisit it in detail if it is needed in later practical work.)
5. Limitations of traditional language models
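As a concrete illustration of perplexity (my own sketch, not from the original notes): for a bigram model it is the exponential of the average negative log-probability per word. Applied to the toy model from part I (reusing corpus, word2id, unigram and bigram defined there):

def perplexity(sentences):
    """exp(average negative log-probability per word) under the unsmoothed bigram model"""
    log_p, n = 0.0, 0
    for sent in sentences:
        ids = [word2id[w] for w in sent]
        log_p += np.log(unigram[ids[0]])
        for i in range(1, len(ids)):
            log_p += np.log(bigram[ids[i - 1], ids[i]])
        n += len(ids)
    return np.exp(-log_p / n)

print(perplexity(corpus))  # perplexity measured on the training corpus itself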
III. Introduction to Neural Language Models
1. Neural language models
One-hot as the hidden layer
Summary of the mathematical formulas
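As a reference point for the formula summary, the forward computation implemented by the code in section 2 below can be written as follows (my reconstruction from that code; the original NNLM of Bengio et al. also has a direct word-to-output connection, which this implementation omits):

x = [C(w_{t-n+1}); \dots; C(w_{t-1})]
y = b + U \tanh(d + xW)
P(w_t \mid w_{t-n+1}, \dots, w_{t-1}) = \mathrm{softmax}(y)_{w_t}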
2. Implementation
Code:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
from torch.autograd import Variable

dtype = torch.FloatTensor

sentences = ["i like dog", "i love coffee", "i hate milk", "i do nlp"]  # corpus
word_list = ' '.join(sentences).split()  # vocabulary: join the sentences with spaces, then split into words
word_list = list(set(word_list))  # a set removes duplicates; convert back to a list for later processing
word_dict = {w: i for i, w in enumerate(word_list)}  # word -> index dictionary
number_dict = {i: w for i, w in enumerate(word_list)}
# print(word_dict)
n_class = len(word_dict)
m = 2         # word-embedding dimension
n_step = 2    # number of context words
n_hidden = 2  # hidden-layer size

def make_batch(sentence):
    input_batch = []
    target_batch = []
    for sen in sentence:
        word = sen.split()
        input = [word_dict[n] for n in word[:-1]]
        target = word_dict[word[-1]]
        input_batch.append(input)
        target_batch.append(target)
    return input_batch, target_batch

class NNLM(nn.Module):
    def __init__(self):
        super(NNLM, self).__init__()
        self.embed = nn.Embedding(n_class, m)  # m is the word-embedding dimension
        self.W = nn.Parameter(torch.randn(n_step * m, n_hidden).type(dtype))
        self.d = nn.Parameter(torch.randn(n_hidden).type(dtype))
        self.U = nn.Parameter(torch.randn(n_hidden, n_class).type(dtype))
        self.b = nn.Parameter(torch.randn(n_class).type(dtype))

    def forward(self, x):
        x = self.embed(x)  # 4 x 2 x 2
        x = x.view(-1, n_step * m)
        tanh = torch.tanh(self.d + torch.mm(x, self.W))  # 4 x 2
        output = self.b + torch.mm(tanh, self.U)
        return output
model = NNLM()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

input_batch, target_batch = make_batch(sentences)
input_batch = Variable(torch.LongTensor(input_batch))
target_batch = Variable(torch.LongTensor(target_batch))

for epoch in range(5000):
    optimizer.zero_grad()
    output = model(input_batch)  # input: 4 x 2
    loss = criterion(output, target_batch)
    if (epoch + 1) % 1000 == 0:
        print('epoch:', '%04d' % (epoch + 1), 'cost = {:.6f}'.format(loss.item()))
    loss.backward()
    optimizer.step()

predict = model(input_batch).data.max(1, keepdim=True)[1]
print([sen.split()[:2] for sen in sentences], '->', [number_dict[n.item()] for n in predict.squeeze()])
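A possible follow-up (my own sketch, not in the original notes): once trained, the model can be queried with any new two-word prefix whose words appear in word_dict, for example:

prefix = ['i', 'love']  # both words must be in word_dict
x = torch.LongTensor([[word_dict[w] for w in prefix]])  # shape 1 x n_step
pred = model(x).data.max(1)[1].item()                   # index of the most probable next word
print(prefix, '->', number_dict[pred])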
3. Limitations of neural language models
IV. Pre-trained Word Representations
1. Obtaining existing pre-trained word representations
Links:
https://www.jianshu.com/p/5ec168b2ce5c
https://huggingface.co/transformers/quicktour.html
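For example, ready-made pre-trained vectors can be fetched with gensim's downloader module (a hedged sketch; 'glove-wiki-gigaword-100' is just one model name available from gensim-data, not something taken from the links above):

import gensim.downloader as api

wv = api.load('glove-wiki-gigaword-100')   # downloads on first use, returns KeyedVectors
print(wv['coffee'].shape)                  # the raw 100-dimensional vector
print(wv.most_similar('coffee', topn=3))   # nearest neighbours in the vector space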
2. Using word representations
Input:
# -*- coding: utf-8 -*-
import jieba
import jieba.analyse

# Register the person names so jieba does not split them incorrectly.
jieba.suggest_freq('沙瑞金', True)
jieba.suggest_freq('田国富', True)
jieba.suggest_freq('高育良', True)
jieba.suggest_freq('侯亮平', True)
jieba.suggest_freq('钟小艾', True)
jieba.suggest_freq('陈岩石', True)
jieba.suggest_freq('欧阳菁', True)
jieba.suggest_freq('易学习', True)
jieba.suggest_freq('王大路', True)
jieba.suggest_freq('蔡成功', True)
jieba.suggest_freq('孙连城', True)
jieba.suggest_freq('季昌明', True)
jieba.suggest_freq('丁义珍', True)
jieba.suggest_freq('郑西坡', True)
jieba.suggest_freq('赵东来', True)
jieba.suggest_freq('高小琴', True)
jieba.suggest_freq('赵瑞龙', True)
jieba.suggest_freq('林华华', True)
jieba.suggest_freq('陆亦可', True)
jieba.suggest_freq('刘新建', True)
jieba.suggest_freq('刘庆祝', True)

with open('./in_the_name_of_people.txt', 'r', encoding='utf-8') as f:
    document = f.read()
    # document_decode = document.decode('GBK')
    document_cut = jieba.cut(document)
    # print(' '.join(document_cut))  # printing here would consume the generator, so result below would be empty
    result = ' '.join(document_cut)
    with open('./in_the_name_of_people_segment.txt', 'w', encoding='utf-8') as f2:
        f2.write(result)

# import modules & set up logging
import logging
import os
from gensim.models import word2vec

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
sentences = word2vec.LineSentence('./in_the_name_of_people_segment.txt')
model = word2vec.Word2Vec(sentences, hs=1, min_count=1, window=3, size=100)  # gensim 3.x; in gensim 4+ 'size' is named 'vector_size'
Training output:
2020-08-25 04:40:23,250 : INFO : collecting all words and their counts
2020-08-25 04:40:23,257 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-08-25 04:40:23,295 : INFO : collected 17878 word types from a corpus of 161343 raw words and 2311 sentences
2020-08-25 04:40:23,295 : INFO : Loading a fresh vocabulary
2020-08-25 04:40:23,348 : INFO : effective_min_count=1 retains 17878 unique words (100% of original 17878, drops 0)
2020-08-25 04:40:23,349 : INFO : effective_min_count=1 leaves 161343 word corpus (100% of original 161343, drops 0)
2020-08-25 04:40:23,383 : INFO : deleting the raw counts dictionary of 17878 items
2020-08-25 04:40:23,384 : INFO : sample=0.001 downsamples 38 most-common words
2020-08-25 04:40:23,384 : INFO : downsampling leaves estimated 120578 word corpus (74.7% of prior 161343)
2020-08-25 04:40:23,393 : INFO : constructing a huffman tree from 17878 words
2020-08-25 04:40:23,692 : INFO : built huffman tree with maximum node depth 17
2020-08-25 04:40:23,717 : INFO : estimated required memory for 17878 words and 100 dimensions: 33968200 bytes
2020-08-25 04:40:23,717 : INFO : resetting layer weights
2020-08-25 04:40:26,128 : INFO : training model with 3 workers on 17878 vocabulary and 100 features, using sg=0 hs=1 sample=0.001 negative=5 window=3
2020-08-25 04:40:26,263 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-08-25 04:40:26,265 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-08-25 04:40:26,273 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-08-25 04:40:26,274 : INFO : EPOCH - 1 : training on 161343 raw words (120329 effective words) took 0.1s, 831963 effective words/s
2020-08-25 04:40:26,413 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-08-25 04:40:26,415 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-08-25 04:40:26,422 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-08-25 04:40:26,423 : INFO : EPOCH - 2 : training on 161343 raw words (120484 effective words) took 0.1s, 821510 effective words/s
2020-08-25 04:40:26,566 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-08-25 04:40:26,569 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-08-25 04:40:26,572 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-08-25 04:40:26,573 : INFO : EPOCH - 3 : training on 161343 raw words (120772 effective words) took 0.1s, 810055 effective words/s
2020-08-25 04:40:26,707 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-08-25 04:40:26,709 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-08-25 04:40:26,725 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-08-25 04:40:26,725 : INFO : EPOCH - 4 : training on 161343 raw words (120451 effective words) took 0.2s, 799121 effective words/s
2020-08-25 04:40:26,861 : INFO : worker thread finished; awaiting finish of 2 more threads
2020-08-25 04:40:26,861 : INFO : worker thread finished; awaiting finish of 1 more threads
2020-08-25 04:40:26,873 : INFO : worker thread finished; awaiting finish of 0 more threads
2020-08-25 04:40:26,873 : INFO : EPOCH - 5 : training on 161343 raw words (120558 effective words) took 0.1s, 820281 effective words/s
2020-08-25 04:40:26,873 : INFO : training on a 806715 raw words (602594 effective words) took 0.7s, 808256 effective words/s
Input:
req_count = 5
for key in model.wv.similar_by_word('李达康', topn=100):
    if len(key[0]) == 3:
        req_count -= 1
        print(key[0], key[1])
        if req_count == 0:
            break
Output:
2020-08-25 04:41:15,769 : INFO : precomputing L2-norms of word weight vectors
赵东来 0.9700594544410706
侯亮平 0.9671393036842346
蔡成功 0.9631752371788025
陆亦可 0.9626104831695557
祁同伟 0.9611712098121643
Input:
req_count = 5
for key in model.wv.similar_by_word('赵东来', topn=100):
    if len(key[0]) == 3:
        req_count -= 1
        print(key[0], key[1])
        if req_count == 0:
            break
Output:
李达康 0.9700593948364258
祁同伟 0.9673236608505249
陆亦可 0.9645941257476807
易学习 0.9624121189117432
敢肯定 0.9594728350639343
Input:
req_count = 5
for key in model.wv.similar_by_word('高育良', topn=100):
    if len(key[0]) == 3:
        req_count -= 1
        print(key[0], key[1])
        if req_count == 0:
            break
Output:
沙瑞金 0.9710817337036133
侯亮平 0.9524528980255127
李达康 0.9302777647972107
陆亦可 0.9274569749832153
季昌明 0.9258953332901001
Input:
req_count = 5
for key in model.wv.similar_by_word('沙瑞金', topn=100):
    if len(key[0]) == 3:
        req_count -= 1
        print(key[0], key[1])
        if req_count == 0:
            break
Output:
高育良 0.9710817337036133
侯亮平 0.9542391300201416
易学习 0.9505879878997803
李达康 0.942894458770752
陆亦可 0.9374306797981262
Input:
print(model.wv.similarity('沙瑞金', '高育良'))
print(model.wv.similarity('李达康', '王大路'))
Output:
0.97108173
0.9553164
Input:
print(model.wv.doesnt_match(u"沙瑞金 高育良 李达康 刘庆祝".split()))
Output:
刘庆祝
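A natural last step (my addition, not part of the original notes) is to persist the trained model and read back a raw vector; model.save and Word2Vec.load are standard gensim calls, and the file path here is only a hypothetical example:

model.save('./in_the_name_of_people.w2v')                       # hypothetical output path
model2 = word2vec.Word2Vec.load('./in_the_name_of_people.w2v')  # reload the trained model
print(model2.wv['沙瑞金'].shape)                                # the 100-dimensional vector for 沙瑞金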