ngram语言模型—基于Kneser Ney及Modified Kneser Ney平滑
参考NLTK源码编写的更加清爽的基于KneserNey平滑的 字粒度 ngram模型。
预处理
用到的库 以及 预处理语料。 清除所有符号,并分句,分词
import re
import zipfile
import lxml.etree
from collections import defaultdict
from math import log
from nltk.probability import ConditionalFreqDist, FreqDist
import joblib
def pre_data(path=r'D:\C\NLP\Data\ted_zh-cn-20160408.zip',
             member='ted_zh-cn-20160408.xml'):
    """Extract and clean the <content> text from the TED zh-cn XML dump.

    Pipeline: read the XML out of the zip, join all <content> text,
    strip parenthesised asides and speaker prefixes, split into
    sentences, drop punctuation / Latin letters / digits, and finally
    split every sentence into a list of single characters.

    @param path:   zip archive holding the TED dump (default keeps the
                   original hard-coded location for backward compatibility)
    @param member: name of the XML file inside the archive
    @return: list of sentences, each a list of single characters
    """
    with zipfile.ZipFile(path, 'r') as z:
        doc = lxml.etree.parse(z.open(member, 'r'))
    # Text of every <content> element, one transcript per line group.
    input_text = '\n'.join(doc.xpath('//content/text()'))
    del doc

    # Remove parenthesised asides such as "(Laughter)".
    # BUG FIX: the original second pattern r'([^)]*)' is a capture group
    # around [^)]* and therefore deletes almost everything; the literal
    # full-width parentheses （…） used in Chinese text were intended.
    text = re.sub(r'\([^)]*\)', '', input_text)
    text = re.sub(r'（[^）]*）', '', text)
    del input_text

    sentences = []
    for line in text.split('\n'):
        # Drop a short "speaker:" prefix (at most 20 chars before a colon),
        # then split the remainder into sentences on CJK end punctuation.
        m = re.match(r'^(?:(?P<precolon>[^:]{,20}):)?(?P<postcolon>.*)$', line)
        sentences.extend(
            sent for sent in re.split('[。?!]', m.groupdict()['postcolon'])
            if sent
        )

    # Keep only CJK content: strip punctuation, then Latin letters/digits.
    sentences = [re.sub(r'[^\w\s]', '', sent) for sent in sentences]
    sentences = [re.sub(r'[a-zA-Z0-9]', '', sent) for sent in sentences]
    sentences = filter(None, sentences)

    # Collapse internal whitespace so each sentence is one token, then
    # explode every sentence into a list of its characters.
    data = ' '.join(re.sub(r'\s', '', sent) for sent in sentences).split(' ')
    return [' '.join(sent).split(' ') for sent in data]
NGram 建模
其实就是统计所有 1-n 元 ngram
重点函数
self.counter[n][gram[:-1]][gram[-1]] 套嵌字典
n =[1:n] 代表几元组
[gram[:-1]] 代表当前元组的
[gram[-1]] 代表 $(w_{i-n+1},\dots,w_{i-1})$ 后存在的 $w'$
value 为当前元组 $(w_1, w_2, \dots, w_n)$ 的总数
例:self.counter=
3:[(a, b):[c:5, d:6, e:7], (a, c):[c:4, d:5, e:8, f:1, g:10]]
2:[(a):[b:6,c:6,d:7], (b):[y:1]]
1: a:23,b:21,c:10,d:20
class NGram:
    """Character-granular n-gram counter (Kneser-Ney smoothing hooks)."""

    def __init__(self, n):
        """Set up an n-gram model of maximum order *n*.

        Re-uses nltk's ConditionalFreqDist / FreqDist for the nested
        count tables instead of re-implementing them.
        See: from nltk.probability import ConditionalFreqDist, FreqDist

        @param n: highest n-gram order to collect
        """
        self.N = n
        # counter[k] maps a (k-1)-length context tuple to a FreqDist of
        # the tokens observed after that context.
        self.counter = defaultdict(ConditionalFreqDist)
        # Order-1 counts are a plain FreqDist, aliased as self.unigrams.
        self.unigrams = FreqDist()
        self.counter[1] = self.unigrams
def prepare(self, sents):
"""
准备数据 分句在分字,句子头尾增加<BOS><EOS>
@return:
"""
n = self.N
left = ['<BOS>']
right = ['<EOS>']
sents = list(left * (n - 1) + sent + right * (n - 1)for sent in sents)
return sents
def fit(self, sents):
    """Train by counting every 1..N gram over the padded sentences.

    self.counter[n][gram[:-1]][gram[-1]] is a nested table:
        n          -> the gram order (2..N)
        gram[:-1]  -> the context tuple (w_{i-n+1}, ..., w_{i-1})
        gram[-1]   -> a token w' observed after that context
    Unigram counts go into self.unigrams instead.
    Example:
        3: [(a, b): [c, d, e], (a, c): [c, d, e]]
        2: [(a,): [b, c, d]]
        1: a, b, c, d

    @param sents: character-granular sentences, e.g.
                  [[1,2,3,4,5],[6,7,8,9],[10,11]]
    @return: None
    """
    padded = self.prepare(sents)
    for order in range(1, self.N + 1):
        for sent in padded:
            for start in range(len(sent) - order + 1):
                gram = tuple(sent[start:start + order])
                if order == 1:
                    self.unigrams[gram[0]] += 1
                else:
                    self.counter[order][gram[:-1]][gram[-1]] += 1
    self.d()  # precompute discounts, used by modified_alpha_gamma
KneserNey 平滑
bigram 的插值 Interpolation Kneser-Ney Smoothing 公式
$$P_{KN}(w_i \mid w_{i-1}) = \frac{\max(C(w_{i-1} w_i) - d,\; 0)}{C(w_{i-1})} + \gamma(w_{i-1}) \, P_{\text{continuation}}(w_i)$$
$\max(C(w_{i-1} w_i) - d,\; 0)$ 的目的是对 ngram 计数减去 $d$ 后小于 0 的值取 0,避免成为负数。
$\gamma$ 为正则化常量。$|\{w : C(w_{i-1}, w) > 0\}|$ 为统计满足 $C(w_{i-1}, w) > 0$ 的样本数。
$$\gamma(w_{i-1}) = \frac{d}{C(w_{i-1})} \, |\{w : C(w_{i-1}, w) > 0\}|$$
泛化的通用公式为:
$$P_{KN}(w_i \mid w_{i-n+1} \cdots w_{i-1}) = \frac{\max(C_{KN}(w_{i-n+1} \cdots w_i) - d,\; 0)}{C_{KN}(w_{i-n+1} \cdots w_{i-1})} + \gamma(w_{i-n+1} \cdots w_{i-1}) \cdot P_{KN}(w_i \mid w_{i-n+2} \cdots w_{i-1})$$