1.《Beautiful Data》中的例子。由于没有中文语料库，故用英文串代替，思路一样（如将 finallylast 切分成 ['finally', 'last']）
2.代码切分模块
![](https://i-blog.csdnimg.cn/blog_migrate/8f900a89c6347c561fdf2122f13be562.gif)
![ExpandedBlockStart.gif](https://i-blog.csdnimg.cn/blog_migrate/961ddebeb323a10fe0623af514929fc1.gif)
import math
import operator
# Cache of already-solved suffixes.  Without it the mutual recursion
# re-segments the same remainders over and over, giving exponential
# running time; with it each distinct suffix is solved once.
_segment_memo = {}

def segment(text):
    """Return the most probable list of words segmenting *text*.

    Each (first, rest) cut from splits() is scored by Pwords() against the
    recursive segmentation of the remainder, and the best-scoring candidate
    wins.  An empty string segments to [].
    """
    if not text:
        return []
    if text not in _segment_memo:
        candidates = ([first] + segment(rest) for first, rest in splits(text))
        _segment_memo[text] = max(candidates, key=Pwords)
    # Hand back a copy so callers cannot mutate the cached result.
    return list(_segment_memo[text])
def splits(text, L=20):
    """Return all (first, rem) cuts of *text* with 1 <= len(first) <= L."""
    limit = min(len(text), L)
    return [(text[:n], text[n:]) for n in range(1, limit + 1)]
def Pwords(words):
    """Score *words* under a naive-Bayes unigram model.

    The sequence probability is simply the product of the individual
    word probabilities given by Pw.
    """
    per_word = (Pw(w) for w in words)
    return product(per_word)
def product(nums):
    """Return the product of the numbers in *nums* (1 for an empty sequence).

    Uses math.prod: the original called bare ``reduce``, which is not a
    builtin in Python 3 (it moved to functools.reduce).
    """
    return math.prod(nums)
class Pdist(dict):
    """A probability distribution estimated from counts in a data file.

    Fixes over the original: the mutable default argument ``data=[]``
    (shared between calls) becomes an empty tuple, Python-2-only
    ``dict.itervalues()`` becomes ``values()``, and the blog-garbled
    float literal ``1 .`` is written as ``1.0``.
    """

    def __init__(self, data=(), N=None, missingfn=None):
        """Accumulate counts from *data*.

        data      -- iterable of (key, count) pairs; count may be a string.
        N         -- total token count; defaults to the sum of all counts.
        missingfn -- callable (key, N) -> probability for unseen keys;
                     defaults to the uniform estimate 1/N.
        """
        for key, count in data:
            self[key] = self.get(key, 0) + int(count)
        # float() keeps N real so key-count / N divides as a probability.
        self.N = float(N or sum(self.values()))
        self.missingfn = missingfn or (lambda k, N: 1.0 / N)

    def __call__(self, key):
        """Return P(key): observed frequency, or missingfn for unseen keys."""
        if key in self:
            return self[key] / self.N
        return self.missingfn(key, self.N)
def datafile(name, sep='\t'):
    """Yield [key, value] pairs, one per line of file *name*, split on *sep*.

    Fixes: the Python-2-only ``file()`` builtin is replaced by ``open()``
    inside a ``with`` block so the handle is closed, and the separator
    default is restored to a plain tab -- the blog-garbled ``' \\t '``
    (tab padded with spaces) would never match count_1w.txt's format.
    Note the trailing newline stays on the value, as in the original;
    Pdist's int() tolerates it.
    """
    with open(name) as handle:
        for line in handle:
            yield line.split(sep)
def avoid_long_words(key, N):
    """Estimate the probability of an unknown word, penalizing length.

    Each extra character divides the estimate by 10, so long unseen
    strings are strongly discouraged as single "words".  Fixes the
    blog-garbled float literal ``10 .`` (a syntax error) to ``10.0``.
    """
    return 10.0 / (N * 10 ** len(key))
N = 1024908267229  # total token count of the Google unigram corpus sample
# Unigram model: counts loaded from count_1w.txt; unseen words scored by
# avoid_long_words.  The blog rendering had spaces between ``r`` and the
# quote of the raw-string path, which is a syntax error -- restored here.
Pw = Pdist(datafile(r'C:\Python26\Myngrams\count_1w.txt'), N, avoid_long_words)
# Memo table mapping a suffix to its best segmentation.  The plain
# recursion explores every split of every suffix repeatedly and is
# exponential; caching makes each suffix cost one pass over its splits.
_seg_cache = {}

def segment(text):
    """Return the best (highest Pwords probability) word list for *text*.

    Considers every prefix/remainder cut from splits(), segments the
    remainder recursively, and keeps the maximum-probability candidate.
    [] is returned for empty input.
    """
    if not text:
        return []
    if text not in _seg_cache:
        options = ([head] + segment(tail) for head, tail in splits(text))
        _seg_cache[text] = max(options, key=Pwords)
    # Copy so a caller mutating the result does not corrupt the cache.
    return list(_seg_cache[text])
def splits(text, L=20):
    """Enumerate every (first, rem) pair of *text* where len(first) <= L."""
    pairs = []
    for n in range(min(len(text), L)):
        pairs.append((text[:n + 1], text[n + 1:]))
    return pairs
def Pwords(words):
    """Naive-Bayes probability of a word sequence: product of Pw(w) terms."""
    return product(map(Pw, words))
def product(nums):
    """Multiply a sequence of numbers together; an empty sequence gives 1.

    The original used bare ``reduce(operator.mul, ...)``, a Python-2-only
    builtin; math.prod is the Python 3 stdlib equivalent.
    """
    return math.prod(nums)
class Pdist(dict):
    """Probability distribution built from (key, count) pairs.

    Repairs applied to the blog listing: ``data=[]`` mutable default
    replaced with a tuple, Python-2 ``itervalues()`` replaced with
    ``values()``, and the broken literal ``1 . / N`` written ``1.0 / N``.
    """

    def __init__(self, data=(), N=None, missingfn=None):
        """Tally *data*; *N* (or the count total) normalizes probabilities.

        missingfn -- (key, N) -> probability for keys not in the table;
                     defaults to the uniform 1/N estimate.
        """
        for key, count in data:
            self[key] = self.get(key, 0) + int(count)
        self.N = float(N or sum(self.values()))
        self.missingfn = missingfn or (lambda k, N: 1.0 / N)

    def __call__(self, key):
        """Probability of *key*; unseen keys are delegated to missingfn."""
        if key in self:
            return self[key] / self.N
        return self.missingfn(key, self.N)
def datafile(name, sep='\t'):
    """Read *name* line by line, yielding each line split on *sep*.

    Replaces Python 2's ``file()`` with ``open()`` under ``with`` (so the
    handle is always closed) and fixes the garbled ``' \\t '`` default
    separator back to a bare tab, matching the count_1w.txt layout.
    The newline is left attached to the count field, as before.
    """
    with open(name) as fh:
        for line in fh:
            yield line.split(sep)
def avoid_long_words(key, N):
    """Unknown-word probability that decays by a factor of 10 per character.

    Restores the garbled ``10 .`` literal (a syntax error in the blog
    rendering) to ``10.0``.
    """
    return 10.0 / (N * 10 ** len(key))
N = 1024908267229  # corpus size (number of tokens) behind count_1w.txt
# Build the unigram distribution from the on-disk count file; the stray
# spaces inside the raw-string path in the blog text were a syntax error
# and are removed here.
Pw = Pdist(datafile(r'C:\Python26\Myngrams\count_1w.txt'), N, avoid_long_words)
注意：在 Myngrams 目录下添加一个空的 __init__.py，使其成为可导入的 Python 包
3.验证
from Myngrams import Mysegment
Mysegment.segment('finallylast')
['finally', 'last']
Mysegment.segment('unregardedsitdown')
['un', 'regarded', 'sitdown']
由于训练语料中没有 unregarded 这个词，加上把 sitdown 当成一个词的概率 > P(sit)·P(down)，所以这个结果切分错了。可考虑改用二元语法（bigram）分词。