1。首先建立词典。此处词典要理解为:对训练语料库中的词进行词频等信息的统计后形成的数据结构,和“新华字典”中的字典意义不一样。我的实现中建立了两个词典:“单词”词典统计每个词的出现次数,“双词”词典统计每两个词连续出现的次数(因为采用的是二元语法模型)。然后又分别对单词词典,双词词典形成一级Trie树,或者叫做“带首字索引”的字典。
代码如下
从训练语料库建立“单词”词典
#
-*- coding: cp936 -*-
import re
import cPickle as mypickle
def datafile(name,sep = ' | ' ):
for line in file(name):
yield line.split(sep)
candidates = datafile(r ' C:\Python26\Bigramwordsegemtation\data\training.txt ' )
p1 = re.compile( ' (^\s+|\s+$) ' )
p2 = re.compile( ' \d ' )
# p3=re.compile('\s+')
mySingleWordDict = {}
# myDoubleWordDict={}
for m in candidates:
# singleline=[]
for e in m:
e = p1.sub( '' ,e)
if p2.match(e):
# e=p3.sub('_',e)
mySingleWordDict[e] = float(mySingleWordDict.get(e,0) + 1 )
print ' 词为%s,个数为%s ' % (e,mySingleWordDict[e])
N = sum(mySingleWordDict.itervalues())
for key in mySingleWordDict.iterkeys():
mySingleWordDict[key] = mySingleWordDict[key] / N
# for m in mySingleWordDict.iteritems():
# print m
fid = file( ' SingleWordDictionaryCrossValidation.dat ' , ' w ' )
mypickle.dump(mySingleWordDict,fid)
fid.close()
print ' finish '
print N
import re
import cPickle as mypickle
def datafile(name,sep = ' | ' ):
for line in file(name):
yield line.split(sep)
candidates = datafile(r ' C:\Python26\Bigramwordsegemtation\data\training.txt ' )
p1 = re.compile( ' (^\s+|\s+$) ' )
p2 = re.compile( ' \d ' )
# p3=re.compile('\s+')
mySingleWordDict = {}
# myDoubleWordDict={}
for m in candidates:
# singleline=[]
for e in m:
e = p1.sub( '' ,e)
if p2.match(e):
# e=p3.sub('_',e)
mySingleWordDict[e] = float(mySingleWordDict.get(e,0) + 1 )
print ' 词为%s,个数为%s ' % (e,mySingleWordDict[e])
N = sum(mySingleWordDict.itervalues())
for key in mySingleWordDict.iterkeys():
mySingleWordDict[key] = mySingleWordDict[key] / N
# for m in mySingleWordDict.iteritems():
# print m
fid = file( ' SingleWordDictionaryCrossValidation.dat ' , ' w ' )
mypickle.dump(mySingleWordDict,fid)
fid.close()
print ' finish '
print N
从训练语料库建立“双词”词典
#
-*- coding: cp936 -*-
import re
import cPickle as mypickle
delimiter = ' | '
def datafile(name,sep = ' | ' ):
''' use generator to create a iterable object '''
for line in file(name):
yield line.split(sep)
candidates = datafile(r ' c:\python26\Bigramwordsegemtation\data\training.txt ' )
p1 = re.compile( ' (^\s+|\s+$) ' )
p2 = re.compile( ' \d ' )
myDoubleWordDict = {}
for m in candidates:
singleline = []
for e in m:
e = p1.sub( '' ,e)
if p2.match(e):
singleline.append(e)
if len(singleline) >= 2 :
initial = singleline[0] + delimiter + ' S '
myDoubleWordDict[initial] = float(myDoubleWordDict.get(initial,0) + 1 )
print ' 词为%s,个数为%s ' % (initial,myDoubleWordDict[initial])
for i in range(0,len(singleline) - 1 ):
c = delimiter.join(singleline[i:i + 2 ])
myDoubleWordDict[c] = float(myDoubleWordDict.get(c,0) + 1 )
print ' 词为%s,个数为%s ' % (c,myDoubleWordDict[c])
N = sum(myDoubleWordDict.itervalues())
for key in myDoubleWordDict.iterkeys():
myDoubleWordDict[key] = myDoubleWordDict[key] / N
# for m in myDoubleWordDict.iteritems():
# print m
# print N
fid = file( ' DoubleWordDictionaryCrossValidation2.dat ' , ' w ' )
mypickle.dump(myDoubleWordDict,fid)
fid.close()
print ' finish '
print N
import re
import cPickle as mypickle
delimiter = ' | '
def datafile(name,sep = ' | ' ):
''' use generator to create a iterable object '''
for line in file(name):
yield line.split(sep)
candidates = datafile(r ' c:\python26\Bigramwordsegemtation\data\training.txt ' )
p1 = re.compile( ' (^\s+|\s+$) ' )
p2 = re.compile( ' \d ' )
myDoubleWordDict = {}
for m in candidates:
singleline = []
for e in m:
e = p1.sub( '' ,e)
if p2.match(e):
singleline.append(e)
if len(singleline) >= 2 :
initial = singleline[0] + delimiter + ' S '
myDoubleWordDict[initial] = float(myDoubleWordDict.get(initial,0) + 1 )
print ' 词为%s,个数为%s ' % (initial,myDoubleWordDict[initial])
for i in range(0,len(singleline) - 1 ):
c = delimiter.join(singleline[i:i + 2 ])
myDoubleWordDict[c] = float(myDoubleWordDict.get(c,0) + 1 )
print ' 词为%s,个数为%s ' % (c,myDoubleWordDict[c])
N = sum(myDoubleWordDict.itervalues())
for key in myDoubleWordDict.iterkeys():
myDoubleWordDict[key] = myDoubleWordDict[key] / N
# for m in myDoubleWordDict.iteritems():
# print m
# print N
fid = file( ' DoubleWordDictionaryCrossValidation2.dat ' , ' w ' )
mypickle.dump(myDoubleWordDict,fid)
fid.close()
print ' finish '
print N
从词典建立一级Trie树
#
-*- coding: cp936 -*-
import re
import cPickle as mypickle
p = re.compile( ' \d+ ' )
myDict = mypickle.load(file( ' DoubleWordDictionaryCrossValidation.dat ' ))
myTrie = {}
# 二级Trie树,词按首字归类
for key in myDict.iterkeys():
tmp = p.findall(key)
if myTrie.get(tmp[0]) == None:
myTrie[tmp[0]] = {}
for (key,val) in myDict.iteritems():
tmp = p.findall(key)
myTrie[tmp[0]][key] = val
print ' 一级键%s二级键%s值%s ' % (tmp[0],key,val)
fid = file( ' myDoubleWordTrieCrossValidation.dat ' , ' w ' )
mypickle.dump(myTrie,fid)
fid.close()
print ' finish '
import re
import cPickle as mypickle
p = re.compile( ' \d+ ' )
myDict = mypickle.load(file( ' DoubleWordDictionaryCrossValidation.dat ' ))
myTrie = {}
# 二级Trie树,词按首字归类
for key in myDict.iterkeys():
tmp = p.findall(key)
if myTrie.get(tmp[0]) == None:
myTrie[tmp[0]] = {}
for (key,val) in myDict.iteritems():
tmp = p.findall(key)
myTrie[tmp[0]][key] = val
print ' 一级键%s二级键%s值%s ' % (tmp[0],key,val)
fid = file( ' myDoubleWordTrieCrossValidation.dat ' , ' w ' )
mypickle.dump(myTrie,fid)
fid.close()
print ' finish '
下一部分是主算法模块。在主算法模块中,我们使用的数据结构为“单词”的一级Trie树词典与“双词”的一级Trie树词典。