# -*- coding: utf-8 -*-
import time
# Default dictionary file loaded by readDic(); it is read as GBK text with one
# tab-separated "word<TAB>integer" entry per line.
dictionaryfilename = "../data/dic.txt"
# Default corpus file segmented by FMM() and BMM(); it is read as GBK text.
inputfilename = "../data/199801_sent.txt"
# Default output file written by BMM().
BMMfilename = "../data/seg_BMM2.txt"
# Default output file written by FMM().
FMMfilename = "../data/seg_FMM.txt"
def readDic(dicpath=dictionaryfilename):
    """Load the word dictionary from a GBK-encoded, tab-separated text file.

    Every non-blank line must contain a word, a tab, and an integer. Blank
    lines (for example a trailing empty line) are skipped instead of
    aborting the load.

    Args:
        dicpath: Path of the dictionary file.

    Returns:
        A ``(singleDic, tripleDic)`` tuple: ``singleDic`` is the list of words
        in file order and ``tripleDic`` is the list of ``(word, int)`` pairs
        in the same order.

    Raises:
        IndexError: A non-blank line has no tab-separated second field.
        ValueError: The second field of a line is not an integer.
    """
    tripleDic = []
    singleDic = []
    # The context manager closes the file even when a malformed line raises.
    with open(dicpath, 'r', encoding='gbk') as fd:
        for line in fd:
            if not line.strip():
                continue
            words = line.split("\t")
            # int() tolerates the trailing newline left on the last field.
            triple = (words[0], int(words[1]))
            singleDic.append(words[0])
            tripleDic.append(triple)
    return singleDic, tripleDic
def isExist(dic, word):
    """Report whether *word* is present in *dic* using binary search.

    Args:
        dic: Indexable sequence sorted in ascending order. The search relies
            on that ordering; results are unreliable for unsorted input.
        word: Value to look for.

    Returns:
        True if an element equal to *word* is found, otherwise False. False
        is also returned when an element cannot be ordered against *word*
        (``TypeError``) rather than propagating the error.
    """
    low = 0
    high = len(dic) - 1
    while low <= high:
        # Integer division keeps the midpoint exact for any list size.
        mid = (low + high) // 2
        candidate = dic[mid]
        if candidate == word:
            return True
        try:
            search_left = candidate > word
        except TypeError:
            # Incomparable types: the word cannot be in a consistently
            # ordered sequence, so report "not found".
            return False
        if search_left:
            high = mid - 1
        else:
            low = mid + 1
    return False
def _fmm_segment(text, vocabulary, max_word_len):
    """Split *text* left to right, always taking the longest dictionary word.

    At each position the window starts at ``max_word_len`` characters and
    shrinks from the right until it is in *vocabulary* or is one character.
    """
    tokens = []
    pos = 0
    size = len(text)
    while pos < size:
        stop = min(pos + max_word_len, size)
        while stop - pos > 1 and text[pos:stop] not in vocabulary:
            stop -= 1
        tokens.append(text[pos:stop])
        pos = stop
    return tokens


def FMM(dic, fmmpath=FMMfilename, inputpath=inputfilename):
    """Segment a corpus with forward maximum matching and write the result.

    The first 19 characters of every non-empty line are copied as one token
    without segmentation (presumably a sentence identifier -- confirm against
    the corpus format). The rest of the line is segmented greedily from the
    left. Each token is written followed by ``/``, one output line per input
    line; a blank input line produces an empty output line.

    Args:
        dic: Iterable of dictionary words (strings). It is not modified.
        fmmpath: Output file path; opened for writing with the platform
            default text encoding.
        inputpath: GBK-encoded input corpus path.

    Returns:
        None. Two timing lines are printed to standard output.
    """
    prefix_len = 19

    start = time.process_time()
    # A set gives constant-time membership tests; scanning a list for every
    # candidate substring is linear in the dictionary size.
    vocabulary = set(dic)
    longest = max((len(word) for word in vocabulary), default=0)
    # A window of at least one character guarantees progress even when the
    # dictionary is empty.
    max_word_len = max(longest, 1)
    end = time.process_time()
    print("建立词典耗时", end - start)

    with open(inputpath, 'r', encoding='gbk') as fi, open(fmmpath, 'w') as fo:
        start = time.process_time()
        for line in fi:
            # Drop only the line terminator so that a final line without a
            # trailing newline keeps its last token.
            text = line.rstrip('\n')
            if text:
                tokens = [text[:prefix_len]]
                tokens.extend(
                    _fmm_segment(text[prefix_len:], vocabulary, max_word_len)
                )
                fo.write(''.join(token + '/' for token in tokens))
            fo.write('\n')
        end = time.process_time()
    print("正向最大匹配耗时", end - start)
def _bmm_segment(text, vocabulary, max_word_len):
    """Split *text* right to left, always taking the longest dictionary word.

    At each position the window starts at ``max_word_len`` characters and
    shrinks from the left until it is in *vocabulary* or is one character.
    Tokens are returned in reading (left-to-right) order.
    """
    tokens = []
    stop = len(text)
    while stop > 0:
        begin = max(stop - max_word_len, 0)
        while stop - begin > 1 and text[begin:stop] not in vocabulary:
            begin += 1
        tokens.append(text[begin:stop])
        stop = begin
    tokens.reverse()
    return tokens


def BMM(dic,bmmpath=BMMfilename,inputpath=inputfilename):
    """Segment a corpus with backward maximum matching and write the result.

    The first 19 characters of every non-empty line are copied as one token
    without segmentation (presumably a sentence identifier -- confirm against
    the corpus format). The rest of the line is segmented greedily from the
    right, and the tokens are written in reading order, each followed by
    ``/``. Blank input lines produce no output line.

    Args:
        dic: Iterable of dictionary words (strings). It is not modified.
        bmmpath: Output file path; opened for writing with the platform
            default text encoding.
        inputpath: GBK-encoded input corpus path.

    Returns:
        None. Two timing lines are printed to standard output.
    """
    prefix_len = 19

    start = time.process_time()
    # A set gives constant-time membership tests; scanning a list for every
    # candidate substring is linear in the dictionary size.
    vocabulary = set(dic)
    longest = max((len(word) for word in vocabulary), default=0)
    # A window of at least one character guarantees progress even when the
    # dictionary is empty.
    max_word_len = max(longest, 1)
    end = time.process_time()
    print("建立词典耗时", end - start)

    with open(inputpath, 'r', encoding='gbk') as fi, open(bmmpath, 'w') as fo:
        start = time.process_time()
        for line in fi:
            # Drop only the line terminator so that a final line without a
            # trailing newline keeps its last character.
            text = line.rstrip('\n')
            if not text:
                continue
            tokens = [text[:prefix_len]]
            # Only the part after the prefix is segmented, so the matching
            # window can never reach back into the prefix.
            tokens.extend(
                _bmm_segment(text[prefix_len:], vocabulary, max_word_len)
            )
            fo.write(''.join(token + '/' for token in tokens))
            fo.write('\n')
        end = time.process_time()
    print("反向最大匹配耗时", end - start)
if __name__ == '__main__':
    # Script entry point: load the default dictionary once, then run the
    # forward and the backward segmenter on the default corpus. Only the
    # plain word list returned by readDic() is needed here.
    word_list = readDic()[0]
    for run_segmenter in (FMM, BMM):
        run_segmenter(word_list)
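# NOTE(review): the two lines below appear to be page text captured together
# with the code rather than Python; they are kept here as comments.
# 3.2 FMM & GMM
# Latest recommended article, published 2024-04-30 19:24:04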