文本自动分词
基于正向最大匹配算法对文本进行分词
最大匹配法:最大匹配是指以词典为依据,取词典中最长单词的字数作为第一次取字的数量,对扫描串在词典中进行扫描(为提升扫描效率,还可以根据字数多少设计多个词典,然后根据字数分别从不同词典中进行扫描)。例如:词典中最长词为“中华人民共和国”共7个汉字,则最大匹配起始字数为7个汉字。然后逐字递减,在对应的词典中进行查找。
python代码如下:
import time
import datetime
# Input/output paths for the forward maximum-matching pass.
test_file = '词典.txt' # dictionary file
test_file2 = '分词测试(1).txt' # text to be segmented
test_file3 = '正向分词结果.txt'  # output: forward segmentation result
#def get_dic(test_file):
# with open(test_file,'r',encoding='gb18030',) as f:
# try:
# file_content = f.read().split()
# finally:
# f.close()
# chars = list(set(file_content))
# return chars
# Load the entire dictionary file as one raw string; the segmenter below
# tests candidate words with substring membership against this text.
with open(test_file, 'r', encoding='gb18030') as dict_fh:
    dic = dict_fh.read()
def readfile(test_file2):
    """Segment *test_file2* using forward maximum matching and write the
    result to the module-level output path *test_file3*, one space after
    each word.

    Relies on module globals:
      - ``dic``: the dictionary text (candidate words are looked up with
        substring membership);
      - ``test_file3``: output file path.

    Prints start/end timestamps, elapsed time, token count and throughput.
    """
    max_length = 5  # longest candidate word, in characters
    num = 0         # total number of emitted tokens
    # Both files are managed by `with`, so they are closed even on error
    # (the original leaked `h` on an exception and double-closed `g`).
    with open(test_file3, 'w', encoding='gb18030') as h, \
         open(test_file2, 'r', encoding='gb18030') as g:
        lines = g.readlines()
        start = time.time()
        print(datetime.datetime.now())
        for line in lines:
            my_list = []
            while line:
                # Take the longest slice first, then shrink it one character
                # at a time until it is in the dictionary or only a single
                # character remains (which is emitted as-is).
                tryWord = line[0:max_length]
                while tryWord not in dic and len(tryWord) > 1:
                    tryWord = tryWord[:-1]
                my_list.append(tryWord)
                line = line[len(tryWord):]
            for t in my_list:
                num += 1
                if t == '\n':
                    h.write('\n')
                else:
                    # NOTE(fix): the original `elif t in range(9)` could never
                    # be true (a str never equals an int), so the no-space
                    # branch for digits was unreachable; restore the evident
                    # intent of writing digit tokens without a trailing space.
                    h.write(t if t.isdigit() else t + " ")
        end = time.time()
        print(datetime.datetime.now())
    print('运行时间为:%.2f' %(end - start))
    print('分词数量为:',num)
    # Guard against division by zero on very small/fast inputs.
    p = float(num/(end-start)) if end > start else 0.0
    print('效率为:%.2f s/k' %(p))
# Run the forward maximum-matching segmentation on the test file.
readfile(test_file2)
正向分词结果:
基于反向最大匹配算法对文本进行分词
# Input/output paths for the backward maximum-matching pass.
test_file = '搜狗标准词库.txt' # dictionary file (Sogou standard lexicon)
test_file2 = '分词测试(1).txt' # text to be segmented
test_file3 = '反向分词结果.txt'  # output: backward segmentation result
def get_dic(test_file):
    """Read a gb18030-encoded dictionary file and return its unique
    whitespace-separated words as a list.

    Note: the order of the returned list is unspecified because the
    words are de-duplicated through a set.
    """
    # `with` already guarantees the file is closed; the original
    # try/finally + explicit f.close() was redundant (double close).
    with open(test_file, 'r', encoding='gb18030') as f:
        file_content = f.read().split()
    return list(set(file_content))
# Rebuild `dic` as a list of unique words (replaces the raw-string
# dictionary used by the forward pass earlier in the file).
dic = get_dic(test_file)
def readfile(test_file2):
max_length = 5
h = open(test_file3, 'w', encoding='gb18030', )
with open(test_file2, 'r', encoding='gb18030', ) as g:
lines = g.readlines()
for line in lines:
my_stack = []
len_hang = len(line)
while len_hang > 0 :
tryWord = line[-max_length:]
while tryWord