最大匹配分词
任务:
使用最大匹配算法、字典文件(corpus.dict),对语料(corpus.sentence)进行分词,将分词的结果输出到文件corpus.out中;对比corpus.answer和corpus.out,给出算法的P/R/F指标
- 确定文件路径
# NOTE: the original `path = print(os.getcwd())` assigned None -- print() has no
# return value.  Assign the working directory itself.
path = os.getcwd()

# -- read the corpus sentences --
with open(os.path.join(path, 'data', 'corpus.sentence.txt'), 'r', encoding='utf-8-sig') as sentence:
    # One sentence per line.  rstrip('\n') instead of line[:-1]: slicing off the
    # last character corrupts the final line when the file has no trailing newline.
    lst = [line.rstrip('\n') for line in sentence]

# -- read the dictionary --
with open(os.path.join(path, 'data', 'corpus.dict.txt'), 'r', encoding='utf-8-sig') as d:
    # A set gives O(1) membership tests in the matching loop (a list is O(n) per probe).
    dic = {line.rstrip('\n') for line in d}

# -- core: forward maximum matching --
for item in lst:                            # each sentence of the corpus
    start = 0                               # left edge of the current window
    ans = []                                # tokens produced for this sentence
    while start < len(item):
        # Window capped at the maximum word length (10 here); min() keeps the
        # right edge inside the sentence, so no index can go out of range.
        end = min(start + 10, len(item))
        # Shrink the window until it matches a dictionary word, or until only
        # a single character remains.
        while end > start + 1 and item[start:end] not in dic:
            end -= 1
        # A single character not in the dictionary is emitted as its own token.
        ans.append(item[start:end])
        start = end
完整代码:
import os
from collections import Counter


def _read_lines(file_path):
    """Read a UTF-8(-sig) text file; return its lines without trailing newlines.

    Uses rstrip('\\n') instead of the original line[:-1], which dropped the
    last real character when the file did not end with a newline.
    """
    with open(file_path, 'r', encoding='utf-8-sig') as fh:
        return [line.rstrip('\n') for line in fh]


def max_match_segment(sentence, dictionary, max_len=10):
    """Segment *sentence* by forward maximum matching against *dictionary*.

    The window starts at *max_len* characters and shrinks until it matches a
    dictionary entry or a single character remains; characters covered by no
    dictionary word are emitted as single-character tokens.  Returns the list
    of tokens (empty list for an empty sentence).
    """
    tokens = []
    start = 0
    n = len(sentence)
    while start < n:
        # min() keeps the right edge inside the sentence -- no index can
        # go out of range, which the original guarded with if/else.
        end = min(start + max_len, n)
        while end > start + 1 and sentence[start:end] not in dictionary:
            end -= 1
        tokens.append(sentence[start:end])
        start = end
    return tokens


def evaluate(segmented, gold):
    """Score *segmented* against the *gold* segmentation.

    A predicted word counts as correct when it occurs anywhere in the gold
    segmentation of the same sentence -- the original scoring scheme,
    preserved here (it is not position-aligned).  Returns
    (precision, recall, f1, right, out_total, gold_total, wrong) where
    *wrong* is a Counter of mis-segmented words.
    """
    out_total = sum(len(s) for s in segmented)      # words produced
    gold_total = sum(len(s) for s in gold)          # words expected
    right = 0
    wrong = Counter()                               # replaces the hand-rolled dict
    for pred_sent, gold_sent in zip(segmented, gold):
        gold_words = set(gold_sent)                 # O(1) membership per word
        for word in pred_sent:
            if word in gold_words:
                right += 1
            else:
                wrong[word] += 1
    # Guard against empty corpora; the original divided unconditionally.
    precision = right / out_total if out_total else 0.0
    recall = right / gold_total if gold_total else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    return precision, recall, f1, right, out_total, gold_total, wrong


def main():
    """Segment the corpus, write corpus.out.txt, and report P/R/F metrics."""
    path = os.path.dirname(os.path.abspath(__file__))   # folder containing this script
    # os.path.join is portable; the original hard-coded '\\' separators.
    data = os.path.join(path, 'data')

    sentences = _read_lines(os.path.join(data, 'corpus.sentence.txt'))
    dict_lines = _read_lines(os.path.join(data, 'corpus.dict.txt'))

    # The first dictionary line carries the maximum word length after a tab
    # (presumably "<header>\t<max length>" -- confirm against the data file).
    # The original parsed it into `num` but then hard-coded 10; use the parsed
    # value, falling back to 10 when the header is absent or malformed.
    try:
        max_len = int(dict_lines[0].split('\t')[1])
    except (IndexError, ValueError):
        max_len = 10

    # A set makes the inner-loop membership test O(1) instead of a list scan.
    dictionary = set(dict_lines)

    segmented = [max_match_segment(s, dictionary, max_len) for s in sentences]
    with open(os.path.join(data, 'corpus.out.txt'), 'w', encoding='utf-8-sig') as out:
        out.writelines(' '.join(tokens) + '\n' for tokens in segmented)

    gold = [line.split() for line in _read_lines(os.path.join(data, 'corpus.answer.txt'))]
    precision, recall, f1, right, out_total, gold_total, wrong = evaluate(segmented, gold)

    with open(os.path.join(data, 'corpus.评价结果.txt'), 'w', encoding='utf-8-sig') as judge:
        judge.write('正确率Precision = ' + str(right) + '/' + str(out_total) + ' = '
                    + '{0:.2f}'.format(precision * 100) + '%' + '\n')
        judge.write('召回率Recall = ' + str(right) + '/' + str(gold_total) + ' = '
                    + '{0:.2f}'.format(recall * 100) + '%' + '\n')
        judge.write('F值 = ' + '{0:.2f}'.format(f1 * 100) + '%' + '\n')
        judge.write('\n')
        judge.write("错误分词及其出现次数为:\n")
        # most_common() yields words sorted by descending count, matching the
        # original sorted(..., key=lambda ob: -ob[1]).
        for word, count in wrong.most_common():
            judge.write(word + '\t' + str(count) + '\n')


if __name__ == '__main__':
    main()
注:此实现将字典存为列表、逐项线性匹配,因此效率较低;但在已知词典对语料覆盖较好的前提下,最大匹配分词的准确率较高。