# -*- coding: utf-8 -*-
import numpy as np
import os
import time
from collections import defaultdict
# Configuration: fill these paths in before running the script.
test_dic="" # path of the dictionary (lexicon) file, whitespace-separated words
doc_test="" # path of the folder holding the documents to segment
doc_test_dic="" # path of the folder that receives the segmented output files
time_ori = time.time() # script start time, used for progress logging below
# Build the lexicon lookup table from the dictionary file.
def get_dic(test_dic):
    """Read a whitespace-separated dictionary file and return a
    ``defaultdict(int)`` mapping each unique word to 0.

    A dict (rather than a list) is used so membership tests during
    segmentation are O(1).

    The original wrapped the read in ``try/finally`` and called
    ``f.close()`` inside the ``with`` block — a redundant double close;
    the context manager alone handles closing.
    """
    with open(test_dic, 'r', encoding='utf-8') as f:
        words = f.read().split()
    d = defaultdict(int)
    # set() deduplicates; no need to materialize it as a list first.
    for word in set(words):
        d[word] = 0
    return d
# NOTE: use a dict for the lexicon — membership tests on a list would be very slow
dic = get_dic(test_dic)
def _mm_segment(line, word_dict, max_len):
    """Forward-maximum-match a single line; return the list of segments.

    Repeatedly takes the longest prefix (up to ``max_len`` chars) that is
    a dictionary word; unmatched single characters become segments of
    their own.
    """
    segments = []
    while line:
        piece = line[:max_len]
        # Shrink the candidate until it is a dictionary word or one char.
        while piece not in word_dict and len(piece) > 1:
            piece = piece[:-1]
        segments.append(piece)
        line = line[len(piece):]
    return segments

def readfile(word_dict=None, src_dir=None, dst_dir=None):
    """Segment every file in ``src_dir`` with forward maximum matching.

    Each input file is processed line by line and written to ``dst_dir``
    as ``<n>.txt`` (n = 1, 2, ...) with segments separated by spaces;
    newlines are preserved as-is.

    Parameters default to the module-level globals (``dic``,
    ``doc_test``, ``doc_test_dic``) so the original no-argument call
    ``readfile()`` keeps working unchanged.

    Fixes vs. the original: files are opened with context managers (the
    output handle leaked on exception), and the progress message reports
    the file just finished instead of the *next* count (the original
    incremented ``count`` before printing, so file 1 was logged as
    "第2个文本").
    """
    if word_dict is None:
        word_dict = dic
    if src_dir is None:
        src_dir = doc_test
    if dst_dir is None:
        dst_dir = doc_test_dic
    # Keep the original timing base (module import time) when available.
    base = globals().get('time_ori', time.time())
    # Longest dictionary entry bounds the FMM window size.
    max_len = max((len(w) for w in word_dict), default=0)
    for count, name in enumerate(os.listdir(src_dir), start=1):
        src = os.path.join(os.path.abspath(src_dir), name)
        # Segmented output is named by its simple sequence number.
        dst = os.path.join(os.path.abspath(dst_dir), str(count) + '.txt')
        with open(src, 'r', encoding='utf-8') as f_in:
            lines = f_in.readlines()
        with open(dst, 'w', encoding='utf-8') as f_out:
            for line in lines:
                for seg in _mm_segment(line, word_dict, max_len):
                    if seg == '\n':
                        f_out.write('\n')
                    else:
                        f_out.write(seg + ' ')
        print("第" + str(count) + "个文本,time=" + str(time.time() - base))
readfile() # segment every document in doc_test and write results to doc_test_dic
# --- article footer (non-code text from the scraped source page) ---
# Python / 中文分词 / 最大正向匹配分词 (forward maximum matching segmentation)
# 最新推荐文章于 2023-03-09 10:06:50 发布