# -*- coding: utf-8 -*-
import numpy as np
import os
import time
from collections import defaultdict
# Configuration: fill these paths in before running the script.
test_dic="" # path of the dictionary (lexicon) file, whitespace-separated words
doc_test="" # path of the folder holding the documents to segment
doc_test_dic="" # path of the folder that receives the segmented output files
time_ori = time.time() # script start time, used for progress logging below
# Build the lexicon lookup table from the dictionary file.
def get_dic(test_dic):
    """Read a whitespace-separated dictionary file and return a
    ``defaultdict(int)`` mapping each unique word to 0.

    A dict (rather than a list) is used so membership tests during
    segmentation are O(1).

    The original wrapped the read in ``try/finally`` and called
    ``f.close()`` inside the ``with`` block — a redundant double close;
    the context manager alone handles closing.
    """
    with open(test_dic, 'r', encoding='utf-8') as f:
        words = f.read().split()
    d = defaultdict(int)
    # set() deduplicates; no need to materialize it as a list first.
    for word in set(words):
        d[word] = 0
    return d
# NOTE: use a dict for the lexicon — membership tests on a list would be very slow
dic = get_dic(test_dic)
def _mm_segment(line, word_dict, max_len):
    """Forward-maximum-match a single line; return the list of segments.

    Repeatedly takes the longest prefix (up to ``max_len`` chars) that is
    a dictionary word; unmatched single characters become segments of
    their own.
    """
    segments = []
    while line:
        piece = line[:max_len]
        # Shrink the candidate until it is a dictionary word or one char.
        while piece not in word_dict and len(piece) > 1:
            piece = piece[:-1]
        segments.append(piece)
        line = line[len(piece):]
    return segments

def readfile(word_dict=None, src_dir=None, dst_dir=None):
    """Segment every file in ``src_dir`` with forward maximum matching.

    Each input file is processed line by line and written to ``dst_dir``
    as ``<n>.txt`` (n = 1, 2, ...) with segments separated by spaces;
    newlines are preserved as-is.

    Parameters default to the module-level globals (``dic``,
    ``doc_test``, ``doc_test_dic``) so the original no-argument call
    ``readfile()`` keeps working unchanged.

    Fixes vs. the original: files are opened with context managers (the
    output handle leaked on exception), and the progress message reports
    the file just finished instead of the *next* count (the original
    incremented ``count`` before printing, so file 1 was logged as
    "第2个文本").
    """
    if word_dict is None:
        word_dict = dic
    if src_dir is None:
        src_dir = doc_test
    if dst_dir is None:
        dst_dir = doc_test_dic
    # Keep the original timing base (module import time) when available.
    base = globals().get('time_ori', time.time())
    # Longest dictionary entry bounds the FMM window size.
    max_len = max((len(w) for w in word_dict), default=0)
    for count, name in enumerate(os.listdir(src_dir), start=1):
        src = os.path.join(os.path.abspath(src_dir), name)
        # Segmented output is named by its simple sequence number.
        dst = os.path.join(os.path.abspath(dst_dir), str(count) + '.txt')
        with open(src, 'r', encoding='utf-8') as f_in:
            lines = f_in.readlines()
        with open(dst, 'w', encoding='utf-8') as f_out:
            for line in lines:
                for seg in _mm_segment(line, word_dict, max_len):
                    if seg == '\n':
                        f_out.write('\n')
                    else:
                        f_out.write(seg + ' ')
        print("第" + str(count) + "个文本,time=" + str(time.time() - base))
readfile() # segment every document in doc_test and write results to doc_test_dic
# --- article footer (non-code text from the scraped source page) ---
# Python / 中文分词 / 最大正向匹配分词 (forward maximum matching segmentation)
# 最新推荐文章于 2023-03-09 10:06:50 发布