逆向最大匹配分词

最近在学习 Python，有一个学习任务：写一个逆向最大匹配分词程序。

import xlrd
import codecs
import os

def eachFile(filepath):
    """Collect the entries of a directory as '/'-prefixed names.

    Args:
        filepath: Directory whose entries are listed with ``os.listdir``.

    Returns:
        A set with one string per directory entry, each being the entry
        name with a leading ``'/'`` (for example ``'/a.txt'``).
    """
    return {'/{}'.format(entry) for entry in os.listdir(filepath)}
def readtxt(filepath):
    """Read a UTF-8 text file and return its lines.

    Args:
        filepath: Path of the text file to read.

    Returns:
        A list with one string per line; line endings are kept.
    """
    with open(filepath, mode='r', encoding='utf8') as handle:
        # Iterating a text file yields the same lines as readlines().
        return list(handle)
def writer_result(filepath, sentence):
    """Append text to a file, encoded as UTF-8.

    Args:
        filepath: Path of the output file; it is opened in append mode.
        sentence: Text to append. It is written as given; no line ending
            is added.
    """
    out = codecs.open(filepath, 'a', encoding='utf8')
    try:
        out.write(sentence)
    finally:
        out.close()

def get_seg_words(filepath):
    """Load the segmentation dictionary from a workbook.

    The words are taken from the first sheet, column index 1, starting at
    row index 1 (row 0 is skipped — presumably a header row; confirm
    against the actual workbook).

    Args:
        filepath: Path of the workbook, opened with ``xlrd``.

    Returns:
        A tuple ``(words, longest)`` where ``words`` is the set of cell
        values read and ``longest`` is the length of the longest one
        (0 when no values were read).
    """
    workbook = xlrd.open_workbook(filepath)
    entries = workbook.sheet_by_index(0).col_values(1, 1)
    word_dir = set(entries)
    longest = max(map(len, entries), default=0)
    return word_dir, longest
def get_stop_words(filepath):
    """Load the stop-word list from a workbook.

    The words are taken from the first sheet, column index 1, starting at
    row index 1 (row 0 is skipped — presumably a header row; confirm
    against the actual workbook).

    Args:
        filepath: Path of the workbook, opened with ``xlrd``.

    Returns:
        The set of cell values read from that column.
    """
    first_sheet = xlrd.open_workbook(filepath).sheet_by_index(0)
    return set(first_sheet.col_values(1, 1))


def _segment_sentence(sentence, seg_words, stop_words, max_index):
    """Segment one sentence by reverse maximum matching.

    The sentence is scanned from its end. At each position the longest
    candidate (at most ``max_index`` characters) is tried first and the
    candidate is shortened from the left until one of these holds:

    * it is a stop word  -> it is dropped from the output;
    * it is a dictionary word, or only one character is left -> it is
      emitted as a token.

    Args:
        sentence: Text to segment (already stripped by the caller).
        seg_words: Set of dictionary words.
        stop_words: Set of stop words to leave out of the result.
        max_index: Length of the longest dictionary word.

    Returns:
        The tokens in sentence order, each followed by ``'/'``; an empty
        string for an empty sentence.
    """
    # A window of at least one character guarantees progress even when the
    # dictionary is empty (max_index == 0), which previously looped forever.
    window = max(max_index, 1)
    tokens = []
    end = len(sentence)
    while end > 0:
        for start in range(max(end - window, 0), end):
            piece = sentence[start:end]
            if piece in stop_words:
                break
            if piece in seg_words or end == start + 1:
                tokens.append(piece)
                break
        # The loop above always breaks at the latest on the single-character
        # candidate, so ``start`` marks where the consumed piece began.
        end = start
    # Tokens were collected back to front; restore sentence order.
    tokens.reverse()
    return ''.join(token + '/' for token in tokens)


# The results are appended to files under 'result'; create it if missing so
# the first write does not fail.
os.makedirs('result', exist_ok=True)

paths_set = eachFile('分词文本')

seg_words, max_index = get_seg_words(r'词表/words.xlsx')
stop_words = get_stop_words(r'词表/stopwords.xlsx')
for path in paths_set:
    print('begain : %s' % path)
    for sentence in readtxt('分词文本' + path):
        result_sentence = _segment_sentence(
            sentence.strip(), seg_words, stop_words, max_index)
        # One append per input line: the segmented text plus a CRLF.
        writer_result('result' + path, result_sentence + '\r\n')

转载于:https://my.oschina.net/u/3411375/blog/907259

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值