# -*- coding: utf-8 -*-
import jieba
import re
def detailrearrange(line_jieba):
    """Glue each numeric token to the Chinese token that follows it.

    Walks the token stream once. When a token is a plain number (digits with
    an optional decimal part of 1-3 digits), the following token is
    consumed as well: if it contains a Chinese character and is not the word
    '或', the two are merged into a single token (e.g. '3' + '个' -> '3个');
    otherwise both are kept as separate tokens. A token consumed as the
    follower of a number is never itself examined as a number.

    Args:
        line_jieba: Any iterable of string tokens (a generator such as the
            one returned by ``jieba.cut``, or a list).

    Returns:
        The rearranged tokens joined with single spaces.
    """
    after_rerange = []
    # The dot is escaped so only a literal decimal point is accepted; an
    # unescaped '.' would let tokens such as '1a2' pass as numbers.
    digit = re.compile(r'^[0-9]+(\.[0-9]{1,3})?$')
    zhPattern = re.compile(u'[\u4e00-\u9fa5]+')
    # iter() lets plain lists work too, and guarantees that the for-loop and
    # the explicit next() below advance the same underlying iterator.
    tokens = iter(line_jieba)
    for token in tokens:
        if not digit.match(token):
            after_rerange.append(token)
            continue
        # A number may be the very last token; the default avoids leaking
        # StopIteration out of this function.
        follower = next(tokens, None)
        if follower is None:
            after_rerange.append(token)
        elif zhPattern.search(follower) and follower != '或':
            after_rerange.append(str(token) + follower)
        else:
            after_rerange.append(token)
            after_rerange.append(follower)
    return " ".join(after_rerange)
def words_divide(readpath, savepath, userdict="词库"):
    """Segment a UTF-8 text file with jieba and append the result to a file.

    Every line of ``readpath`` is cut with ``jieba.cut`` (accurate mode, HMM
    enabled), passed through ``detailrearrange`` and the returned string is
    written to ``savepath`` as-is; no extra newline is added here.

    Args:
        readpath: Path of the UTF-8 encoded input text file.
        savepath: Path of the output file. It is opened in append mode, so
            existing content is kept and new output is added after it.
        userdict: Path of the jieba user dictionary to load before cutting.
            Defaults to "词库"; pass ``None`` to skip loading a dictionary.
    """
    # Read the whole input before the output is opened, so the two files are
    # never open at the same time (matters if both paths name the same file).
    with open(readpath, encoding='utf-8') as source:
        lines = source.readlines()

    if userdict is not None:
        jieba.load_userdict(userdict)

    # Write with an explicit UTF-8 encoding, matching the input, instead of
    # relying on the platform default, which may not be able to encode
    # Chinese text.
    with open(savepath, "a", encoding='utf-8') as target:
        for line in lines:
            line_jieba = jieba.cut(line, cut_all=False, HMM=True)
            target.write(detailrearrange(line_jieba))
# Script entry: segment the input file and append the result to the output
# file. NOTE(review): both values look like placeholders to be replaced with
# real file paths before running - confirm.
readpath, savepath = '打开文件', '存储文件'
words_divide(readpath, savepath)
本文中的"词库"文件需要自己建立,使用空白文件也是可以的。
"打开文件"是需要进行 jieba 处理的文件,即用来进行分词、剔除、扩充词库的文本。
"存储文件"用来保存 jieba 分词后的结果,此时的文件是已剔除特殊字符串的文件。
注意:如果出现编码问题,可以用记事本打开相应的文件,在"另存为"时更改编码类型,
将 ANSI 编码方式改为 UTF-8 即可。