#encoding=utf-8
import io
import sys

# Make the sibling directory importable (must run before the jieba imports below).
sys.path.append("../")

import jieba
import jieba.posseg as pseg
from jieba import analyse
# Load the stop-word list (one word per line, UTF-8).  io.open decodes the
# file for us, which works identically on Python 2 and Python 3, and the
# context manager closes the handle (the original leaked it).  A set makes
# the per-token membership test O(1) instead of O(n).
with io.open('stop_words.txt', encoding='utf-8') as stop_file:
    stop = set(line.strip() for line in stop_file)

# Load the user-defined dictionary so domain terms segment as single tokens.
jieba.load_userdict("userdict.txt")

# Read the text to segment (closed promptly via the context manager).
with io.open('example.txt', encoding='utf-8') as text_file:
    s = text_file.read()
#s="朝鲜半岛西北部古元古代高温变质-深熔作用:宏观和微观岩石学以及锆石U-Pb年代学制约"

# Segment with part-of-speech tags in a single pass.  (The original also ran
# a plain jieba.cut whose result was immediately discarded — removed.)
segs = pseg.cut(s)

# Keep tokens that are not stop words and whose POS tag is neither
# 'm' (numeral) nor 'x' (non-word string: punctuation/whitespace).
words = [seg for seg, flag in segs
         if seg not in stop and flag != 'm' and flag != 'x']

# Join once instead of repeated += concatenation (quadratic on Python 2).
# Each token is prefixed with a space, matching the original output exactly.
final = ''.join(' ' + seg for seg in words)
# To also emit the POS tag per token, use: ' ' + seg + '/' + flag
print(final)
# Source: blog post "Python学习(二) 利用jieba分词及去停用词"
# ("Python study (2): jieba segmentation and stop-word removal"),
# last updated 2024-07-05 14:19:56.