1. Chinese word segmentation overview (Xmind mind map)
2. Jieba segmentation
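jieba offers a precise mode (the default), a full mode, and a search-engine mode for cutting a sentence. A minimal sketch, assuming jieba is installed (pip install jieba) and using an example sentence made up for illustration:

import jieba

sentence = '质量不错，穿着很舒服'
print('/'.join(jieba.cut(sentence)))                # precise mode (default): best single segmentation for text analysis
print('/'.join(jieba.cut(sentence, cut_all=True)))  # full mode: every word the dictionary can find
print('/'.join(jieba.cut_for_search(sentence)))     # search-engine mode: long words are further split

The full script below reads the review .txt files, segments the text, filters stop words, and prints the ten most frequent terms: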
import glob
import jieba

def get_content(path):
    """Read a text file and return its content as one concatenated string."""
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        content = ''
        for line in f:
            content += line.strip()
        return content

def get_TF(words, topK=10):
    """Count term frequencies and return the topK (word, count) pairs."""
    tf_dic = {}
    for w in words:
        tf_dic[w] = tf_dic.get(w, 0) + 1  # get(w, 0) returns the current count, defaulting to 0
    return sorted(tf_dic.items(), key=lambda x: x[1], reverse=True)[:topK]  # items() yields iterable (word, count) tuples

def stop_words(path):
    """Load stop words, one per line, plus the space character."""
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        words = [x.strip() for x in f]
        words.append(' ')
        return words

def main():
    files = glob.glob('C:\\Users\\11728\\Desktop\\美团\\data\\*.txt')  # or glob.iglob; selects all matching paths
    txt = [get_content(path) for path in files][0]
    # jieba.add_word()                      # add a custom word at runtime
    # jieba.load_userdict('add_words.txt')  # load a UTF-8 user dictionary, one entry per line: word freq POS, e.g. 优秀 5 a
    n = [x for x in jieba.cut(txt) if x not in stop_words('stop_words.txt')]
    # y = [',', '很', '好', ' ', '可以']
    # n = [x for x in jieba.cut(txt) if x not in y]
    print(get_TF(n))

main()
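The two commented-out lines in main() are the hooks for custom vocabulary. A minimal sketch of how they are used; the word and the sentence here are made-up examples, and add_words.txt is simply the file name from the comment above:

import jieba

jieba.add_word('性价比高')               # register one word at runtime so it is not split apart
jieba.load_userdict('add_words.txt')     # UTF-8 file, one entry per line: word [freq] [POS], e.g. 优秀 5 a
print('/'.join(jieba.cut('这件内衣性价比高')))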
Segmentation result showing the top ten words by frequency:
[('舒服', 479), ('穿', 355), ('质量', 259), ('收到', 256), ('不错', 245), ('内衣', 231), ('穿着', 226), ('买', 190), ('喜欢', 183), ('宝贝', 173)]
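The hand-rolled get_TF is equivalent to collections.Counter from the standard library; a minimal sketch that returns the same top-10 (word, count) pairs (tie ordering may differ):

from collections import Counter

def get_TF_counter(words, topK=10):
    # Counter counts occurrences; most_common sorts by count in descending order
    return Counter(words).most_common(topK)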