"""Print the TF-IDF keywords extracted from lagoujobdatails.txt."""
import sys
from optparse import OptionParser

import jieba
import jieba.analyse

# Short alias for jieba's TF-IDF keyword extractor.
tfidf = jieba.analyse.extract_tags

# Keep the file open only for as long as it takes to read it.
with open('lagoujobdatails.txt', encoding='utf-8') as f:
    text = f.read()

# extract_tags() segments the text itself, so the raw file content is passed
# straight in.  The earlier jieba.cut() + ''.join() round trip only rebuilt
# the same string while segmenting the whole file a second time.
# NOTE: no stop-word or markup filtering is applied here, so leftover tokens
# in the input file can show up among the keywords.
keywords = tfidf(text)

# One keyword per line, in the order extract_tags() returned them.
for keyword in keywords:
    print(keyword)
['爬虫', '熟悉', '抓取', '经验', '开发', '网页', '优先', '数据', '负责', 'spanclass', '精通', '分布式', '技术', '系统', '算法', '岗位职责', '工作', '网络', '设计', '编程']
其中 '爬虫'、'spanclass'、'岗位职责' 是垃圾数据,数据清理时没清理完。