# jieba word segmentation example
def get_content(path):
    """Read a GBK-encoded file and return its text as a single string."""
    with open(path, 'r', encoding='gbk', errors='ignore') as f:
        content = ''
        for l in f:
            l = l.strip()
            content += l
        return content
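# A minimal usage sketch (the file name below is hypothetical; only the
# directory pattern 'data/news/C000013/*.txt' appears in this demo):
# text = get_content('data/news/C000013/10.txt')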
def get_TF(words, topK=10):
    """Count term frequencies and return the topK most frequent words."""
    tf_dic = {}
    for w in words:
        tf_dic[w] = tf_dic.get(w, 0) + 1
    return sorted(tf_dic.items(), key=lambda x: x[1], reverse=True)[:topK]

def stop_words(path):
    """Load a stop-word list (one word per line, UTF-8 encoded)."""
    with open(path, encoding='UTF-8') as f:
        return [l.strip() for l in f]
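# Sanity check for get_TF on a toy input (hypothetical, not from the corpus).
# dict preserves insertion order and sorted() is stable, so tied counts keep
# their first-seen order:
assert get_TF(['自然', '语言', '处理', '自然'], topK=2) == [('自然', 2), ('语言', 1)]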
# Segmentation demo
def main():
    import glob
    import random
    import jieba
    import jieba.posseg as psg  # POS-tagged segmentation; see the sketch below

    # Find the file paths that match the pattern
    files = glob.glob('data/news/C000013/*.txt')
    corpus = [get_content(x) for x in files[:5]]
    # randint is inclusive on both ends, so the upper bound is len(corpus) - 1
    sample_inx = random.randint(0, len(corpus) - 1)
    sample_inx = 3  # override the random choice so the demo is reproducible
    split_words = [x for x in jieba.cut(corpus[sample_inx])
                   if x not in stop_words('./data/stop_words.utf8')]
    print('Sample text: ' + corpus[sample_inx])
    print('Segmentation result: ' + '/ '.join(split_words))
    print('Top 10 words of the sample: ' + str(get_TF(split_words)))
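# The jieba.posseg import hints at a POS-filtered variant. A minimal sketch,
# assuming we only want to keep nouns (flags starting with 'n'); the helper
# name get_noun_TF is hypothetical, not part of the original demo:
def get_noun_TF(text, stop_path='./data/stop_words.utf8', topK=10):
    import jieba.posseg as psg
    sw = stop_words(stop_path)
    words = [p.word for p in psg.cut(text)
             if p.flag.startswith('n') and p.word not in sw]
    return get_TF(words, topK)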
if __name__ == '__main__':
    main()