# NOTE(review): N is not referenced by the code visible here; confirm its
# meaning (presumably a corpus/document count) against the rest of the file.
N = 4774


def stop_words(path='stop_words_ch.txt', encoding='gbk'):
    """Load the stop-word list, one entry per line.

    Args:
        path: File to read. Defaults to ``stop_words_ch.txt`` in the current
            working directory.
        encoding: Encoding of the file. Defaults to ``gbk``. Must be an
            ASCII-compatible encoding (GBK, UTF-8, ...) because the file is
            split into lines at the byte level before decoding.

    Returns:
        list: The decoded stop words in file order, with the trailing line
        terminator removed. Blank lines are kept as empty strings.

    Raises:
        IOError: If the file cannot be opened (``OSError`` on Python 3).
        UnicodeDecodeError: If a line is not valid in ``encoding``.
    """
    # Reading bytes and decoding explicitly behaves the same on Python 2 and
    # Python 3; the ``with`` block guarantees the handle is closed.
    with open(path, 'rb') as stop_words_file:
        # Strip only line terminators. Slicing off the last character would
        # eat a real character on a final line that has no trailing newline
        # and would leave '\r' behind for CRLF files.
        return [
            line.decode(encoding).rstrip('\r\n')
            for line in stop_words_file
        ]
# Segment the file's text into words using jieba (结巴分词).
import jieba
def jieba_fenci(raw, stopwords_list):
    """Segment ``raw`` with jieba and drop stop words and newline tokens.

    Args:
        raw: Text to segment. It is passed to ``jieba.cut`` with
            ``cut_all=False``.
        stopwords_list: Iterable of stop words (hashable, e.g. strings) to
            exclude from the result.

    Returns:
        tuple: ``(word_list, word_set)`` where ``word_list`` holds the kept
        tokens in their original order (duplicates preserved) and
        ``word_set`` is the set of distinct kept tokens.
    """
    # A set gives O(1) membership tests instead of scanning the list per token.
    stopwords = set(stopwords_list)
    # Build a new list instead of calling remove() on the list being iterated:
    # that skipped the element after every removal, so adjacent or repeated
    # stop words survived. Filtering '\n' here also avoids the ValueError that
    # list.remove('\n') raised when the text produced no newline token, and
    # drops every newline token rather than only the first.
    word_list = [
        word
        for word in jieba.cut(raw, cut_all=False)
        if word != '\n' and word not in stopwords
    ]
    # word_set is used for the A[nClass] statistics.
    word_set = set(word_list)
    return word_list, word_set
# Read the stop-word list (读取停词表). Truncated excerpt of the original listing:
# N=4774 def stop_words(): stop_words_file = open('stop_words_ch.txt', 'r') stopwords_list = [] for line in stop_words_file.readlines(): stopwords_list.append(line.decode('gbk'...