1、分词,打标签;
2、特征选择:卡方检验。
def chi_select():
#构建停用词表
stopwords=[]
with open("../hlt_stop_words.txt","r") as stopword:
for line in stopword: #遍历文件,一行行遍历,读取文本
rs = line.replace('\n', '')
stopwords.append(rs)
#读入文本
x_text = 1_examples + 2_examples + ...
# 去停用词
x_stop=[]
for word in x_text:
# word = word.split(" ")
rs = []
for _ in word:
if _ not in stopwords:
rs.append(_)
x_stop.append(rs)
#重组
x_final=[]
for i in x_stop:
x_final.append(str(i))
#建立词典
max_document_length = max([len(x) for x in x_stop])
#print(max_document_length)
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_final)