【NLP】NO4:文本分类

（原文此处有插图，抓取时未能保留图片）

import pandas as pd
import random
import jieba

# Load stop words; the txt file's contents can vary per project.
# quoting=3 (QUOTE_NONE) keeps quote characters as literal text.
stopwords = pd.read_csv('stopword.txt', index_col=False, quoting=3, sep='\t', names=['stopwords'], encoding='utf-8')
stopwords = stopwords['stopwords'].values

# Load the corpus.
# BUG FIX: the read_csv keyword is `sep`, not `seq` (the original raised TypeError).
data = pd.read_csv('data.csv', encoding='utf-8', sep=',')
data.dropna(inplace=True)
# Keep only the text column as a plain Python list.
# NOTE(review): assumes data.csv has a `segment` column holding raw text — TODO confirm.
data = data.segment.values.tolist()
#分词、去停用词
def preprocess(data):
	"""Tokenize each line with jieba and strip noise tokens.

	For every line: segment into words, drop pure-digit tokens,
	whitespace-only tokens, single-character tokens, and stop words
	(from the module-level `stopwords`). Lines that raise during
	processing are printed and skipped (best-effort behavior kept
	from the original).

	:param data: iterable of raw text lines
	:return: list of cleaned, space-joined sentences
	"""
	# BUG FIX: `sentences` was an undefined global — NameError on first append.
	sentences = []
	for line in data:
		try:
			segs = jieba.lcut(line)  # word segmentation
			segs = [v for v in segs if not str(v).isdigit()]  # drop pure-digit tokens
			segs = list(filter(lambda x: x.strip(), segs))  # drop whitespace-only tokens
			segs = list(filter(lambda x: len(x) > 1, segs))  # drop single-character tokens
			segs = list(filter(lambda x: x not in stopwords, segs))  # drop stop words
			# BUG FIX: join with a space so CountVectorizer(analyzer='word')
			# can re-tokenize; "".join fused the words back into one string.
			sentences.append(" ".join(segs))
		except Exception:
			# Best-effort: report the bad line and keep going.
			print(line)
			continue
	return sentences
# Preprocess the corpus (tokenize + clean).
# BUG FIX: the original stored this in `segs` and then never used it —
# the raw `data` was fed to the vectorizer, bypassing all preprocessing.
sentences = preprocess(data)

# Feature extraction: bag-of-words model (could be swapped for word2vec).
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(analyzer='word', max_features=4000)

# Split the corpus into train / test sets.
# BUG FIX: the statement started with `for`; the keyword is `from` (SyntaxError).
from sklearn.model_selection import train_test_split
# NOTE(review): the original never defines labels — `y_train`/`y_test` were
# used below without existing. Labels must be loaded alongside the texts
# (e.g. a label column in data.csv) and split together — TODO confirm source.
x_train, x_test, y_train, y_test = train_test_split(sentences, y, random_state=42)

# Learn the vocabulary from the training texts only (avoids test-set leakage).
vec.fit(x_train)

# Model: multinomial Naive Bayes (any classifier could be substituted here).
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(vec.transform(x_train), y_train)

# NOTE(review): the original comment said "AUC", but classifier.score
# returns mean accuracy on the given test data.
print(classifier.score(vec.transform(x_test), y_test))

# Predictions for the test set.
pre = classifier.predict(vec.transform(x_test))
©️2020 CSDN 皮肤主题: 编程工作室 设计师:CSDN官方博客 返回首页