最近在学习关于情感分类的知识,以下是最近学习的一些知识总结
一.主要步骤
1.选定数据集及对数据集的处理
这里我选用的是大众点评的数据集,数据集中包含了对菜品的打分,分数超过3的我规定为认可,3分及以下的为不认可,这样就给数据集加上了标签。
import pandas as pd


def label(score):
    """Return 1 when the rating counts as positive (score > 3), else 0.

    Ratings above 3 are "approve" (1); 3 or below are "disapprove" (0).
    """
    if score > 3:
        return 1
    else:
        return 0


if __name__ == "__main__":
    data = pd.read_csv("E:\\data.csv")
    # Add a "label" column derived from the score column.
    # NOTE(review): the original notes mixed three spellings (lable / label /
    # "labal") and referenced data.lscores — assumed the score column is
    # named "scores"; verify against the actual CSV header.
    data["label"] = data.scores.apply(label)
2.使用正则表达式清除句子中的标点符号和特殊字符
import re

# Characters to DELETE from each line (original pattern kept verbatim).
# NOTE(review): the leading ^ inverts the class — sub() removes every
# character NOT listed (e.g. Chinese characters) while KEEPING the listed
# punctuation, which looks like the opposite of the stated goal; confirm.
PATTERN = "[^a-zA-Z_.!+-=——,$%^,! !。?、:? ~@#¥%……&*《》<>「」{}【】()/\\\[\]'\"]"


def clean_text(text):
    """Strip every character matched by PATTERN from *text*."""
    return re.sub(PATTERN, "", text)


if __name__ == "__main__":
    cleaned = []
    with open("E:\\data.csv", encoding="utf-8") as line_raw:
        for line in line_raw:
            # Bug fix: re.sub() needs a string, but the original passed the
            # LIST returned by line.split('\t'); clean each field instead.
            # (The stdlib `re` module suffices; third-party `regex` features
            # were never used.)
            cleaned.append([clean_text(field) for field in line.split("\t")])
3.读取无用词
def load_stopwords(path):
    """Read one stopword per line from *path* and return them as a set."""
    stop_word = []
    with open(path, encoding="utf-8") as words:
        for word in words:
            # Bug fix: the original called .replace() on the FILE object
            # (`words`) instead of the current line, so every entry was wrong.
            word = word.replace("\n", "").replace("\t", "").strip()
            stop_word.append(word)
    return set(stop_word)


if __name__ == "__main__":
    stop_word = load_stopwords("E://stopword")
4.对文本进行结巴分词且去除无用词
import jieba
from sklearn.model_selection import train_test_split


def cut_comment(text):
    """Jieba-segment *text*, drop stopwords and tabs, join tokens with spaces."""
    words = jieba.cut(text, cut_all=False)
    return " ".join(w for w in words if w not in stop_word and w != "\t")


if __name__ == "__main__":
    # Bug fixes vs. the original notes:
    #  * `data.comment.apply(.join(jieba.cut(mytext)))` was a syntax error
    #    referencing an undefined `mytext`;
    #  * the manual loop reset its accumulator (`last_word = " "`) INSIDE the
    #    loop, discarding everything it had built;
    #  * `data.lable` is spelled to match the "label" column added earlier.
    data["cut_comment"] = data.comment.apply(cut_comment)
    X = data["cut_comment"]  # X: space-joined segmented sentences
    y = data.label           # y: 0/1 sentiment labels
    # Hold out 20% of the rows as the test split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=22
    )
5.形成词袋模型使用CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Bug fix: CountVectorizer was used without ever being imported.
vect = CountVectorizer(
    max_df=0.8,                       # drop terms appearing in >80% of documents
    min_df=3,                         # drop terms appearing in <3 documents
    stop_words=frozenset(stop_word),  # reuse the stopword set loaded earlier
)
6.训练模型
from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
# Fit the vocabulary on the training split only, then reuse it for the test
# split so both share one feature space (no leakage from the test data).
X_train_vect = vect.fit_transform(X_train)
X_test_vect = vect.transform(X_test)
# Bug fix: the original notes built the model but never actually trained it.
nb.fit(X_train_vect, y_train)