Bag of Words Meets Bags of Popcorn
第一步:收集数据集
本案例使用的是kaggle数据集 Bag of Words Meets Bags of Popcorn
第二步:清洗数据
导入需要的库
import logging

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',level=logging.INFO)
导入数据 TODO
# Show all columns and wide review text when printing frames.
pd.set_option("display.max_columns", None, "display.max_colwidth", 200)
# Raw string avoids the invalid escape sequences (\k, \d, \l) in the Windows
# path (a SyntaxWarning / future error in modern Python); the runtime value is
# unchanged.  escapechar="\\" un-escapes backslashes inside the TSV fields.
df = pd.read_csv(r"E:\kaggle-word2vec-movie-reviews-master\data\labeledTrainData.tsv",
                 sep="\t", escapechar="\\")
观察数据 TODO
# Quick sanity checks: schema/dtypes, class balance, numeric summary.
per_class = df.groupby("sentiment")["review"].count()
summary = df.describe().round(2).T
print(df.info())
print(per_class)
print(summary)
清洗数据
# 去除网络标签
# Strip HTML markup left over from the scraped reviews.
df["review"] = df.review.apply(lambda x: BeautifulSoup(x, "html.parser").get_text())
# Keep only letters, digits and basic punctuation (, . ' ? !); everything else
# becomes a space.  regex=True is required: pandas >= 2.0 defaults to literal
# replacement, which would silently leave the text untouched.
df["review"] = df["review"].str.replace(r"[^a-zA-Z0-9,.'?!]", " ", regex=True)
# Lower-case everything.
df["review"] = df["review"].str.lower()
print(df["review"].head())
# Persist the cleaned data as a checkpoint.
df.to_csv("bagofword.csv")
用nltk对数据进行分词,词形还原,词干提取
import nltk
from nltk.corpus import stopwords
# Expand common English contractions so that e.g. "don't" and "do not"
# produce the same tokens.  Plain substring replacement, applied in list
# order (order matters: replacing "he's" first also rewrites "she's",
# matching the original script's behavior).
# NOTE(review): the original mapped the typo "wan't" -> "want not"; the
# intended contraction is "won't" -> "will not", fixed below.
_contractions = [
    ("i'm", "i am"),
    ("i've", "i have"),
    ("you're", "you are"),
    ("you've", "you have"),
    ("he's", "he is"),
    ("she's", "she is"),
    ("they're", "they are"),
    ("won't", "will not"),
    ("don't", "do not"),
    ("didn't", "did not"),
]
for _short, _long in _contractions:
    # regex=False: these are literal substrings, not patterns.
    df["review"] = df["review"].str.replace(_short, _long, regex=False)
print(df["review"].head())
# Tokenize each cleaned review into a list of word tokens.
df["token"] = df["review"].apply(nltk.word_tokenize)
print(df["token"].head())
# Stemming: reduce every token to its Porter stem.
# (The original comment labeled this "lemmatization"; it is stemming.)
def stemer(text):
    """Return the list of Porter stems for the tokens in *text*."""
    porter = nltk.PorterStemmer()
    return [porter.stem(w) for w in text]

df["token"] = df.token.apply(stemer)
print(df["token"].head())
# Lemmatization: map every token to its WordNet lemma.
# NOTE(review): WordNetLemmatizer defaults to noun POS, so verbs are mostly
# left unchanged — confirm that is acceptable for this pipeline.
def lemmer(text):
    """Return the list of WordNet lemmas for the tokens in *text*."""
    lemm = nltk.stem.WordNetLemmatizer()
    return [lemm.lemmatize(w) for w in text]

df["token"] = df.token.apply(lemmer)
print(df["token"].head())
# Remove English stop words, then re-join tokens into one space-separated
# string per review.  A set gives O(1) membership tests instead of the
# original list's O(n) scan per token.
stop = set(stopwords.words("english"))

def remove(text):
    """Drop stop words from *text* (a token list) and join with spaces."""
    return " ".join(w for w in text if w not in stop)

df["token"] = df.token.apply(remove)
# Persist the processed data (overwrites the earlier checkpoint).
df.to_csv("bagofword.csv")
第三步:找到一个好的数据表示方式
导入数据
import pandas as pd
# Reload the cleaned/processed checkpoint.  index_col=0 re-uses the unnamed
# index column written by the earlier df.to_csv call instead of keeping it
# around as a spurious "Unnamed: 0" column.
pd.set_option("display.max_columns", None, "display.max_colwidth", 200)
df = pd.read_csv("bagofword.csv", index_col=0)
划分数据集
import nltk
# df["token"] = df.token.apply(nltk.word_tokenize)
# Corpus/label lists for sklearn's splitter.
list_corpus = df["token"].tolist()
list_label = df["sentiment"].tolist()
from sklearn.model_selection import train_test_split
# Hold out 20% as the test set; fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    list_corpus, list_label, test_size=0.2, random_state=1
)
一、one hot 独热编码(可视化嵌入)
进行onehot编码、划分训练集、训练
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
# `data` here is raw text (one document per element), not pre-tokenized.
def cv(data):
    """Fit a bag-of-words CountVectorizer on *data*; return (matrix, vectorizer)."""
    vectorizer = CountVectorizer()
    return vectorizer.fit_transform(data), vectorizer
# Fit the vocabulary on the training split, then apply it to the test split.
x_train, counter = cv(x_train)
x_test = counter.transform(x_test)
# Balanced L2 logistic regression; saga handles the large sparse input.
clf = LogisticRegression(
    penalty="l2", C=6, class_weight="balanced",
    n_jobs=-1, random_state=40, solver="saga",
)
clf.fit(x_train, y_train)
y_predict = clf.predict(x_test)
y_predict_train = clf.predict(x_train)
运行结果:result
x_train 准确度,查看是否有过拟合
[[9220 777]
[ 610 9393]]
the th precisoin:0.930770; accuracy:0.930650; recall:0.930650; f1:0.930645
x_test 准确度:
[[2192 311]
[ 281 2216]]
the th precisoin:0.881657; accuracy:0.881600; recall:0.881600; f1:0.881597
二、Tf-idf
进行tfidf 编码、训练
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
def tfidf(text):
    """Fit a TfidfVectorizer on *text*; return (tf-idf matrix, vectorizer).

    The local is named ``vectorizer`` so it no longer shadows this
    function's own name (the original used a local called ``tfidf``).
    """
    vectorizer = TfidfVectorizer()
    emb = vectorizer.fit_transform(text)
    return emb, vectorizer
# NOTE: this rebinds the module-level name `tfidf` from the function to the
# fitted vectorizer, mirroring the original script.
x_train, tfidf = tfidf(x_train)
x_test = tfidf.transform(x_test)
clf = LogisticRegression(
    penalty="l2", C=1.5, class_weight="balanced",
    solver="saga", random_state=40, n_jobs=-1,
)
clf.fit(x_train, y_train)
y_predict = clf.predict(x_test)
y_predict_train = clf.predict(x_train)
运行结果:result
x_train 准确度,查看是否有过拟合
[[9312 685]
[ 515 9488]]
the precisoin:0.940127; accuracy:0.940000; recall:0.940000; f1:0.939996
x_test 准确度:
[[2209 294]
[ 245 2252]]
the precisoin:0.892353; accuracy:0.892200; recall:0.892200; f1:0.892191
三、word2vector
word2vector 模型训练
import nltk
from gensim.models import Word2Vec
# Re-tokenize: `token` was saved to CSV as one joined string per review,
# so it must be split back into token lists before training.
df["token"] = df.token.apply(nltk.word_tokenize)
list_corpus = df["token"].tolist()
# print(df["token"].head())
# Train skip-gram (sg=1) with hierarchical softmax (hs=1), 300-dim vectors,
# ignoring words seen fewer than 4 times.
# NOTE(review): `size=` is the gensim<4 keyword; gensim>=4 renamed it to
# `vector_size=` — confirm the installed gensim version.
model = Word2Vec(list_corpus,size=300,window=5,min_count=4,sample=1e-3,hs=1,sg=1)
model.save("bag.save")
利用训练好的word2vector进行分类训练
将句子转化成向量
# Represent each sentence by the average of its word vectors.
df["token"] = df.token.apply(nltk.word_tokenize)
list_label = df["sentiment"].tolist()
# Reload the word2vec model trained and saved above.
word2vector = Word2Vec.load("bag.save")
def avetage(text, size=300):
    """Average the word vectors of the tokens in *text*.

    Tokens missing from the vocabulary contribute a zero vector; an empty
    token list yields the all-zero vector.  Requires ``import numpy as np``,
    which was missing from the original script's imports.
    """
    if len(text) < 1:
        return np.zeros(size)
    # NOTE(review): `word2vector[w]` / `w in word2vector` is the gensim<4
    # API; gensim>=4 uses `word2vector.wv` — confirm the installed version.
    vectors = [word2vector[w] if w in word2vector else np.zeros(size)
               for w in text]
    return np.sum(vectors, axis=0) / len(vectors)
# Replace each token list with its averaged vector, then split 80/20.
df["token"] = df["token"].apply(avetage)
list_corpus = df["token"].tolist()
x_train, x_test, y_train, y_test = train_test_split(
    list_corpus, list_label, test_size=0.2, random_state=1
)
# Classify the averaged sentence vectors with logistic regression.
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(
    penalty="l2", C=30, class_weight="balanced",
    solver="saga", random_state=40, n_jobs=-1,
)
clf.fit(x_train, y_train)
y_predict = clf.predict(x_test)
y_predict_train = clf.predict(x_train)
运行结果:result 结果比onehot的要差,可能是训练集太小,训练出来的词向量有问题
x_train 准确度,查看是否有过拟合
[[8787 1210]
[1134 8869]]
the precisoin:0.882822; accuracy:0.882800; recall:0.882800; f1:0.882798
x_test 准确度:
[[2201 302]
[ 321 2176]]
the precisoin:0.875421; accuracy:0.875400; recall:0.875400; f1:0.875398
加载训练好的word2vector
word2vec_path = "D:\glove.twitter.27B.200d.bin
运行结果:result 结果比自己训练的要差,可能数据集不相关,导致词向量表示有问题
x_train 准确度,查看是否有过拟合
[[7943 2054]
[1973 8030]]
the precisoin:0.798669; accuracy:0.798650; recall:0.798650; f1:0.798646
x_test 准确度:
[[2005 498]
[ 521 1976]]
the precisoin:0.796223; accuracy:0.796200; recall:0.796200; f1:0.796195
交叉验证防止过拟合
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score
# shuffle=False keeps the folds as contiguous index ranges.  The original
# also passed random_state=1, which newer sklearn rejects when shuffle=False
# (and which has no effect without shuffling), so it is dropped.
kf = KFold(n_splits=5, shuffle=False)
precision1 = []
for train_idx, test_idx in kf.split(x_train):
    clf = LogisticRegression(penalty="l2", C=6, class_weight="balanced",
                             n_jobs=-1, random_state=40, solver="saga")
    # With shuffle=False the index arrays are contiguous, so slicing is
    # valid; the +1 fixes the original off-by-one that silently dropped the
    # last row of every fold.
    clf.fit(x_train[train_idx[0]:train_idx[-1] + 1],
            y_train[train_idx[0]:train_idx[-1] + 1])
    y_pre = clf.predict(x_train[test_idx[0]:test_idx[-1] + 1])
    precision1.append(precision_score(y_train[test_idx[0]:test_idx[-1] + 1], y_pre))
# Mean precision across the 5 folds (name kept from the original script).
y_predict_train = sum(precision1) / len(precision1)
运行结果:result 可发现kfold 确实能够防止过拟合问题
x_train 准确度,查看是否有过拟合
[[2192 311]
[ 281 2216]]
precisoin:0.881657; accuracy:0.881600; recall:0.881600; f1:0.881597
x_test 准确度:
[[9220 777]
[ 610 9393]]
precisoin:0.930770; accuracy:0.930650; recall:0.930650; f1:0.930645
第四步:理解解释模型
调用 confusion matrix 进行模型评估
from sklearn.metrics import precision_score,accuracy_score,recall_score,f1_score
from sklearn.metrics import confusion_matrix
# 模型评估 confuse matrix accuracy recall presicion f1
def score_matrix(y_test, y_predicted):
    """Return (precision, accuracy, recall, f1) for the given predictions.

    Precision/recall/f1 are weighted by class support.  The original passed
    pos_label=None, which is ignored when average != "binary" and triggered
    deprecation warnings in some sklearn versions, so it is removed.
    """
    precision = precision_score(y_test, y_predicted, average="weighted")
    accuracy = accuracy_score(y_test, y_predicted)
    recall = recall_score(y_test, y_predicted, average="weighted")
    f1 = f1_score(y_test, y_predicted, average="weighted")
    return precision, accuracy, recall, f1
# Report confusion matrix + metrics for both splits.  The original computed
# the TEST metrics first and printed them under the training-set heading,
# while the training metrics (computed on the very last line) were never
# printed at all — the two score_matrix calls are reordered to fix that.
# Training-set results (compare with test to spot over-fitting).
precision, accuracy, recall, f1 = score_matrix(y_train, y_predict_train)
cm = confusion_matrix(y_train, y_predict_train)
print(cm)
print("the th precisoin:%2f; accuracy:%2f; recall:%2f; f1:%2f"%(precision,accuracy,recall,f1))
# Test-set results.
precision, accuracy, recall, f1 = score_matrix(y_test, y_predict)
cm = confusion_matrix(y_test, y_predict)
print(cm)
print("the th precisoin:%2f; accuracy:%2f; recall:%2f; f1:%2f"%(precision,accuracy,recall,f1))