NLP case study: Bag of Words Meets Bags of Popcorn



Step 1: Collect the dataset


This case study uses the dataset from the Kaggle Bag of Words Meets Bags of Popcorn competition. The labeled training file (labeledTrainData.tsv) contains 25,000 IMDB movie reviews, each with an id, a binary sentiment label, and the review text.

Step 2: Clean the data


Import the required libraries

import pandas as pd
import logging
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s',level=logging.INFO)
from bs4 import BeautifulSoup

Load the data

pd.set_option("display.max_columns",None,"display.max_colwidth",200)
df = pd.read_csv(r"E:\kaggle-word2vec-movie-reviews-master\data\labeledTrainData.tsv",sep="\t",escapechar="\\")

Inspect the data

print(df.info())
print(df.groupby("sentiment")["review"].count())
print(df.describe().round(2).T)

Clean the data

# Strip HTML tags
df["review"] = df.review.apply(lambda x: BeautifulSoup(x, "html.parser").get_text())
# print(df["review"].head())
# Keep only letters, digits, and basic punctuation (,.'?!); replace everything else with a space
df["review"] = df["review"].str.replace(r"[^a-zA-Z0-9,.'?!]", " ", regex=True)
# print(df["review"].head())
# Convert to lowercase
df["review"] = df["review"].str.lower()
print(df["review"].head())
# Save the cleaned data
df.to_csv("bagofword.csv", index=False)
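To sanity-check the cleaning, here is a quick run on a made-up review string (illustrative only, not taken from the dataset):

sample = "<br />This movie was GREAT!!! <b>10/10</b> :)"
text = BeautifulSoup(sample, "html.parser").get_text()
text = pd.Series([text]).str.replace(r"[^a-zA-Z0-9,.'?!]", " ", regex=True).str.lower()[0]
print(text)  # roughly: "this movie was great!!! 10 10"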

Use NLTK to tokenize the data and apply stemming and lemmatization

import nltk
from nltk.corpus import stopwords
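# One-time NLTK setup (an assumption about the local environment; skip if already installed):
# these resources are required by word_tokenize, stopwords, and WordNetLemmatizer below
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")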
# Expand common contractions (plain-string replacement, so regex=False)
contractions = {
    "i'm": "i am", "i've": "i have", "you're": "you are", "you've": "you have",
    "he's": "he is", "she's": "she is", "they're": "they are",
    "won't": "will not", "don't": "do not", "didn't": "did not",
}
for pattern, repl in contractions.items():
    df["review"] = df["review"].str.replace(pattern, repl, regex=False)
print(df["review"].head())
# Tokenize each review into a list of words
df["token"] = df.review.apply(nltk.word_tokenize)
print(df["token"].head())
# Stemming: reduce each word to its stem with the Porter stemmer
porter = nltk.PorterStemmer()
def stemmer(tokens):
    return [porter.stem(w) for w in tokens]
df["token"] = df.token.apply(stemmer)
print(df["token"].head())
# Lemmatization: map each word to its dictionary form with WordNet
lemm = nltk.stem.WordNetLemmatizer()
def lemmer(tokens):
    return [lemm.lemmatize(w) for w in tokens]
df["token"] = df.token.apply(lemmer)
print(df["token"].head())
# Remove stopwords and join the tokens back into a single string
stop = set(stopwords.words("english"))
# print(stop)
def remove_stopwords(tokens):
    return " ".join(w for w in tokens if w not in stop)
df["token"] = df.token.apply(remove_stopwords)
# Save the processed data
df.to_csv("bagofword.csv", index=False)
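To see what the two normalization steps actually do, compare them on a few sample words (illustrative only):

porter = nltk.PorterStemmer()
lemm = nltk.stem.WordNetLemmatizer()
for w in ["movies", "running", "better"]:
    print(w, "->", porter.stem(w), "|", lemm.lemmatize(w))
# movies  -> movi | movie   (the stemmer truncates; the lemmatizer returns a real word)
# running -> run  | running (the lemmatizer defaults to noun POS, so "running" is unchanged)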

Step 3: Find a good data representation


Load the data
import pandas as pd
pd.set_option("display.max_columns",None,"display.max_colwidth",200)
df = pd.read_csv("bagofword.csv")
# print(df["review"].head())
# print(df["token"].head())
Split the dataset
import nltk
# df["token"] = df.token.apply(nltk.word_tokenize)
list_corpus = df["token"].tolist()
# print(df["token"].head())
list_label = df["sentiment"].tolist()
from sklearn.model_selection import train_test_split
# Split into training (80%) and test (20%) sets
x_train,x_test,y_train,y_test = train_test_split(list_corpus,list_label,test_size=0.2,random_state=1)
1. One-hot (bag-of-words) encoding

Encode the text with CountVectorizer (strictly speaking this produces word counts rather than 0/1 one-hot vectors) and train a logistic-regression classifier.

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer
# Here data is the raw text column (one string per review), not token lists
def cv(data):
    counter = CountVectorizer()
    emb = counter.fit_transform(data)
    return emb,counter
x_train,counter = cv(x_train)
x_test = counter.transform(x_test)
clf = LogisticRegression(penalty="l2",C=6,class_weight="balanced",n_jobs=-1,random_state=40,solver="saga")
clf.fit(x_train,y_train)
y_predict = clf.predict(x_test)
y_predict_train = clf.predict(x_train)
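A quick sanity check on the fitted vectorizer (optional): the vocabulary size is the dimensionality of each review vector.

print(x_train.shape)             # (number of reviews, vocabulary size)
print(len(counter.vocabulary_))  # number of distinct terms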

Results:

Training-set metrics (checking for overfitting):
[[9220  777]
 [ 610 9393]]
precision: 0.930770; accuracy: 0.930650; recall: 0.930650; f1: 0.930645

Test-set metrics:
[[2192  311]
 [ 281 2216]]
precision: 0.881657; accuracy: 0.881600; recall: 0.881600; f1: 0.881597
2. TF-IDF

Encode with TF-IDF and train

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
def tfidf(text):
    vectorizer = TfidfVectorizer()
    emb = vectorizer.fit_transform(text)
    return emb, vectorizer
# Keep the fitted vectorizer under a name that does not shadow the function
x_train, tfidf_vec = tfidf(x_train)
x_test = tfidf_vec.transform(x_test)
clf = LogisticRegression(penalty="l2",C=1.5,class_weight="balanced",solver="saga",random_state=40,n_jobs=-1)
clf.fit(x_train,y_train)
y_predict = clf.predict(x_test)
y_predict_train = clf.predict(x_train)

Results:

Training-set metrics (checking for overfitting):
[[9312  685]
 [ 515 9488]]
precision: 0.940127; accuracy: 0.940000; recall: 0.940000; f1: 0.939996

Test-set metrics:
[[2209  294]
 [ 245 2252]]
precision: 0.892353; accuracy: 0.892200; recall: 0.892200; f1: 0.892191
3. word2vec

Train a word2vec model
import nltk
from gensim.models import Word2Vec
# Re-tokenize, since the saved token column holds joined strings
df["token"] = df.token.apply(nltk.word_tokenize)
list_corpus = df["token"].tolist()
# print(df["token"].head())

# Train a skip-gram model (sg=1) with hierarchical softmax (hs=1).
# Note: gensim >= 4.0 renamed the `size` parameter to `vector_size`.
model = Word2Vec(list_corpus,size=300,window=5,min_count=4,sample=1e-3,hs=1,sg=1)
model.save("bag.save")
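Before using the vectors, a quick sanity check that they capture semantic similarity; the query word here is just an example:

print(model.wv.most_similar("great", topn=5))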

Use the trained word2vec model for classification

Convert each sentence into a vector

import numpy as np
# Represent each review as the average of its word vectors
df["token"] = df.token.apply(nltk.word_tokenize)  # skip if the column is already tokenized
list_label = df["sentiment"].tolist()
word2vector = Word2Vec.load("bag.save")
def average(text, size=300):
    if len(text) < 1:
        return np.zeros(size)
    # Out-of-vocabulary words contribute a zero vector
    a = [word2vector.wv[w] if w in word2vector.wv else np.zeros(size) for w in text]
    return np.sum(a, axis=0) / len(a)
df["token"] = df.token.apply(average)
list_corpus = df["token"].tolist()
x_train,x_test,y_train,y_test = train_test_split(list_corpus,list_label,random_state=1,test_size=0.2)
# Train the classifier on the averaged vectors
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(penalty="l2", C=30, class_weight="balanced", solver="saga", random_state=40, n_jobs=-1)
clf.fit(x_train,y_train)
y_predict = clf.predict(x_test)
y_predict_train = clf.predict(x_train)

Results: worse than the one-hot baseline, probably because the corpus is too small to train good word vectors.

Training-set metrics (checking for overfitting):
[[8787 1210]
 [1134 8869]]
precision: 0.882822; accuracy: 0.882800; recall: 0.882800; f1: 0.882798

Test-set metrics:
[[2201  302]
 [ 321 2176]]
precision: 0.875421; accuracy: 0.875400; recall: 0.875400; f1: 0.875398
Loading pretrained word vectors

word2vec_path = r"D:\glove.twitter.27B.200d.bin"
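The original post does not show the loading code. A minimal sketch, assuming the GloVe vectors have already been converted to word2vec binary format (which gensim can read directly):

from gensim.models import KeyedVectors
word2vector = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)
# Reuse the averaging function above with size=200 to match these vectors,
# indexing the KeyedVectors object directly (word2vector[w]) instead of .wv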

Results: worse than the self-trained vectors, probably because the pretrained corpus (tweets) is unrelated to movie reviews, so the word representations transfer poorly.

Training-set metrics (checking for overfitting):
[[7943 2054]
 [1973 8030]]
precision: 0.798669; accuracy: 0.798650; recall: 0.798650; f1: 0.798646

Test-set metrics:
[[2005  498]
 [ 521 1976]]
precision: 0.796223; accuracy: 0.796200; recall: 0.796200; f1: 0.796195
Cross-validation to get a more reliable estimate and catch overfitting

import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score
x_train = np.array(x_train)
y_train = np.array(y_train)
# random_state only has an effect when shuffle=True
kf = KFold(n_splits=5, shuffle=True, random_state=1)
precision1 = []
for train_idx, val_idx in kf.split(x_train):
    clf = LogisticRegression(penalty="l2", C=6, class_weight="balanced", n_jobs=-1, random_state=40, solver="saga")
    # Index with the fold's index arrays rather than slicing, since folds need not be contiguous
    clf.fit(x_train[train_idx], y_train[train_idx])
    y_pre = clf.predict(x_train[val_idx])
    precision1.append(precision_score(y_train[val_idx], y_pre))
# Average the per-fold precision after the loop
mean_precision = np.mean(precision1)
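For reference, scikit-learn can produce the same per-fold scores in one call (an equivalent shortcut, not in the original post):

from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, x_train, y_train, cv=5, scoring="precision", n_jobs=-1)
print(scores.mean())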

Results: cross-validation averages performance over several held-out folds, which gives a more trustworthy estimate of generalization and makes overfitting easier to detect.

Training-set metrics (checking for overfitting):
[[9220  777]
 [ 610 9393]]
precision: 0.930770; accuracy: 0.930650; recall: 0.930650; f1: 0.930645

Test-set metrics:
[[2192  311]
 [ 281 2216]]
precision: 0.881657; accuracy: 0.881600; recall: 0.881600; f1: 0.881597

Step 4: Understand and interpret the model


Evaluate the model with a confusion matrix

from sklearn.metrics import precision_score,accuracy_score,recall_score,f1_score
from sklearn.metrics import confusion_matrix

# Model evaluation: confusion matrix, precision, accuracy, recall, f1
def score_matrix(y_true,y_predicted):
    precision = precision_score(y_true,y_predicted,average="weighted")
    accuracy = accuracy_score(y_true,y_predicted)
    recall = recall_score(y_true,y_predicted,average="weighted")
    f1 = f1_score(y_true,y_predicted,average="weighted")
    return precision,accuracy,recall,f1

# Training-set results
precision,accuracy,recall,f1 = score_matrix(y_train,y_predict_train)
print(confusion_matrix(y_train,y_predict_train))
print("precision:%f; accuracy:%f; recall:%f; f1:%f"%(precision,accuracy,recall,f1))
# Test-set results
precision,accuracy,recall,f1 = score_matrix(y_test,y_predict)
print(confusion_matrix(y_test,y_predict))
print("precision:%f; accuracy:%f; recall:%f; f1:%f"%(precision,accuracy,recall,f1))
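As a compact alternative, classification_report prints per-class precision, recall, and f1 in one call:

from sklearn.metrics import classification_report
print(classification_report(y_test, y_predict))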