对IMDB评论数据进行情感分析
情感分析有很多应用场景:比如运营一个电商网站时,卖家需要时刻关注用户对商品的评论是否正面;再比如做一部电影的宣传和策划时,电影在“键盘侠”们中的口碑也至关重要。互联网上围绕任何一个事件或物品都可能产生成千上万条文本评论,如何判定每一条文本的情绪是正面还是负面,是一件很有挑战的事情。参考:使用 tensorflow 进行情感分析。
kaggle上的IMDB分类:
import pandas as pd
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense , Input , LSTM , Embedding, Dropout , Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.models import load_model
# Labelled training reviews (tab-separated). quoting=3 is csv.QUOTE_NONE, so
# double quotes are kept as literal characters in every field.
train = pd.read_csv('data/labeledTrainData.tsv', header=0, delimiter="\t", quoting=3)
# Second, larger review collection used to extend the training set.
df_train1 = pd.read_csv('data/imdb_master.csv', encoding='latin-1')
# print(train.columns.values)
# print(df_train1.columns.values)

# Drop the columns that are not used below and align the remaining column
# names with the ones used for `train` ('id', 'sentiment', 'review').
# NOTE(review): 'type' is dropped without being filtered on. If it marks a
# train/test split, test reviews may leak into the training data -- confirm.
df_train1 = df_train1.drop(["type", 'file'], axis=1)
df_train1.rename(columns={'label': 'sentiment', 'Unnamed: 0': 'id'}, inplace=True)
# print(df_train1.head(10))
# print(df_train1['sentiment'].unique())

# Remove the reviews labelled 'unsup'. The explicit copy makes the column
# assignment below operate on an independent frame instead of a slice of the
# original one (avoids pandas' SettingWithCopyWarning).
df_train1 = df_train1[df_train1.sentiment != 'unsup'].copy()
# Encode the textual labels as integers: positive -> 1, negative -> 0.
maping = {'pos': 1, 'neg': 0}
df_train1['sentiment'] = df_train1['sentiment'].map(maping)

# Training data: both sources stacked. ignore_index=True gives the result a
# fresh 0..n-1 index instead of repeating the row labels of both inputs.
new_train = pd.concat([train, df_train1], ignore_index=True)
# new_train.to_csv('data/new_train.csv')
# new_train = pd.read_csv('data/new_train.csv')

# Test data: same file format as the labelled training file.
df_test = pd.read_csv("data/testData.tsv", header=0, delimiter="\t", quoting=3)
#数据清理:
# Matches every character that is not an ASCII letter.
_NON_LETTER_RE = re.compile(r"[^a-zA-Z]")
# English stop words, loaded on first use and then shared by all calls.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stop words as a set, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words("english"))
    return _ENGLISH_STOPWORDS


def review_to_words(review):
    """Turn one raw review into a cleaned, space-separated string of words.

    Steps: strip HTML markup, replace every non-letter character with a
    space, lower-case the text, split it on whitespace and drop English
    stop words.

    Args:
        review: Raw review text, possibly containing HTML.

    Returns:
        The remaining words joined by single spaces (may be empty).
    """
    # Strip the HTML tags and keep only the text content.
    review_text = BeautifulSoup(review, "lxml").get_text()
    # Keep letters only; digits and punctuation become word separators.
    review_text = _NON_LETTER_RE.sub(" ", review_text)
    # The stop-word set is built once instead of on every call, because this
    # function is applied to every review in the data set.
    stops = _english_stopwords()
    return " ".join(w for w in review_text.lower().split() if w not in stops)
# Clean the review column of the training frame and of the test frame in place.
for frame in (new_train, df_test):
    frame["review"] = frame["review"].apply(review_to_words)

# Sanity check: number of missing values per column in each frame.
train_missing = new_train.isnull().sum()
test_missing = df_test.isnull().sum()
print(train_missing, test_missing)
# Build the model inputs.
y = new_train['sentiment'].values
list_sentences_train = new_train["review"]
list_sentences_test = df_test["review"]

# Text preprocessing, see https://blog.csdn.net/lovebyz/article/details/77712003
# Vocabulary size: shared by the tokenizer here and by the Embedding layer of
# the model, so the two cannot drift apart.
max_features = 6000
# The tokenizer counts the words of the training texts and builds a word index
# ordered by frequency; texts are then encoded as sequences of these indices.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

# Every sequence is brought to exactly `maxlen` entries. With the default
# arguments of pad_sequences, longer sequences are truncated at the FRONT and
# shorter ones are zero-padded at the FRONT (padding='pre', truncating='pre').
maxlen = 370
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)
# Hyperparameters of the network and of the training run.
embed_size = 128
batch_size = 32
epochs = 6

# The network is a plain chain of layers: embedding -> LSTM (full sequence
# output) -> max over the time axis -> dense head with a sigmoid output.
inp = Input(shape=(maxlen, ))
layer_chain = [
    Embedding(max_features, embed_size),  # https://www.cnblogs.com/fujian-code/p/8967340.html
    LSTM(60, return_sequences=True, name='lstm_layer'),
    GlobalMaxPool1D(),
    Dropout(0.1),
    Dense(50, activation="relu"),
    Dropout(0.1),
    Dense(1, activation="sigmoid"),
]
x = inp
for layer in layer_chain:
    x = layer(x)

model = Model(inputs=inp, outputs=x)
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train with 20% of the training data held out for validation, then persist
# the trained model to disk.
model.fit(
    X_t,
    y,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.2,
)
model.save('data/model1.h5')
from sklearn.metrics import f1_score, confusion_matrix

# Load the model back from the file it was saved to after training, so the
# evaluation below is for the model trained by this script.
model = load_model('data/model1.h5')

# Sigmoid outputs above 0.5 count as positive. The (n, 1) prediction array is
# flattened to a 1-D array of 0/1 integers so it matches the label vector.
prediction = model.predict(X_te)
y_pred = (prediction > 0.5).astype(int).ravel()

# Derive reference labels from the test ids: the ids still carry their
# surrounding double quotes (the file is read with quoting=3), and the number
# after the underscore -- presumably the review's rating -- is mapped to
# 1 when it is >= 5 and to 0 otherwise.
df_test["sentiment"] = df_test["id"].map(lambda x: 1 if int(x.strip('"').split("_")[1]) >= 5 else 0)
y_test = df_test["sentiment"]

# scikit-learn metrics take (y_true, y_pred) in that order; with this order
# the rows of the confusion matrix are the true classes and the columns the
# predicted ones.
print('F1-score: {0}'.format(f1_score(y_test, y_pred)))
print('Confusion matrix:')
# Printed explicitly: a bare expression shows nothing when run as a script.
print(confusion_matrix(y_test, y_pred))

# Write the submission file.
# NOTE: at this point df_test["sentiment"] holds the id-derived labels from
# above, not the model predictions in y_pred.
# df_test = df_test[['id','sentiment']]
# df_test.to_csv("submission.csv",index=False)