import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from gensim.models.word2vec import Word2Vec
import tqdm
import tensorflow as tf
print(tf.__version__)
from tensorflow import keras
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential as Sequential
import nltk
from nltk.corpus import stopwords
2.0.0
# set config
BATCH_SIZE = 64
SEQUENCE_LENGTH = 22
WORD_SIZE = 200
Experiment log (validation accuracy):
1: w2v + 1-layer LSTM: 0.77
2: w2v + 2-layer LSTM: 0.77
3: w2v + 4-layer LSTM: 0.77
4: w2v + 1-layer bi-LSTM: 0.76
5: w2v + 4-layer bi-LSTM: 0.77
6: w2v + 4-layer bi-LSTM + dropout(0.5, between layers): 0.77
7: w2v + 4-layer bi-LSTM + dropout(0.5, between layers): 0.77
8: w2v + 8-layer bi-LSTM + dropout(0.5, between layers): 0.76
9: w2v + 3-layer bi-LSTM + dropout(0.4, before the input gate): 0.76
10: w2v + 3-layer bi-LSTM + dropout(0.3, before the input gate): 0.76
11: w2v + 3-layer bi-LSTM + dropout(0.3, before the input gate) + batch size 32: 0.76
12: w2v + 3-layer bi-LSTM + dropout(0.1, before the input gate) + batch size 64: 0.76
Summary: bidirectional layers do not help on this dataset (see the sketch below).
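For reference, the bi-LSTM runs in the log wrap each LSTM layer in keras.layers.Bidirectional. A minimal sketch of the experiment-9 configuration (3 bi-LSTM layers, dropout 0.4 applied to the layer inputs), assuming the embedding_matrix / SEQUENCE_LENGTH / WORD_SIZE setup built later in this notebook:

# Sketch of the bi-LSTM variant from the experiment log (not the final model used below).
# Assumes embedding_matrix, SEQUENCE_LENGTH and WORD_SIZE are defined as in the later cells.
def bilstm(SEQUENCE_LENGTH, WORD_SIZE):
    model = keras.Sequential()
    model.add(
        keras.layers.Embedding(
            input_dim=len(embedding_matrix),
            output_dim=WORD_SIZE,
            weights=[embedding_matrix],
            input_length=SEQUENCE_LENGTH,
            trainable=False,
        )
    )
    # 3 bidirectional LSTM layers; dropout=0.4 drops part of each layer's inputs
    for _ in range(2):
        model.add(keras.layers.Bidirectional(
            keras.layers.LSTM(64, activation="tanh", return_sequences=True, dropout=0.4)))
    model.add(keras.layers.Bidirectional(
        keras.layers.LSTM(64, activation="tanh", dropout=0.4)))
    model.add(keras.layers.Dense(1, activation="sigmoid"))
    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model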
#load data
path_home = "/home/lowry/pro/kaggle_tweets/kaggle_tweets_emotion"
data_train = pd.read_csv(os.path.join(path_home,"train.csv"))
data_test = pd.read_csv(os.path.join(path_home,"test.csv"))
data_submit = pd.read_csv(os.path.join(path_home,"sample_submission.csv"))
data_train.head()
|   | id | keyword | location | text | target |
|---|----|---------|----------|------|--------|
| 0 | 1 | NaN | NaN | Our Deeds are the Reason of this #earthquake M... | 1 |
| 1 | 4 | NaN | NaN | Forest fire near La Ronge Sask. Canada | 1 |
| 2 | 5 | NaN | NaN | All residents asked to 'shelter in place' are ... | 1 |
| 3 | 6 | NaN | NaN | 13,000 people receive #wildfires evacuation or... | 1 |
| 4 | 7 | NaN | NaN | Just got sent this photo from Ruby #Alaska as ... | 1 |
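Before cleaning, a quick look at the label balance (a small sketch; target = 1 marks disaster tweets in this competition):

# Sketch: share of each class in the training labels
print(data_train["target"].value_counts(normalize=True))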
# data cleaning
stopwords_english = stopwords.words("english")
import re
def cleanword(s):
    s = s.lower()
    # remove English stopwords
    s = " ".join([word for word in s.split(" ") if word not in stopwords_english])
    # remove URLs
    for deletStr in re.findall(r"http\S*", s):
        if deletStr != "":
            s = s.replace(deletStr, " ")
    # remove @mentions
    for deletStr in re.findall(r"@\S*", s):
        if deletStr != "":
            s = s.replace(deletStr, " ")
    # remove digits
    for deletStr in re.findall(r"\d+", s):
        s = s.replace(deletStr, " ")
    # remove mojibake sequences that start with \x89
    for deletStr in re.findall(r"\x89\S*", s):
        if deletStr != "":
            s = s.replace(deletStr[:5], " ")
    # replace line breaks and punctuation with spaces
    for ch in ["\n", ",", "?", "...", ".", "[", "]", "!", ":", "-",
               "#", "|", "(", ")", ";", "=", ">", "<", "/"]:
        s = s.replace(ch, " ")
    # collapse the consecutive spaces left by the replacements above
    s_new_list = [word for word in s.split(" ") if word != ""]
    s_new = " ".join(s_new_list)
    return s_new.strip()
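A quick illustration of what cleanword removes, on a made-up tweet (the example string is hypothetical):

# Hypothetical example, just to show the cleaning steps
sample = "Forest fire near La Ronge! http://t.co/abc @user 13,000 evacuated..."
print(cleanword(sample))
# expected output along the lines of: forest fire near la ronge evacuated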
data_test['text'] = data_test['text'].apply(cleanword)
data_train['text'] = data_train['text'].apply(cleanword)
data_train_copy = data_train.copy()
data_train_copy["text_length"] = data_train_copy["text"].apply(lambda s : len(s.split(" ")))
data_test_copy = data_test.copy()
data_test_copy["text_length"] = data_test_copy["text"].apply(lambda s: len(s.split(" ")))
print("train_text_words_length(95%) = ",np.percentile(data_train_copy["text_length"].tolist(),95))
print("test_text_words_length(95%) = ",np.percentile(data_test_copy["text_length"].tolist(),95))
train_text_words_length(95%) = 22.0
test_text_words_length(95%) = 22.0
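matplotlib is imported above but not otherwise used here; a quick histogram (a sketch) makes the choice of SEQUENCE_LENGTH = 22 (the 95th percentile) concrete:

# Sketch: distribution of cleaned tweet lengths (in words) for the training set
plt.hist(data_train_copy["text_length"], bins=40)
plt.axvline(22, color="red", linestyle="--", label="SEQUENCE_LENGTH = 22 (95th percentile)")
plt.xlabel("words per tweet")
plt.ylabel("count")
plt.legend()
plt.show()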
# print(data_train_copy)
# print(data_train_copy.info(memory_usage=True))
#train word2vec model
# sentences = []
# for line in data_train['text'].values:
#     sentences.append(list(line.split(" ")))
# for line in data_test['text'].values:
#     sentences.append(list(line.split(" ")))
# print(len(sentences))
# path_model_w2v = os.path.join(path_home,"w2v_model.model")
# model_w2v = Word2Vec(
#     sentences=sentences,
#     size=200,         # vector dimensionality
#     alpha=0.025,      # default learning rate
#     window=5,         # default context window
#     min_count=2,      # ignore words seen fewer than 2 times (2 or 3 both reasonable)
#     sample=0.001,     # subsampling threshold for frequent words
#     seed=2018,
#     workers=11,       # worker threads
#     min_alpha=0.0001,
#     sg=0,             # CBOW
#     hs=0,             # use negative sampling instead of hierarchical softmax
#     negative=5,       # number of negative samples
#     ns_exponent=0.75,
#     cbow_mean=1,      # average (rather than sum) the context vectors
#     iter=10           # training epochs; 10 to 15 is reasonable
# )
# model_w2v.save(path_model_w2v)
# test:
# model_w2v = Word2Vec.load(path_model_w2v)
# print(model_w2v)
# model_w2v.wv["our"].shape
# for word, wordInfo in model_w2v.wv.vocab.items():
#     print("word = ", word)
#     print("wordInfo = ", wordInfo)
#     break
10876
#build word2vec map
path_model_w2v = os.path.join(path_home, "w2v_model.model")
model_w2v = Word2Vec.load(path_model_w2v)
vocab_list = [word for word, Vocab in model_w2v.wv.vocab.items()]  # all words in the w2v vocabulary
word_index = {" ": 0}  # [word : index] dict, used later to tokenize the corpus (map words to indices)
word_vector = {}       # [word : vector] dict
# Embedding matrix holding all vectors. Note the extra first row of zeros, reserved for padding:
# rows = vocabulary size + 1 (e.g. 10000 + 1), columns = word-vector dimensionality (here 200).
embedding_matrix = np.zeros((len(vocab_list) + 1, model_w2v.vector_size))  # each row is one word vector
print(embedding_matrix.shape)
# fill the dictionaries and the embedding matrix
for i in range(len(vocab_list)):
    word = vocab_list[i]                          # the i-th word
    word_index[word] = i + 1                      # word -> index
    word_vector[word] = model_w2v.wv[word]        # word -> vector
    embedding_matrix[i + 1] = model_w2v.wv[word]  # index -> vector
(8010, 200)
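A small sanity check (a sketch) that the mappings agree: the embedding row selected via word_index should equal the Word2Vec vector for that word, and row 0 should stay all zeros for padding.

# Sanity check (sketch): word_index and embedding_matrix agree with the w2v model
some_word = vocab_list[0]
assert np.allclose(embedding_matrix[word_index[some_word]], model_w2v.wv[some_word])
assert not embedding_matrix[0].any()  # row 0 is reserved for padding and stays all zeros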
def tokenizer(texts, word_index):
    data = []
    for sentence in texts:                        # iterate over each sentence
        new_txt = []
        for word in sentence.split(" "):
            try:
                new_txt.append(word_index[word])  # map word to its index
            except KeyError:
                pass                              # skip words not in the w2v vocabulary
        new_txt = new_txt[:SEQUENCE_LENGTH]       # truncate to SEQUENCE_LENGTH
        padding_length = SEQUENCE_LENGTH - len(new_txt)
        if padding_length > 0:
            new_txt += [0] * padding_length       # pad with index 0 (the all-zero embedding row)
        data.append(new_txt)
    return np.array(data)
X_train_tokenizer = tokenizer(data_train["text"].values, word_index)
X_test_tokenizer = tokenizer(data_test["text"].values, word_index)
x_train, x_valid, y_train, y_valid = train_test_split(X_train_tokenizer, data_train["target"].values)
x_train.shape
(5709, 22)
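To check that tokenization round-trips, an index-to-word map can decode one padded sample back into words (a sketch; index 0 is the padding token):

# Sketch: decode the first training sample back into words
index_word = {idx: w for w, idx in word_index.items()}
print(x_train[0])
print(" ".join(index_word[idx] for idx in x_train[0] if idx != 0))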
def lstm(BATCH_SIZE, SEQUENCE_LENGTH, WORD_SIZE):
    # BATCH_SIZE is not used here; it is passed to model.fit below
    model = keras.Sequential()
    # input layer: frozen pretrained embeddings
    model.add(
        keras.layers.Embedding(
            input_dim=len(embedding_matrix),
            output_dim=WORD_SIZE,
            weights=[embedding_matrix],  # pretrained word vectors
            input_length=SEQUENCE_LENGTH,
            trainable=False              # do not update the embeddings during training
        )
    )
    # hidden layers
    for i in range(3):
        # model.add(keras.layers.LSTM(64, activation='tanh', return_sequences=True, dropout=0.1))
        model.add(keras.layers.LSTM(64, activation='tanh', return_sequences=True))
        # model.add(keras.layers.Bidirectional(keras.layers.LSTM(64, activation='tanh', return_sequences=True, dropout=0.1)))
        # model.add(keras.layers.Dropout(0.5))
        # model.add(keras.layers.BatchNormalization())
    model.add(keras.layers.LSTM(64, activation='tanh'))
    # output layer
    model.add(keras.layers.Dense(1, activation="sigmoid"))
    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["accuracy"]
    )
    return model
model = lstm(BATCH_SIZE,SEQUENCE_LENGTH,WORD_SIZE)
model.summary()
Model: "sequential_21"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_21 (Embedding) (None, 22, 200) 1602000
_________________________________________________________________
lstm_102 (LSTM) (None, 22, 64) 67840
_________________________________________________________________
lstm_103 (LSTM) (None, 22, 64) 33024
_________________________________________________________________
lstm_104 (LSTM) (None, 22, 64) 33024
_________________________________________________________________
lstm_105 (LSTM) (None, 64) 33024
_________________________________________________________________
dense_21 (Dense) (None, 1) 65
=================================================================
Total params: 1,768,977
Trainable params: 166,977
Non-trainable params: 1,602,000
_________________________________________________________________
history = model.fit(
x_train,
y_train,
validation_data=(x_valid,y_valid),
epochs = 60,
batch_size= BATCH_SIZE,
)
Train on 5709 samples, validate on 1904 samples
Epoch 1/60
5709/5709 [==============================] - 10s 2ms/sample - loss: 0.6199 - accuracy: 0.6614 - val_loss: 0.5872 - val_accuracy: 0.6949
Epoch 2/60
5709/5709 [==============================] - 1s 249us/sample - loss: 0.6039 - accuracy: 0.6744 - val_loss: 0.6035 - val_accuracy: 0.6702
Epoch 3/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5763 - accuracy: 0.7043 - val_loss: 0.5729 - val_accuracy: 0.7048
Epoch 4/60
5709/5709 [==============================] - 1s 249us/sample - loss: 0.5697 - accuracy: 0.7084 - val_loss: 0.5840 - val_accuracy: 0.6843
Epoch 5/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5733 - accuracy: 0.7057 - val_loss: 0.5598 - val_accuracy: 0.7159
Epoch 6/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.5620 - accuracy: 0.7183 - val_loss: 0.5613 - val_accuracy: 0.7164
Epoch 7/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5538 - accuracy: 0.7208 - val_loss: 0.5454 - val_accuracy: 0.7227
Epoch 8/60
5709/5709 [==============================] - 1s 249us/sample - loss: 0.5453 - accuracy: 0.7334 - val_loss: 0.5382 - val_accuracy: 0.7327
Epoch 9/60
5709/5709 [==============================] - 1s 249us/sample - loss: 0.5509 - accuracy: 0.7255 - val_loss: 0.5407 - val_accuracy: 0.7311
Epoch 10/60
5709/5709 [==============================] - 1s 252us/sample - loss: 0.5544 - accuracy: 0.7194 - val_loss: 0.5543 - val_accuracy: 0.7337
Epoch 11/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.5406 - accuracy: 0.7274 - val_loss: 0.5313 - val_accuracy: 0.7390
Epoch 12/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.5377 - accuracy: 0.7325 - val_loss: 0.5391 - val_accuracy: 0.7321
Epoch 13/60
5709/5709 [==============================] - 1s 250us/sample - loss: 0.5265 - accuracy: 0.7465 - val_loss: 0.5458 - val_accuracy: 0.7321
Epoch 14/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5335 - accuracy: 0.7352 - val_loss: 0.5556 - val_accuracy: 0.7269
Epoch 15/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5299 - accuracy: 0.7437 - val_loss: 0.5301 - val_accuracy: 0.7405
Epoch 16/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.5247 - accuracy: 0.7462 - val_loss: 0.5677 - val_accuracy: 0.7274
Epoch 17/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5229 - accuracy: 0.7485 - val_loss: 0.5201 - val_accuracy: 0.7426
Epoch 18/60
5709/5709 [==============================] - 1s 245us/sample - loss: 0.5315 - accuracy: 0.7359 - val_loss: 0.5246 - val_accuracy: 0.7463
Epoch 19/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.5240 - accuracy: 0.7437 - val_loss: 0.5345 - val_accuracy: 0.7316
Epoch 20/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.5256 - accuracy: 0.7399 - val_loss: 0.5279 - val_accuracy: 0.7468
Epoch 21/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.5160 - accuracy: 0.7551 - val_loss: 0.5166 - val_accuracy: 0.7521
Epoch 22/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5224 - accuracy: 0.7462 - val_loss: 0.5210 - val_accuracy: 0.7521
Epoch 23/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.5236 - accuracy: 0.7444 - val_loss: 0.5187 - val_accuracy: 0.7500
Epoch 24/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5285 - accuracy: 0.7409 - val_loss: 0.5789 - val_accuracy: 0.7447
Epoch 25/60
5709/5709 [==============================] - 1s 245us/sample - loss: 0.5236 - accuracy: 0.7493 - val_loss: 0.5223 - val_accuracy: 0.7484
Epoch 26/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.5126 - accuracy: 0.7511 - val_loss: 0.5415 - val_accuracy: 0.7337
Epoch 27/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5133 - accuracy: 0.7499 - val_loss: 0.5087 - val_accuracy: 0.7558
Epoch 28/60
5709/5709 [==============================] - 1s 250us/sample - loss: 0.5091 - accuracy: 0.7502 - val_loss: 0.5145 - val_accuracy: 0.7489
Epoch 29/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.5125 - accuracy: 0.7493 - val_loss: 0.5141 - val_accuracy: 0.7505
Epoch 30/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5104 - accuracy: 0.7514 - val_loss: 0.5357 - val_accuracy: 0.7285
Epoch 31/60
5709/5709 [==============================] - 1s 245us/sample - loss: 0.5051 - accuracy: 0.7500 - val_loss: 0.5242 - val_accuracy: 0.7384
Epoch 32/60
5709/5709 [==============================] - 1s 250us/sample - loss: 0.5008 - accuracy: 0.7588 - val_loss: 0.5083 - val_accuracy: 0.7526
Epoch 33/60
5709/5709 [==============================] - 1s 249us/sample - loss: 0.5096 - accuracy: 0.7527 - val_loss: 0.5223 - val_accuracy: 0.7463
Epoch 34/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.4977 - accuracy: 0.7581 - val_loss: 0.5096 - val_accuracy: 0.7563
Epoch 35/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.5061 - accuracy: 0.7555 - val_loss: 0.5132 - val_accuracy: 0.7574
Epoch 36/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.5076 - accuracy: 0.7544 - val_loss: 0.5090 - val_accuracy: 0.7558
Epoch 37/60
5709/5709 [==============================] - 1s 245us/sample - loss: 0.4993 - accuracy: 0.7613 - val_loss: 0.5265 - val_accuracy: 0.7468
Epoch 38/60
5709/5709 [==============================] - 1s 244us/sample - loss: 0.4971 - accuracy: 0.7630 - val_loss: 0.5162 - val_accuracy: 0.7579
Epoch 39/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.4979 - accuracy: 0.7623 - val_loss: 0.5178 - val_accuracy: 0.7426
Epoch 40/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.4977 - accuracy: 0.7586 - val_loss: 0.5048 - val_accuracy: 0.7537
Epoch 41/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.4993 - accuracy: 0.7642 - val_loss: 0.5071 - val_accuracy: 0.7516
Epoch 42/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.4915 - accuracy: 0.7628 - val_loss: 0.5114 - val_accuracy: 0.7600
Epoch 43/60
5709/5709 [==============================] - 1s 245us/sample - loss: 0.4857 - accuracy: 0.7705 - val_loss: 0.5226 - val_accuracy: 0.7558
Epoch 44/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.4879 - accuracy: 0.7676 - val_loss: 0.5074 - val_accuracy: 0.7647
Epoch 45/60
5709/5709 [==============================] - 1s 244us/sample - loss: 0.4837 - accuracy: 0.7709 - val_loss: 0.5122 - val_accuracy: 0.7574
Epoch 46/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.4837 - accuracy: 0.7681 - val_loss: 0.5023 - val_accuracy: 0.7558
Epoch 47/60
5709/5709 [==============================] - 1s 244us/sample - loss: 0.4820 - accuracy: 0.7688 - val_loss: 0.5541 - val_accuracy: 0.7279
Epoch 48/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.4844 - accuracy: 0.7665 - val_loss: 0.5059 - val_accuracy: 0.7521
Epoch 49/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.4790 - accuracy: 0.7739 - val_loss: 0.5055 - val_accuracy: 0.7621
Epoch 50/60
5709/5709 [==============================] - 1s 245us/sample - loss: 0.4820 - accuracy: 0.7716 - val_loss: 0.5064 - val_accuracy: 0.7579
Epoch 51/60
5709/5709 [==============================] - 1s 245us/sample - loss: 0.4840 - accuracy: 0.7672 - val_loss: 0.5160 - val_accuracy: 0.7521
Epoch 52/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.4745 - accuracy: 0.7733 - val_loss: 0.5221 - val_accuracy: 0.7584
Epoch 53/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.4749 - accuracy: 0.7758 - val_loss: 0.5186 - val_accuracy: 0.7474
Epoch 54/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.4699 - accuracy: 0.7775 - val_loss: 0.5061 - val_accuracy: 0.7568
Epoch 55/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.4683 - accuracy: 0.7767 - val_loss: 0.5178 - val_accuracy: 0.7432
Epoch 56/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.4728 - accuracy: 0.7756 - val_loss: 0.5245 - val_accuracy: 0.7516
Epoch 57/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.4718 - accuracy: 0.7730 - val_loss: 0.5179 - val_accuracy: 0.7495
Epoch 58/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.4681 - accuracy: 0.7781 - val_loss: 0.5209 - val_accuracy: 0.7547
Epoch 59/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.4612 - accuracy: 0.7847 - val_loss: 0.5193 - val_accuracy: 0.7521
Epoch 60/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.4562 - accuracy: 0.7886 - val_loss: 0.5204 - val_accuracy: 0.7563
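The log shows training accuracy still climbing while validation accuracy plateaus around 0.75 from roughly epoch 20 on, so an early-stopping variant of the fit call (a sketch; the run above did not use it) would stop sooner and keep the best weights:

# Sketch: same fit call with early stopping on validation loss (not used in the run above)
early_stop = keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)
history_es = model.fit(
    x_train, y_train,
    validation_data=(x_valid, y_valid),
    epochs=60,
    batch_size=BATCH_SIZE,
    callbacks=[early_stop],
)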
def draw(history):
    # plot the training curves (loss/accuracy and val_loss/val_accuracy)
    data = pd.DataFrame(history.history)
    data.plot()
    plt.show()
draw(history)
[Figure: training history curves (loss, accuracy, val_loss, val_accuracy over 60 epochs) — output_16_0.png]
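data_submit (sample_submission.csv) is loaded at the top but never filled in this section. A sketch of producing test predictions and writing a submission file; the 0.5 threshold and the target column name are assumptions based on the competition's sample submission format:

# Sketch: predict on the tokenized test set and fill the sample submission
# (0.5 threshold on the sigmoid output is an assumption)
pred_prob = model.predict(X_test_tokenizer, batch_size=BATCH_SIZE)
data_submit["target"] = (pred_prob > 0.5).astype(int).ravel()
data_submit.to_csv(os.path.join(path_home, "submission.csv"), index=False)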