Kaggle_tweet_emotion_w2v_rnn

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from gensim.models.word2vec import Word2Vec
import tqdm
import tensorflow as tf
print(tf.__version__)
from tensorflow import keras
from sklearn.model_selection import train_test_split
from tensorflow.keras import Sequential as Sequential
import nltk
from nltk.corpus import stopwords

2.0.0
# Global hyperparameters shared by the tokenizer, the model builder and training.
BATCH_SIZE = 64  # mini-batch size handed to model.fit
SEQUENCE_LENGTH = 22  # tokens kept per tweet (truncate/pad); 22.0 is the printed 95th-percentile length
WORD_SIZE = 200  # embedding width; matches the second dimension of the loaded word2vec matrix

Experiment log (accuracy on the validation data)

1:w2v + 1 layer lstm : 0.77(valid data)

2:w2v + 2 layer lstm : 0.77

3:w2v + 4 layer lstm : 0.77

4:w2v + 1 layer bi-lstm : 0.76

5:w2v + 4 layer bi-lstm : 0.77

6:w2v + 4 layer bi-lstm + dropout(0.5 层间): 0.77

7:w2v + 4 layer bi-lstm + dropout(0.5 层间): 0.77

8:w2v + 8 layer bi-lstm + dropout(0.5 层间): 0.76

9:w2v + 3 layer bi-lstm + dropout(0.4 input gate 之前):0.76

10:w2v + 3 layer bi-lstm + dropout(0.3 input gate 之前):0.76

11:w2v + 3 layer bi-lstm + dropout(0.3 input gate 之前) + batch = 32(加倍batch):0.76

12:w2v + 3 layer bi-lstm + dropout(0.1 input gate 之前) + batch = 64 :0.76

小结:双向(bidirectional)LSTM 在这个案例中无效(验证集准确率 0.76–0.77,与单向 LSTM 的 0.77 基本持平)

# Load the three competition CSV files from the local project directory.
path_home = "/home/lowry/pro/kaggle_tweets/kaggle_tweets_emotion"
data_train, data_test, data_submit = (
    pd.read_csv(os.path.join(path_home, csv_name))
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)
# Notebook display of the first training rows.
data_train.head()
idkeywordlocationtexttarget
01NaNNaNOur Deeds are the Reason of this #earthquake M...1
14NaNNaNForest fire near La Ronge Sask. Canada1
25NaNNaNAll residents asked to 'shelter in place' are ...1
36NaNNaN13,000 people receive #wildfires evacuation or...1
47NaNNaNJust got sent this photo from Ruby #Alaska as ...1
# data cleaning

stopwords_english = stopwords.words("english")

import re
# Pre-compiled patterns and the punctuation table used by cleanword().
_URL_PATTERN = re.compile(r"http\S*")
_MENTION_PATTERN = re.compile(r"@\S*")
_DIGITS_PATTERN = re.compile(r"\d+")
# The \x89 marker plus at most four following non-space characters, i.e. the
# same five-character window the original implementation removed.
# NOTE(review): the window can swallow up to two ordinary letters that follow
# a three-character mojibake sequence -- confirm that this is intended.
_MOJIBAKE_PATTERN = re.compile(r"\x89\S{0,4}")
_PUNCTUATION_TO_SPACE = str.maketrans(
    {char: " " for char in "\n,?.[]!:-#|();=></"}
)


def cleanword(s, stop_words=None):
    """Normalise one tweet into lower-case, space-separated words.

    Steps, in order: lower-case the text, drop stop words, remove URLs
    (``http...``), @-mentions, digit runs and ``\\x89`` mojibake, turn the
    listed punctuation characters into spaces, then collapse every run of
    whitespace into a single space and strip both ends.

    Args:
        s: The raw tweet text.
        stop_words: Optional container of lower-case words to drop. When
            omitted, the module-level ``stopwords_english`` list is used.

    Returns:
        The cleaned text; it may be an empty string.
    """
    if stop_words is None:
        stop_words = stopwords_english

    s = s.lower()
    # Stop words are matched against space-delimited tokens *before*
    # punctuation is stripped, so a token such as "the," is kept.
    s = " ".join(word for word in s.split(" ") if word not in stop_words)

    # Substitute by pattern instead of findall + str.replace: replacing the
    # literal text of one match (e.g. "@bob") used to damage longer matches
    # that share the prefix (e.g. "@bobby" -> "by").
    for pattern in (_URL_PATTERN, _MENTION_PATTERN, _DIGITS_PATTERN,
                    _MOJIBAKE_PATTERN):
        s = pattern.sub(" ", s)

    s = s.translate(_PUNCTUATION_TO_SPACE)

    # split() without arguments discards empty tokens, which really collapses
    # consecutive whitespace (the previous `word != " "` filter never matched
    # the empty strings produced by split(" ")) and also strips both ends.
    return " ".join(s.split())
# Clean the tweet text of both splits in place.
data_test['text'] = data_test['text'].apply(cleanword)
data_train['text'] = data_train['text'].apply(cleanword)

# Count the space-separated tokens per tweet to pick a sequence length.
data_train_copy = data_train.copy()
data_train_copy["text_length"] = data_train_copy["text"].apply(lambda s : len(s.split(" ")))
# Fixed: the test statistics used to be computed from a copy of the training
# frame, so both printed percentiles described the training set. The recorded
# output below predates this fix.
data_test_copy = data_test.copy()
data_test_copy["text_length"] = data_test_copy["text"].apply(lambda s : len(s.split(" ")))
print("train_text_words_length(95%) = ",np.percentile(data_train_copy["text_length"].tolist(),95))
print("test_text_words_length(95%) = ",np.percentile(data_test_copy["text_length"].tolist(),95))
train_text_words_length(95%) =  22.0
test_text_words_length(95%) =  22.0
# print(data_train_copy)
# print(data_train_copy.info(memory_usage=True))
1
#train word2vec model 


# sentences = []
# for line in data_train['text'].values:
#     sentences.append(list(line.split(" ")))
# for line in data_test['text'].values:
#     sentences.append(list(line.split(" ")))
# print(len(sentences))
# path_model_w2v = os.path.join(path_home,"w2v_model.model")
# model_w2v = Word2Vec(
#         sentences=sentences,
#         size=200,#维度
#         alpha=0.025, #默认
#         window=5, #默认
#         min_count=2,#2,3
#         sample=0.001,#
#         seed=2018, #
#         workers=11, #线程
#         min_alpha=0.0001, 
#         sg=0, #cbow
#         hs=0, #负采样
#         negative=5,#负采样个数
#         ns_exponent=0.75, 
#         cbow_mean=1,#求和再取平均
#         iter=10 #10到15
#         )
# model_w2v.save(path_model_w2v)




# test: 
# model_w2v = Word2Vec.load(path_model_w2v)
# print(model_w2v)
# model_w2v.wv["our"].shape
# for word , wordInfo in model_w2v.wv.vocab.items():
#     print("word = ",word)
#     print("wordInfo = ",wordInfo )
#     break

10876
# Build the lookup tables that connect words, token ids and word vectors.
path_model_w2v = os.path.join(path_home,"w2v_model.model")
model_w2v = Word2Vec.load(path_model_w2v)
vocab_list = list(model_w2v.wv.vocab)  # every word in the model's vocabulary
word_index = {" ": 0}  # word -> token id; id 0 is reserved for padding
word_vector = {}  # word -> vector
# One row per token id and one column per vector dimension. Row 0 stays all
# zeros and serves as the padding vector, hence the extra row.
embedding_matrix = np.zeros((len(vocab_list) + 1, model_w2v.vector_size))
print(embedding_matrix.shape)

# Fill the two dictionaries and the matrix; token ids start at 1.
for token_id, word in enumerate(vocab_list, start=1):
    vector = model_w2v.wv[word]
    word_index[word] = token_id  # word -> token id
    word_vector[word] = vector  # word -> vector
    embedding_matrix[token_id] = vector  # token id -> vector
(8010, 200)
def tokenizer(texts, word_index, sequence_length=None):
    """Turn sentences into fixed-length rows of token ids.

    Each sentence is split on single spaces. Words missing from
    ``word_index`` are dropped, the remaining ids are truncated to
    ``sequence_length`` and right-padded with 0.

    Args:
        texts: Iterable of space-separated sentences.
        word_index: Mapping from word to integer token id.
        sequence_length: Number of ids per row. When omitted, the
            module-level ``SEQUENCE_LENGTH`` is used.

    Returns:
        An integer array of shape ``(len(texts), sequence_length)``; for
        empty input the shape is ``(0, sequence_length)``.
    """
    if sequence_length is None:
        sequence_length = SEQUENCE_LENGTH

    data = []
    for sentence in texts:
        # An explicit membership test replaces the former bare `except: pass`,
        # which hid every error, not only the missing-word KeyError.
        token_ids = [
            word_index[word] for word in sentence.split(" ") if word in word_index
        ]
        token_ids = token_ids[:sequence_length]
        # A non-positive pad count multiplies to an empty list, so full rows
        # are left unchanged.
        token_ids += [0] * (sequence_length - len(token_ids))
        data.append(token_ids)

    if not data:
        # np.array([]) would be one-dimensional and float; keep the 2-D layout.
        return np.empty((0, sequence_length), dtype=int)
    return np.array(data)

# Convert the cleaned tweets of both splits into padded token-id rows.
X_train_tokenizer = tokenizer(data_train["text"].values,word_index)
X_test_tokenizer = tokenizer(data_test["text"].values,word_index)


# Hold out part of the labelled data for validation (the recorded run shows
# 5709 training and 1904 validation samples).
# NOTE(review): no random_state is passed, so the split presumably differs
# between runs, which makes the logged accuracies hard to compare -- confirm.
x_train ,x_valid , y_train , y_valid = train_test_split(X_train_tokenizer,data_train["target"].values)
x_train.shape
(5709, 22)
def lstm(BATCH_SIZE,SEQUENCE_LENGTH,WORD_SIZE):
    """Build and compile the stacked-LSTM binary classifier.

    Args:
        BATCH_SIZE: Accepted for call compatibility; not used while building.
        SEQUENCE_LENGTH: Number of token ids per input row.
        WORD_SIZE: Width of the embedding vectors.

    Returns:
        A compiled ``keras.Sequential`` model ending in one sigmoid unit.
    """
    # Input layer: a frozen lookup table initialised from the module-level
    # pretrained `embedding_matrix`.
    embedding_layer = keras.layers.Embedding(
        input_dim=len(embedding_matrix),
        output_dim=WORD_SIZE,
        weights=[embedding_matrix],
        input_length=SEQUENCE_LENGTH,
        trainable=False,
    )

    # Hidden layers: three sequence-returning LSTMs followed by a final LSTM.
    # Earlier variants of this stack tried LSTM dropout, Bidirectional
    # wrappers, Dropout and BatchNormalization layers.
    recurrent_layers = [
        keras.layers.LSTM(64, activation='tanh', return_sequences=True)
        for _ in range(3)
    ]
    recurrent_layers.append(keras.layers.LSTM(64, activation='tanh'))

    # Output layer: a single sigmoid unit for the binary target.
    output_layer = keras.layers.Dense(1, activation="sigmoid")

    model = keras.Sequential()
    for layer in (embedding_layer, *recurrent_layers, output_layer):
        model.add(layer)

    model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
    return model
# Build the model from the global hyperparameters and print its layer summary.
model = lstm(BATCH_SIZE,SEQUENCE_LENGTH,WORD_SIZE)
model.summary()
Model: "sequential_21"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding_21 (Embedding)     (None, 22, 200)           1602000   
_________________________________________________________________
lstm_102 (LSTM)              (None, 22, 64)            67840     
_________________________________________________________________
lstm_103 (LSTM)              (None, 22, 64)            33024     
_________________________________________________________________
lstm_104 (LSTM)              (None, 22, 64)            33024     
_________________________________________________________________
lstm_105 (LSTM)              (None, 64)                33024     
_________________________________________________________________
dense_21 (Dense)             (None, 1)                 65        
=================================================================
Total params: 1,768,977
Trainable params: 166,977
Non-trainable params: 1,602,000
_________________________________________________________________
# Train for a fixed 60 epochs and evaluate on the held-out split every epoch.
# NOTE(review): in the recorded run val_loss is lowest at epoch 46 (0.5023)
# while the training loss keeps falling to 0.4562, which looks like the start
# of overfitting -- consider early stopping or checkpointing; TODO confirm.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_valid,y_valid),
    epochs = 60,
    batch_size= BATCH_SIZE,
)
Train on 5709 samples, validate on 1904 samples
Epoch 1/60
5709/5709 [==============================] - 10s 2ms/sample - loss: 0.6199 - accuracy: 0.6614 - val_loss: 0.5872 - val_accuracy: 0.6949
Epoch 2/60
5709/5709 [==============================] - 1s 249us/sample - loss: 0.6039 - accuracy: 0.6744 - val_loss: 0.6035 - val_accuracy: 0.6702
Epoch 3/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5763 - accuracy: 0.7043 - val_loss: 0.5729 - val_accuracy: 0.7048
Epoch 4/60
5709/5709 [==============================] - 1s 249us/sample - loss: 0.5697 - accuracy: 0.7084 - val_loss: 0.5840 - val_accuracy: 0.6843
Epoch 5/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5733 - accuracy: 0.7057 - val_loss: 0.5598 - val_accuracy: 0.7159
Epoch 6/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.5620 - accuracy: 0.7183 - val_loss: 0.5613 - val_accuracy: 0.7164
Epoch 7/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5538 - accuracy: 0.7208 - val_loss: 0.5454 - val_accuracy: 0.7227
Epoch 8/60
5709/5709 [==============================] - 1s 249us/sample - loss: 0.5453 - accuracy: 0.7334 - val_loss: 0.5382 - val_accuracy: 0.7327
Epoch 9/60
5709/5709 [==============================] - 1s 249us/sample - loss: 0.5509 - accuracy: 0.7255 - val_loss: 0.5407 - val_accuracy: 0.7311
Epoch 10/60
5709/5709 [==============================] - 1s 252us/sample - loss: 0.5544 - accuracy: 0.7194 - val_loss: 0.5543 - val_accuracy: 0.7337
Epoch 11/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.5406 - accuracy: 0.7274 - val_loss: 0.5313 - val_accuracy: 0.7390
Epoch 12/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.5377 - accuracy: 0.7325 - val_loss: 0.5391 - val_accuracy: 0.7321
Epoch 13/60
5709/5709 [==============================] - 1s 250us/sample - loss: 0.5265 - accuracy: 0.7465 - val_loss: 0.5458 - val_accuracy: 0.7321
Epoch 14/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5335 - accuracy: 0.7352 - val_loss: 0.5556 - val_accuracy: 0.7269
Epoch 15/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5299 - accuracy: 0.7437 - val_loss: 0.5301 - val_accuracy: 0.7405
Epoch 16/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.5247 - accuracy: 0.7462 - val_loss: 0.5677 - val_accuracy: 0.7274
Epoch 17/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5229 - accuracy: 0.7485 - val_loss: 0.5201 - val_accuracy: 0.7426
Epoch 18/60
5709/5709 [==============================] - 1s 245us/sample - loss: 0.5315 - accuracy: 0.7359 - val_loss: 0.5246 - val_accuracy: 0.7463
Epoch 19/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.5240 - accuracy: 0.7437 - val_loss: 0.5345 - val_accuracy: 0.7316
Epoch 20/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.5256 - accuracy: 0.7399 - val_loss: 0.5279 - val_accuracy: 0.7468
Epoch 21/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.5160 - accuracy: 0.7551 - val_loss: 0.5166 - val_accuracy: 0.7521
Epoch 22/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5224 - accuracy: 0.7462 - val_loss: 0.5210 - val_accuracy: 0.7521
Epoch 23/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.5236 - accuracy: 0.7444 - val_loss: 0.5187 - val_accuracy: 0.7500
Epoch 24/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5285 - accuracy: 0.7409 - val_loss: 0.5789 - val_accuracy: 0.7447
Epoch 25/60
5709/5709 [==============================] - 1s 245us/sample - loss: 0.5236 - accuracy: 0.7493 - val_loss: 0.5223 - val_accuracy: 0.7484
Epoch 26/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.5126 - accuracy: 0.7511 - val_loss: 0.5415 - val_accuracy: 0.7337
Epoch 27/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5133 - accuracy: 0.7499 - val_loss: 0.5087 - val_accuracy: 0.7558
Epoch 28/60
5709/5709 [==============================] - 1s 250us/sample - loss: 0.5091 - accuracy: 0.7502 - val_loss: 0.5145 - val_accuracy: 0.7489
Epoch 29/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.5125 - accuracy: 0.7493 - val_loss: 0.5141 - val_accuracy: 0.7505
Epoch 30/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.5104 - accuracy: 0.7514 - val_loss: 0.5357 - val_accuracy: 0.7285
Epoch 31/60
5709/5709 [==============================] - 1s 245us/sample - loss: 0.5051 - accuracy: 0.7500 - val_loss: 0.5242 - val_accuracy: 0.7384
Epoch 32/60
5709/5709 [==============================] - 1s 250us/sample - loss: 0.5008 - accuracy: 0.7588 - val_loss: 0.5083 - val_accuracy: 0.7526
Epoch 33/60
5709/5709 [==============================] - 1s 249us/sample - loss: 0.5096 - accuracy: 0.7527 - val_loss: 0.5223 - val_accuracy: 0.7463
Epoch 34/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.4977 - accuracy: 0.7581 - val_loss: 0.5096 - val_accuracy: 0.7563
Epoch 35/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.5061 - accuracy: 0.7555 - val_loss: 0.5132 - val_accuracy: 0.7574
Epoch 36/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.5076 - accuracy: 0.7544 - val_loss: 0.5090 - val_accuracy: 0.7558
Epoch 37/60
5709/5709 [==============================] - 1s 245us/sample - loss: 0.4993 - accuracy: 0.7613 - val_loss: 0.5265 - val_accuracy: 0.7468
Epoch 38/60
5709/5709 [==============================] - 1s 244us/sample - loss: 0.4971 - accuracy: 0.7630 - val_loss: 0.5162 - val_accuracy: 0.7579
Epoch 39/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.4979 - accuracy: 0.7623 - val_loss: 0.5178 - val_accuracy: 0.7426
Epoch 40/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.4977 - accuracy: 0.7586 - val_loss: 0.5048 - val_accuracy: 0.7537
Epoch 41/60
5709/5709 [==============================] - 1s 248us/sample - loss: 0.4993 - accuracy: 0.7642 - val_loss: 0.5071 - val_accuracy: 0.7516
Epoch 42/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.4915 - accuracy: 0.7628 - val_loss: 0.5114 - val_accuracy: 0.7600
Epoch 43/60
5709/5709 [==============================] - 1s 245us/sample - loss: 0.4857 - accuracy: 0.7705 - val_loss: 0.5226 - val_accuracy: 0.7558
Epoch 44/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.4879 - accuracy: 0.7676 - val_loss: 0.5074 - val_accuracy: 0.7647
Epoch 45/60
5709/5709 [==============================] - 1s 244us/sample - loss: 0.4837 - accuracy: 0.7709 - val_loss: 0.5122 - val_accuracy: 0.7574
Epoch 46/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.4837 - accuracy: 0.7681 - val_loss: 0.5023 - val_accuracy: 0.7558
Epoch 47/60
5709/5709 [==============================] - 1s 244us/sample - loss: 0.4820 - accuracy: 0.7688 - val_loss: 0.5541 - val_accuracy: 0.7279
Epoch 48/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.4844 - accuracy: 0.7665 - val_loss: 0.5059 - val_accuracy: 0.7521
Epoch 49/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.4790 - accuracy: 0.7739 - val_loss: 0.5055 - val_accuracy: 0.7621
Epoch 50/60
5709/5709 [==============================] - 1s 245us/sample - loss: 0.4820 - accuracy: 0.7716 - val_loss: 0.5064 - val_accuracy: 0.7579
Epoch 51/60
5709/5709 [==============================] - 1s 245us/sample - loss: 0.4840 - accuracy: 0.7672 - val_loss: 0.5160 - val_accuracy: 0.7521
Epoch 52/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.4745 - accuracy: 0.7733 - val_loss: 0.5221 - val_accuracy: 0.7584
Epoch 53/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.4749 - accuracy: 0.7758 - val_loss: 0.5186 - val_accuracy: 0.7474
Epoch 54/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.4699 - accuracy: 0.7775 - val_loss: 0.5061 - val_accuracy: 0.7568
Epoch 55/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.4683 - accuracy: 0.7767 - val_loss: 0.5178 - val_accuracy: 0.7432
Epoch 56/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.4728 - accuracy: 0.7756 - val_loss: 0.5245 - val_accuracy: 0.7516
Epoch 57/60
5709/5709 [==============================] - 1s 247us/sample - loss: 0.4718 - accuracy: 0.7730 - val_loss: 0.5179 - val_accuracy: 0.7495
Epoch 58/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.4681 - accuracy: 0.7781 - val_loss: 0.5209 - val_accuracy: 0.7547
Epoch 59/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.4612 - accuracy: 0.7847 - val_loss: 0.5193 - val_accuracy: 0.7521
Epoch 60/60
5709/5709 [==============================] - 1s 246us/sample - loss: 0.4562 - accuracy: 0.7886 - val_loss: 0.5204 - val_accuracy: 0.7563
def draw(history):
    """Plot every metric series stored in ``history.history`` and show it.

    Args:
        history: An object whose ``history`` attribute can be turned into a
            DataFrame, such as the value returned by ``model.fit``.
    """
    metrics_frame = pd.DataFrame(history.history)
    metrics_frame.plot()
    plt.show()
# Plot the loss/accuracy series recorded by model.fit.
draw(history)

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-FVgzpJGV-1587653490846)(output_16_0.png)]

在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值