IMDB电影数据集含有50000个电影评论,分为正反两类。train包中包含25000个正反类评论用于训练,test中包含25000个评论用于测试。
在keras提供的IMDB数据集中,word被映射为一个大于0的整数,表示该单词出现频率的排名;按照惯例,索引0不代表任何具体单词,而是用于序列填充(padding),未登录词(unknown word)默认由oov_char(索引2)表示。
下面首先对IMDB数据集进行基本的统计分析,然后分别使用全连接、卷积、LSTM三种神经网络来进行情感分析。
一、数据集的统计分析
import keras
import numpy as np
from keras.datasets import imdb
# Load the full IMDB dataset: 25,000 training and 25,000 test reviews, each
# encoded as a list of integer word indices ranked by corpus frequency.
(X_train, y_train), (X_test, y_test) = imdb.load_data()
# Display the first review as a 1-row array (exploration only; the result is
# not assigned, so this line is a no-op when run as a plain script).
np.reshape(X_train[0], (1, -1))
'''
array([[ 1, 14, 22, 16, 43, 530, 973, 1622, 1385,
65, 458, 4468, 66, 3941, 4, 173, 36, 256,
5, 25, 100, 43, 838, 112, 50, 670, 22665,
9, 35, 480, 284, 5, 150, 4, 172, 112,
167, 21631, 336, 385, 39, 4, 172, 4536, 1111,
17, 546, 38, 13, 447, 4, 192, 50, 16,
6, 147, 2025, 19, 14, 22, 4, 1920, 4613,
469, 4, 22, 71, 87, 12, 16, 43, 530,
38, 76, 15, 13, 1247, 4, 22, 17, 515,
17, 12, 16, 626, 18, 19193, 5, 62, 386,
12, 8, 316, 8, 106, 5, 4, 2223, 5244,
16, 480, 66, 3785, 33, 4, 130, 12, 16,
38, 619, 5, 25, 124, 51, 36, 135, 48,
25, 1415, 33, 6, 22, 12, 215, 28, 77,
52, 5, 14, 407, 16, 82, 10311, 8, 4,
107, 117, 5952, 15, 256, 4, 31050, 7, 3766,
5, 723, 36, 71, 43, 530, 476, 26, 400,
317, 46, 7, 4, 12118, 1029, 13, 104, 88,
4, 381, 15, 297, 98, 32, 2071, 56, 26,
141, 6, 194, 7486, 18, 4, 226, 22, 21,
134, 476, 26, 480, 5, 144, 30, 5535, 18,
51, 36, 28, 224, 92, 25, 104, 4, 226,
65, 16, 38, 1334, 88, 12, 16, 283, 5,
16, 4472, 113, 103, 32, 15, 16, 5345, 19,
178, 32]])
'''
# Sanity-check the split sizes: 25,000 reviews in each of train/test.
print(X_train.shape)
print(y_train.shape)
'''
(25000,)
(25000,)
'''
# Word count of every training review.
# NOTE(review): the name is misleading — avg_len is the LIST of lengths,
# not an average; the mean (~239 words) is computed on the next line.
avg_len = list(map(len, X_train))
np.mean(avg_len)
'''
238.71364
'''
import matplotlib.pyplot as plt
# Histogram of review lengths in 50-word buckets, used to choose a sensible
# fixed truncation/padding length for the models below.
plt.hist(avg_len, bins=range(min(avg_len), max(avg_len) + 50, 50))
plt.show()
二、全连接神经网络
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
import keras
import numpy as np
from keras.datasets import imdb
# Reload the raw variable-length sequences for this model (the statistics
# section above left X_train/X_test as lists of word-index lists).
(X_train, y_train), (X_test, y_test) = imdb.load_data()
# Longest review across both splits — informs the truncation length below.
m=max(list(map(len, X_train))+ list(map(len, X_test)))
print(m)
'''
2494
'''
# Truncate/pad every review to a fixed 400 words so the Embedding layer sees
# equal-length sequences (400 comfortably covers the ~239-word mean length).
maxword = 400
X_train = sequence.pad_sequences(X_train, maxlen = maxword)
X_test = sequence.pad_sequences(X_test, maxlen = maxword)
# Vocabulary size = largest word index + 1.  After pad_sequences the data is
# a 2-D numpy int array, so one vectorized np.max gives the same value as the
# original O(n) Python loop over rows, without the per-row overhead.
# NOTE(review): only the training split is scanned; this assumes no test-set
# word index exceeds the training maximum — verify, or take the max over
# both splits, if the embedding ever raises an out-of-range lookup on test.
vocab_size = int(np.max(X_train)) + 1
# Fully-connected baseline: embed each of the 400 positions into 64 dims,
# flatten to a 25,600-long vector, then a stack of ReLU layers narrowing to
# a single sigmoid unit for binary (positive/negative) sentiment.
model = Sequential()
model.add(Embedding(vocab_size, 64, input_length = maxword))
model.add(Flatten())
model.add(Dense(500, activation = 'relu'))
model.add(Dense(500, activation = 'relu'))
model.add(Dense(200, activation = 'relu'))
model.add(Dense(50, activation = 'relu'))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
print(model.summary())
'''
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_3 (Embedding) (None, 400, 64) 5669568
_________________________________________________________________
flatten_3 (Flatten) (None, 25600) 0
_________________________________________________________________
dense_11 (Dense) (None, 500) 12800500
_________________________________________________________________
dense_12 (Dense) (None, 500) 250500
_________________________________________________________________
dense_13 (Dense) (None, 200) 100200
_________________________________________________________________
dense_14 (Dense) (None, 50) 10050
_________________________________________________________________
dense_15 (Dense) (None, 1) 51
=================================================================
Total params: 18,830,869
Trainable params: 18,830,869
Non-trainable params: 0
_________________________________________________________________
None
'''
# Train for 20 epochs with batches of 100.
# NOTE(review): the test set is used as validation_data, so it leaks into
# any decision based on these curves — a held-out split would be cleaner.
model.fit(X_train, y_train, validation_data = (X_test, y_test), epochs = 20,batch_size = 100, verbose = 1)
score = model.evaluate(X_test, y_test)
'''
Train on 25000 samples, validate on 25000 samples
Epoch 1/20
25000/25000 [==============================] - 10s - loss: 0.4514 - acc: 0.7513 - val_loss: 0.2960 - val_acc: 0.8750
Epoch 2/20
25000/25000 [==============================] - 10s - loss: 0.0963 - acc: 0.9676 - val_loss: 0.3743 - val_acc: 0.8601
Epoch 3/20
25000/25000 [==============================] - 10s - loss: 0.0049 - acc: 0.9985 - val_loss: 0.6467 - val_acc: 0.8591
Epoch 4/20
25000/25000 [==============================] - 10s - loss: 5.1717e-04 - acc: 0.9999 - val_loss: 1.0052 - val_acc: 0.8559
Epoch 5/20
25000/25000 [==============================] - 10s - loss: 7.6195e-06 - acc: 1.0000 - val_loss: 1.0526 - val_acc: 0.8570
Epoch 6/20
25000/25000 [==============================] - 10s - loss: 3.2650e-06 - acc: 1.0000 - val_loss: 1.0866 - val_acc: 0.8564
Epoch 7/20
25000/25000 [==============================] - 10s - loss: 1.9993e-06 - acc: 1.0000 - val_loss: 1.1213 - val_acc: 0.8561
Epoch 8/20
25000/25000 [==============================] - 10s - loss: 1.1644e-06 - acc: 1.0000 - val_loss: 1.1707 - val_acc: 0.8570
Epoch 9/20
25000/25000 [==============================] - 10s - loss: 6.6276e-07 - acc: 1.0000 - val_loss: 1.2164 - val_acc: 0.8574
Epoch 10/20
25000/25000 [==============================] - 10s - loss: 3.9984e-07 - acc: 1.0000 - val_loss: 1.2884 - val_acc: 0.8564
Epoch 11/20
25000/25000 [==============================] - 10s - loss: 1.5278e-07 - acc: 1.0000 - val_loss: 1.3796 - val_acc: 0.8566
Epoch 12/20
25000/25000 [==============================] - 10s - loss: 1.0723e-07 - acc: 1.0000 - val_loss: 1.4194 - val_acc: 0.8568
Epoch 13/20
25000/25000 [==============================] - 10s - loss: 9.2242e-08 - acc: 1.0000 - val_loss: 1.4465 - val_acc: 0.8568
Epoch 14/20
25000/25000 [==============================] - 10s - loss: 8.4448e-08 - acc: 1.0000 - val_loss: 1.4663 - val_acc: 0.8566
Epoch 15/20
25000/25000 [==============================] - 10s - loss: 7.9802e-08 - acc: 1.0000 - val_loss: 1.4831 - val_acc: 0.8566
Epoch 16/20
25000/25000 [==============================] - 10s - loss: 7.6812e-08 - acc: 1.0000 - val_loss: 1.4976 - val_acc: 0.8566
Epoch 17/20
25000/25000 [==============================] - 10s - loss: 7.4742e-08 - acc: 1.0000 - val_loss: 1.5098 - val_acc: 0.8566
Epoch 18/20
25000/25000 [==============================] - 10s - loss: 7.3247e-08 - acc: 1.0000 - val_loss: 1.5203 - val_acc: 0.8566
Epoch 19/20
25000/25000 [==============================] - 10s - loss: 7.2189e-08 - acc: 1.0000 - val_loss: 1.5304 - val_acc: 0.8565
Epoch 20/20
25000/25000 [==============================] - 10s - loss: 7.1346e-08 - acc: 1.0000 - val_loss: 1.5390 - val_acc: 0.8564
24800/25000 [============================>.] - ETA: 0s
'''
# [loss, accuracy] on the test set.  The log above shows training accuracy
# saturating at 1.0 while val_loss climbs from epoch 2 onward — the dense
# network overfits heavily; early stopping around epoch 1-2 would suffice.
print(score)
'''
[1.5390439252797141, 0.85643999999999998]
'''
三、卷积神经网络
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.layers import Conv1D, MaxPooling1D
# 1-D CNN for sentiment classification: two conv/pool/dropout stages over
# the embedded word sequence, then dense layers narrowing to one sigmoid
# output.  Same architecture as before, built via the Sequential([...])
# constructor instead of repeated .add() calls.
model = Sequential([
    Embedding(vocab_size, 64, input_length=maxword),
    Conv1D(filters=64, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.25),
    Conv1D(filters=128, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.25),
    Flatten(),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1, activation='sigmoid'),
])
# Binary cross-entropy with RMSprop; track accuracy during training.
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
# Architecture/parameter summary: ~6.5M parameters, dominated by the
# embedding matrix (5,669,568 of them).
print(model.summary())
'''
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_5 (Embedding) (None, 400, 64) 5669568
_________________________________________________________________
conv1d_9 (Conv1D) (None, 400, 64) 12352
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 200, 64) 0
_________________________________________________________________
dropout_9 (Dropout) (None, 200, 64) 0
_________________________________________________________________
conv1d_10 (Conv1D) (None, 200, 128) 24704
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 100, 128) 0
_________________________________________________________________
dropout_10 (Dropout) (None, 100, 128) 0
_________________________________________________________________
flatten_5 (Flatten) (None, 12800) 0
_________________________________________________________________
dense_13 (Dense) (None, 64) 819264
_________________________________________________________________
dense_14 (Dense) (None, 32) 2080
_________________________________________________________________
dense_15 (Dense) (None, 1) 33
=================================================================
Total params: 6,528,001
Trainable params: 6,528,001
Non-trainable params: 0
_________________________________________________________________
None
'''
# Train the CNN for 20 epochs; as before, the test set doubles as the
# validation set (NOTE(review): this leaks test data into model selection).
model.fit(X_train, y_train, validation_data = (X_test, y_test), epochs = 20, batch_size = 100)
scores = model.evaluate(X_test, y_test, verbose = 1)
print(scores)
'''
Train on 25000 samples, validate on 25000 samples
Epoch 1/20
25000/25000 [==============================] - 12s - loss: 0.4860 - acc: 0.7189 - val_loss: 0.2693 - val_acc: 0.8892
Epoch 2/20
25000/25000 [==============================] - 10s - loss: 0.2236 - acc: 0.9138 - val_loss: 0.2645 - val_acc: 0.8916
Epoch 3/20
25000/25000 [==============================] - 10s - loss: 0.1587 - acc: 0.9424 - val_loss: 0.3100 - val_acc: 0.8764
Epoch 4/20
25000/25000 [==============================] - 10s - loss: 0.1142 - acc: 0.9597 - val_loss: 0.3152 - val_acc: 0.8799
Epoch 5/20
25000/25000 [==============================] - 10s - loss: 0.0890 - acc: 0.9699 - val_loss: 0.3662 - val_acc: 0.8758
Epoch 6/20
25000/25000 [==============================] - 10s - loss: 0.0641 - acc: 0.9790 - val_loss: 0.4649 - val_acc: 0.8646
Epoch 7/20
25000/25000 [==============================] - 10s - loss: 0.0493 - acc: 0.9840 - val_loss: 0.4959 - val_acc: 0.8565
Epoch 8/20
25000/25000 [==============================] - 10s - loss: 0.0383 - acc: 0.9884 - val_loss: 0.5187 - val_acc: 0.8586
Epoch 9/20
25000/25000 [==============================] - 10s - loss: 0.0267 - acc: 0.9916 - val_loss: 0.7127 - val_acc: 0.8458
Epoch 10/20
25000/25000 [==============================] - 10s - loss: 0.0198 - acc: 0.9939 - val_loss: 1.1308 - val_acc: 0.830004 - ET
Epoch 11/20
25000/25000 [==============================] - 10s - loss: 0.0123 - acc: 0.9963 - val_loss: 0.8761 - val_acc: 0.8527
Epoch 12/20
25000/25000 [==============================] - 10s - loss: 0.0090 - acc: 0.9977 - val_loss: 1.0457 - val_acc: 0.8488
Epoch 13/20
25000/25000 [==============================] - 10s - loss: 0.0072 - acc: 0.9982 - val_loss: 1.1228 - val_acc: 0.8425
Epoch 14/20
25000/25000 [==============================] - 10s - loss: 0.0051 - acc: 0.9993 - val_loss: 1.8203 - val_acc: 0.8360
Epoch 15/20
25000/25000 [==============================] - 10s - loss: 0.0036 - acc: 0.9992 - val_loss: 1.7642 - val_acc: 0.8458acc:
Epoch 16/20
25000/25000 [==============================] - 10s - loss: 0.0043 - acc: 0.9995 - val_loss: 1.7908 - val_acc: 0.8468
Epoch 17/20
25000/25000 [==============================] - 10s - loss: 0.0021 - acc: 0.9998 - val_loss: 1.9277 - val_acc: 0.8447
Epoch 18/20
25000/25000 [==============================] - 10s - loss: 0.0122 - acc: 0.9987 - val_loss: 1.7697 - val_acc: 0.8481
Epoch 19/20
25000/25000 [==============================] - 10s - loss: 0.0035 - acc: 0.9996 - val_loss: 1.8187 - val_acc: 0.8444
Epoch 20/20
25000/25000 [==============================] - 10s - loss: 0.0056 - acc: 0.9991 - val_loss: 1.7834 - val_acc: 0.8423
24960/25000 [============================>.] - ETA: 0s
'''
# [loss, accuracy] on the test set — again overfitting after a few epochs
# (best val_acc 0.8916 at epoch 2, final 0.8423).
print(scores)
'''
[1.7833950957139582, 0.84228000000000003]
'''
四、LSTM神经网络
from keras.layers import LSTM
# Stacked LSTM: the first two layers use return_sequences=True so the next
# LSTM receives the full 400-step sequence; the last LSTM returns only its
# final state, which feeds the sigmoid classifier.  Dropout between layers
# regularizes the recurrent stack.
model = Sequential()
model.add(Embedding(vocab_size, 64, input_length = maxword))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(64, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(32))
model.add(Dropout(0.2))
model.add(Dense(1, activation = 'sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer = 'rmsprop', metrics = ['accuracy'])
print(model.summary())
'''
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding_12 (Embedding) (None, 400, 64) 5669568
_________________________________________________________________
lstm_12 (LSTM) (None, 400, 128) 98816
_________________________________________________________________
dropout_16 (Dropout) (None, 400, 128) 0
_________________________________________________________________
lstm_13 (LSTM) (None, 400, 64) 49408
_________________________________________________________________
dropout_17 (Dropout) (None, 400, 64) 0
_________________________________________________________________
lstm_14 (LSTM) (None, 32) 12416
_________________________________________________________________
dropout_18 (Dropout) (None, 32) 0
_________________________________________________________________
dense_16 (Dense) (None, 1) 33
=================================================================
Total params: 5,830,241
Trainable params: 5,830,241
Non-trainable params: 0
_________________________________________________________________
None
'''
# Only 5 epochs here — each LSTM epoch takes ~540s vs ~10s for the CNN.
# NOTE(review): the test set is again used as validation_data.
model.fit(X_train, y_train, validation_data = (X_test, y_test), epochs = 5, batch_size = 100)
scores = model.evaluate(X_test, y_test)
'''
Train on 25000 samples, validate on 25000 samples
Epoch 1/5
25000/25000 [==============================] - 546s - loss: 0.5645 - acc: 0.6731 - val_loss: 0.3635 - val_acc: 0.8475
Epoch 2/5
25000/25000 [==============================] - 538s - loss: 0.3497 - acc: 0.8590 - val_loss: 0.3994 - val_acc: 0.8179
Epoch 3/5
25000/25000 [==============================] - 537s - loss: 0.2460 - acc: 0.9079 - val_loss: 0.3249 - val_acc: 0.8659
Epoch 4/5
25000/25000 [==============================] - 534s - loss: 0.1802 - acc: 0.9357 - val_loss: 0.3177 - val_acc: 0.8644
Epoch 5/5
25000/25000 [==============================] - 534s - loss: 0.1425 - acc: 0.9505 - val_loss: 0.3417 - val_acc: 0.8508
25000/25000 [==============================] - 461s
'''
# [loss, accuracy] on the test set: ~0.851, comparable to the other models
# at far higher training cost.
print(scores)
'''
[0.34172198061943054, 0.85075999999999996]
'''