tensorflow框架学习-第二课 Text classification with movie reviews

课程来源:Text classification with movie reviews

Text classification with movie reviews

构建电影评论的二分类模型(1:好评,0:差评)

from __future__ import absolute_import, division, print_function

import tensorflow as tf
from tensorflow import keras
from data.tools import load_Localdata

import numpy as np

print(tf.__version__)
1.13.1

1 下载和查看数据集

# Load the training and test splits from the local npz archive.
# NOTE(review): num_words=10000 presumably keeps only the 10000 most frequent
# words -- confirm against load_Localdata.
fname = 'imdb.npz'
path = '/home/brian/Documents/tensorflow-gpu/tensorflow-learning/data/imdb/'
(train_data, train_labels), (test_data, test_labels) = load_Localdata(
    fname=fname, path=path, num_words=10000)
# Inspect the structure of the dataset: container type, then each split's shape.
print(type(train_data))
for label, split in (("train_data", train_data),
                     ("train_labels", train_labels),
                     ("test_data", test_data),
                     ("test_labels", test_labels)):
    print(label + ".shape: ", split.shape)
<class 'numpy.ndarray'>
train_data.shape:  (25000,)
train_labels.shape:  (25000,)
test_data.shape:  (25000,)
test_labels.shape:  (25000,)

训练数据集和测试数据集都有25000个

print("Training entries: {}, labels: {}".format(len(train_data), len(train_labels)))
Training entries: 25000, labels: 25000
print(type(train_data[0]))
print(train_data[0])
<class 'list'>
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]

每条数据是一个列表,列表中存放的是评论里每个单词对应的整数索引

print(train_labels)
[1 0 0 ... 0 1 0]

二分类标签(0,1),其中1代表好评,0代表差评

len(train_data[0]), len(train_data[1])
(218, 189)

各条评论对应的列表长度不一致(例如前两条分别为218和189),即评论长短不一,符合常识

将以上的整数索引转换回对应的单词(真实文本)

from tensorflow.python.keras.utils.data_utils import get_file
import json
def get_word_index(path=None):
    """Load the word index stored as JSON at *path*.

    Args:
        path: Filesystem path of the JSON file. The ``None`` default is only
            a placeholder; a real path must be supplied.

    Returns:
        The object decoded from the JSON file.

    Raises:
        TypeError: If *path* is ``None``.
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    if path is None:
        # open(None) would also raise TypeError, but with an opaque message.
        raise TypeError("get_word_index() requires the path of the JSON file")
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale encoding.
    with open(path, encoding="utf-8") as f:
        return json.load(f)
# The JSON file holds a dict whose keys are words and whose values are the
# integer ids of those words.
path = '/home/brian/Documents/tensorflow-gpu/tensorflow-learning/data/imdb/imdb_word_index.json'
word_index = get_word_index(path=path)
# Shift every id up by 3 so that the lowest ids can be reserved for the
# special tokens registered below.
word_index = {word: idx + 3 for word, idx in word_index.items()}
word_index.update({
    "<PAD>": 0,     # padding token
    "<START>": 1,   # start-of-sequence token
    "<UNK>": 2,     # unknown-word token
    "<UNUSED>": 3,  # unused token
})

# Invert the mapping so that an integer id looks up its word.
reverse_word_index = {idx: word for word, idx in word_index.items()}
# Convert a sequence of integer ids back into readable text.
def decode_review(text):
    """Return the words for the ids in *text*, joined by single spaces.

    Ids that are missing from ``reverse_word_index`` are rendered as ``'?'``.
    """
    words = (reverse_word_index.get(token_id, '?') for token_id in text)
    return ' '.join(words)
# Bring every review to a fixed length of 256: shorter reviews are padded at
# the end ('post') with word_index["<PAD>"], which is 0 in this example.
# NOTE(review): reviews longer than 256 are presumably truncated by
# pad_sequences' default settings -- confirm against the Keras docs.
pad_options = dict(maxlen=256, value=word_index["<PAD>"], padding='post')
train_data = keras.preprocessing.sequence.pad_sequences(train_data, **pad_options)
test_data = keras.preprocessing.sequence.pad_sequences(test_data, **pad_options)
# Check the container types and the per-review length after padding.
print(type(train_data))
print(type(train_data[0]))
len(train_data[0]), len(test_data[0])
<class 'numpy.ndarray'>
<class 'numpy.ndarray'>
(256, 256)

2 构建模型

# Size of the vocabulary, i.e. the number of distinct token ids fed to the
# embedding layer.
vocab_size = 10000

# Embedding -> average over the sequence -> 16-unit ReLU layer -> a single
# sigmoid unit for the binary (positive/negative) prediction.
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, 16),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(16, activation=tf.nn.relu),
    keras.layers.Dense(1, activation=tf.nn.sigmoid),
])

model.summary()
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding (Embedding)        (None, None, 16)          160000    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
=================================================================
Total params: 160,289
Trainable params: 160,289
Non-trainable params: 0

3 训练模型

# Configure the optimizer, the loss function and the reported metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# Hold out the first 1000 training examples as a validation (dev) set and
# train on the rest.
X_dev, X_train = train_data[:1000], train_data[1000:]
Y_dev, Y_train = train_labels[:1000], train_labels[1000:]
# Fit the model on the training portion, validating after every epoch.
history = model.fit(
    X_train,
    Y_train,
    batch_size=512,
    epochs=40,
    validation_data=(X_dev, Y_dev),
    verbose=1,
)
Epoch 40/40
24000/24000 [==============================] - 0s 5us/sample - loss: 0.0855 - acc: 0.9766 - val_loss: 0.3204 - val_acc: 0.8870

4 评估模型

# Evaluate on the test set. The model was compiled with metrics=['acc'], so
# evaluate() yields the loss followed by the accuracy.
loss, metrics = model.evaluate(test_data, test_labels)
print("test loss: ", loss)
print("metrics(acc): ", metrics)
25000/25000 [==============================] - 0s 20us/sample - loss: 0.3762 - acc: 0.8689
test loss:  0.3761545661497116
metrics(acc):  0.86888

5 可视化优化过程

# history.history is a dict keyed by metric name; each value holds one entry
# per training epoch.
history_dict = history.history
print(len(history_dict['loss']))  # number of recorded epochs
print(history_dict.keys())
40
dict_keys(['loss', 'acc', 'val_loss', 'val_acc'])

history.history是一个字典,存放着40个epoch训练过程中每个epoch的损失和准确率数值

import matplotlib.pyplot as plt

acc = history_dict['acc']
val_acc = history_dict['val_acc']
loss = history_dict['loss']
val_loss = history_dict['val_loss']

epochs = range(1, len(acc)+1)


def _plot_train_vs_val(train_values, val_values, train_label, val_label,
                       title, xlabel, ylabel):
    """Draw training values as blue dots and validation values as a blue line."""
    # 'bo' is "blue dot", 'b' is "solid blue line".
    plt.plot(epochs, train_values, 'bo', label=train_label)
    plt.plot(epochs, val_values, 'b', label=val_label)
    plt.title(title)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.legend()  # add a legend for the two curves
    plt.show()


# Loss curves.
_plot_train_vs_val(loss, val_loss, 'Training loss', 'Validation loss',
                   'Training and validation loss', 'Epochs', 'Loss')

plt.clf()  # clear the figure before drawing the accuracy curves
# Accuracy curves.
_plot_train_vs_val(acc, val_acc, 'Training acc', 'Validation acc',
                   'Training and validation accuracy', 'epochs', 'accuracy')

总结:从以上的曲线可以看出,训练损失持续下降而验证损失不再下降,说明模型出现了过拟合,需要引入防止过拟合的措施(例如正则化、Dropout或提前停止训练)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值