数据集探索
IMDB数据集
IMDB数据集是一个大规模影评数据集, 其中25K条有标注数据用于训练, 25K条有标注数据用于测试. 使用TensorFlow提供的教程可以快速使用IMDB数据集进行文本分类任务的学习.
基础文本分类任务
数据集准备
- 使用TensorFlow相关数据集工具下载IMDB数据集. 如果已经下载了IMDB对应的npy文件, 可以在
imdb.load_data()
通过第一个参数指定.
# Load the IMDB review dataset, keeping only the 10,000 most frequent words.
import numpy as np
import tensorflow as tf
from tensorflow import keras

print(tf.__version__)

# download and load IMDB dataset (cached locally after the first call)
imdb = keras.datasets.imdb
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

# view dataset size and samples
# fix: "Traning" -> "Training" in the printed message
print('Training entries: {}, labels: {}'.format(
    len(train_data), len(train_labels)
))
print(train_data[0])
# reviews still have different lengths at this point (padding comes later)
print(len(train_data[0]), len(train_data[1]))
- 下载并调整词典.
# Fetch the dataset's word->index mapping and shift every index up by 3
# so the first four slots can hold the special tokens below.
word_index = imdb.get_word_index()
word_index = {word: idx + 3 for word, idx in word_index.items()}
word_index['<PAD>'] = 0
word_index['<START>'] = 1
word_index['<UNK>'] = 2
word_index['<UNUSED>'] = 3

# inverse mapping (index -> word), used to decode samples back into text
reverse_word_index = {idx: word for word, idx in word_index.items()}
# Turn an encoded review (a list of word indices) back into readable text.
def decode_review(text):
    """Map each index in *text* to its word; unknown indices become '?'."""
    tokens = (reverse_word_index.get(idx, '?') for idx in text)
    return ' '.join(tokens)
# decode one training sample back into its original text (sanity check)
decode_review(train_data[20])
- Padding得到统一长度的数据.
# Pad every review to a fixed length of 256 tokens so they can be batched.
def _pad_to_fixed_length(sequences):
    # trailing ('post') padding with the <PAD> token index
    return keras.preprocessing.sequence.pad_sequences(
        sequences,
        value=word_index["<PAD>"],
        padding='post',
        maxlen=256
    )

train_data = _pad_to_fixed_length(train_data)
test_data = _pad_to_fixed_length(test_data)

# both samples now have identical length
print(len(train_data[0]), len(train_data[1]))
print(train_data[0])
模型准备
- 定义简单分类模型: 将输入文档所有词的词向量通过
Embedding
层取出, 求得平均后得到文档向量, 使用2层全连接层(Dense
)进行分类.
# A small averaging classifier: embed each word, average the embeddings into
# one document vector, then classify with two dense layers.
vocab_size = 10000
model = keras.Sequential([
    keras.layers.Embedding(vocab_size, 32),
    keras.layers.Dropout(0.5),
    keras.layers.GlobalAveragePooling1D(),
    keras.layers.Dense(16, activation=tf.nn.relu),
    keras.layers.Dropout(0.5),
    # single sigmoid unit: outputs P(positive) for binary classification
    keras.layers.Dense(1, activation=tf.nn.sigmoid),
])
model.summary()
- 为模型指定优化器, 损失函数和日志信息类型
# Compile with Adam + binary cross-entropy (matches the single sigmoid output).
# Fix: use the Keras-native 'adam' identifier instead of tf.train.AdamOptimizer,
# which was removed in TensorFlow 2.x; both default to learning rate 0.001.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
# calculate F1 score after each epoch
import numpy as np
from keras.callbacks import Callback
from keras.engine.training import Model
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score
class Metrics(Callback):
    """Keras callback that reports weighted F1 / precision / recall on the
    validation set after every epoch.

    Fixes over the original:
    - no mutable default arguments (``logs={}``);
    - validation data may be passed explicitly, because recent Keras versions
      no longer populate ``self.validation_data`` on callbacks.
    """

    def __init__(self, validation_data=None):
        super().__init__()
        # optional (x_val, y_val) tuple; falls back to self.validation_data
        # when the running Keras version still provides it
        self._validation_data = validation_data

    def on_train_begin(self, logs=None):
        # reset the metric history at the start of each training run
        self.val_f1s = []
        self.val_recalls = []
        self.val_precisions = []

    def on_epoch_end(self, epoch, logs=None):
        val_data = self._validation_data
        if val_data is None:
            # legacy behaviour: rely on Keras populating validation_data
            val_data = self.validation_data
        # round sigmoid outputs to hard 0/1 predictions
        val_predict = np.asarray(self.model.predict(val_data[0])).round()
        val_targ = val_data[1]
        _val_f1 = f1_score(val_targ, val_predict, average='weighted')
        _val_recall = recall_score(val_targ, val_predict, average='weighted')
        _val_precision = precision_score(val_targ, val_predict, average='weighted')
        self.val_f1s.append(_val_f1)
        self.val_recalls.append(_val_recall)
        self.val_precisions.append(_val_precision)
        print(' — val_f1: %f — val_precision: %f — val_recall %f'
              % (_val_f1, _val_precision, _val_recall))


metrics = Metrics()
- 训练模型. 使用
EarlyStopping
避免模型过拟合.
from keras.callbacks import EarlyStopping

# Fix: partial_x_train / partial_y_train / x_val / y_val were never defined
# anywhere in this script (NameError at fit time). Carve a 10,000-sample
# validation set out of the training data, as in the official TF IMDB tutorial.
x_val = train_data[:10000]
partial_x_train = train_data[10000:]
y_val = train_labels[:10000]
partial_y_train = train_labels[10000:]

# Stop training once validation accuracy has not improved for 8 epochs.
# NOTE(review): the history key is 'val_acc' in older Keras and
# 'val_accuracy' in TF2 — confirm against the running version.
earlystopping = keras.callbacks.EarlyStopping(
    monitor='val_acc', patience=8, verbose=0, mode='max')
history = model.fit(partial_x_train,
                    partial_y_train,
                    epochs=120,
                    batch_size=1024,
                    validation_data=(x_val, y_val),
                    callbacks=[metrics, earlystopping],
                    verbose=1)
结果可视化
- 将训练的历史信息可视化, 主要可视化随着训练过程, 模型在训练集和验证集上的
loss
和accuracy
.
history_dict = history.history
history_dict.keys()
%matplotlib inline
import matplotlib.pyplot as plt
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
figure = plt.figure(figsize=(14, 5))
ax = figure.add_subplot(1, 2, 1)
# "bo" is for "blue dot"
ax.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
ax.plot(epochs, val_loss, 'r', label='Validation loss')
ax.set_title('Training and validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
ax = figure.add_subplot(1, 2, 2)
# "bo" is for "blue dot"
ax.plot(epochs, acc, 'bo', label='Training Accuracy')
# b is for "solid blue line"
ax.plot(epochs, val_acc, 'r', label='Validation Accuracy')
ax.set_title('Training and validation accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()
THUCNews数据子集
THUCNews数据集是根据新浪新闻RSS订阅频道2005~2011年间的历史数据筛选过滤生成,包含74万篇新闻文档(2.19 GB),均为UTF-8纯文本格式.我们在原始新浪新闻分类体系的基础上,重新整合划分出14个候选分类类别:财经、彩票、房产、股票、家居、教育、科技、社会、时尚、时政、体育、星座、游戏、娱乐.使用THUCTC工具包在此数据集上进行评测,准确率可以达到88.6%.这里使用一个较小的子集进行实验.
THUCNews数据集处理
原始数据加载
- 加载原始文本数据和标签. 输入数据一行表示一个文本和对应的类别标签, 中间用
\t
符号分隔. 类别标签分为10类:体育, 财经, 房产, 家居, 教育, 科技, 时尚, 时政, 游戏, 娱乐
.
import os
import sys
from collections import Counter
import numpy as np
import tensorflow.keras as keras
import tensorflow as tf
# root directory holding the THUCNews subset files
data_root = './cnews/'


def open_file(filename, model='r'):
    """Open *filename* as UTF-8 text and return the file object.

    NOTE(review): ``model`` is almost certainly a misspelling of ``mode``,
    but callers pass it by keyword (``model='w'``), so the name is kept
    for backward compatibility.
    """
    return open(filename, mode=model, encoding='utf-8')
def read_file(filename):
    """Read a "<label>\\t<content>" file.

    Returns (contents, labels) where each content is the list of its
    characters (character-level tokens) and labels are the raw label strings.
    Malformed lines (no tab, or more than one tab) are skipped.
    """
    contents, labels = [], []
    with open_file(filename) as f:
        for line in f:
            # fix: catch only the unpack failure instead of a bare `except`
            # that silently swallowed every error
            try:
                label, content = line.strip().split('\t')
            except ValueError:
                continue  # skip malformed lines, as the original intended
            if content:
                contents.append(list(content))
                labels.append(label)
    return contents, labels
# Load the raw training split and inspect a few samples.
contents, labels = read_file(os.path.join(data_root, 'cnews.train.txt'))
print(len(contents), len(labels))
print(contents[0])
# documents still have different lengths at this point
print(len(contents[0]), len(contents[1]))
- 生成词表. 使用训练数据集生成词表.
def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """Build a character vocabulary from the training file and write it,
    one token per line, to *vocab_dir*.

    Index 0 is reserved for '<PAD>', so only the vocab_size - 1 most common
    characters are kept.
    """
    data_train, _ = read_file(train_dir)
    all_data = []
    for content in data_train:
        all_data.extend(content)
    counter = Counter(all_data)
    count_pairs = counter.most_common(vocab_size - 1)
    words, _ = list(zip(*count_pairs))
    # reserve index 0 for the padding token
    words = ['<PAD>'] + list(words)
    # fix: close the output file deterministically — the original called
    # open_file(...).write(...) and leaked the handle
    with open_file(vocab_dir, model='w') as f:
        f.write('\n'.join(words) + '\n')
def read_vocab(vocab_dir):
    """Load the vocabulary file and return (words, word -> index mapping)."""
    with open_file(vocab_dir) as f:
        words = [line.strip() for line in f]
    word_to_id = {word: idx for idx, word in enumerate(words)}
    return words, word_to_id
# Build the vocabulary from the training split, then sanity-check it.
build_vocab(os.path.join(data_root, 'cnews.train.txt'),
            os.path.join(data_root, 'train_vocab.txt'),
            5000)
words,word_to_id = read_vocab(os.path.join(data_root, 'train_vocab.txt'))
print(len(words), len(word_to_id))
# show the first ten vocabulary entries with their ids
for w in words[:10]:
    print(w, word_to_id[w])
- 进行类别映射. 将10个类别映射到0~9.
def read_category():
    """Return the fixed list of 10 news categories and a name -> index map."""
    categories = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']
    cat_to_id = {name: idx for idx, name in enumerate(categories)}
    return categories, cat_to_id
利用原始数据和词表生成训练数据
- 生成单个数据集的训练数据. 主要将文本中每个字符映射为词表中的下标, 类别从类别名映射为对应的下标.
def process_file(filename, word_to_id, cat_to_id, max_len=600):
    """Vectorize a labelled text file.

    Each character is mapped to its vocabulary id (out-of-vocabulary
    characters are dropped), sequences are padded to *max_len*, and labels
    are one-hot encoded.
    """
    contents, labels = read_file(filename)
    data_id = [[word_to_id[ch] for ch in text if ch in word_to_id]
               for text in contents]
    label_id = [cat_to_id[lab] for lab in labels]
    x_pad = keras.preprocessing.sequence.pad_sequences(data_id, max_len)
    y_pad = keras.utils.to_categorical(label_id, num_classes=len(cat_to_id))
    return x_pad, y_pad
# Rebuild the vocab/category mappings and vectorize the training split.
words, word_to_id = read_vocab(os.path.join(data_root, 'train_vocab.txt'))
categories, cat_to_id = read_category()
max_len = 600
x, y = process_file(os.path.join(data_root, 'cnews.train.txt'),
                    word_to_id, cat_to_id, max_len)
print(len(x), len(y))
# every document is now padded/truncated to exactly max_len ids
print(len(x[0]), len(x[1]))
print(x[0])
- 为重复使用数据集, 将数据集预处理好后存储到硬盘.
# Regenerate the vocabulary, then vectorize every split once and cache the
# resulting arrays on disk so later runs can skip preprocessing.
build_vocab(os.path.join(data_root, 'cnews.train.txt'),
            os.path.join(data_root, 'train_vocab.txt'),
            5000)

words, word_to_id = read_vocab(os.path.join(data_root, 'train_vocab.txt'))
print(len(words), len(word_to_id))
for w in words[:10]:
    print(w, word_to_id[w])

categories, cat_to_id = read_category()
max_len = 600

for split_name in ['train', 'val', 'test']:
    split_path = os.path.join(data_root, 'cnews.{}.txt'.format(split_name))
    x, y = process_file(split_path, word_to_id, cat_to_id, max_len)
    np.savez(os.path.join(data_root, '{}.npz'.format(split_name)), x=x, y=y)
训练模型并可视化
- 使用与IMDB相同的模型进行训练. 词表修改为5000. 这一简单模型在验证数据集上Accuracy也可以达到0.92.
# build model: same averaging architecture as the IMDB model, but with a
# 5000-token vocabulary and a 10-way categorical output
vocab_size = 5000
model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, 64))
model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(32, activation=tf.nn.relu))
model.add(keras.layers.Dropout(0.5))
# Fix: use softmax rather than sigmoid on the output layer — this is a
# 10-class single-label problem trained with categorical cross-entropy,
# which expects the outputs to form a probability distribution.
model.add(keras.layers.Dense(len(cat_to_id), activation=tf.nn.softmax))
model.summary()

# Fix: 'adam' replaces tf.train.AdamOptimizer (removed in TensorFlow 2.x).
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
# load datasets
def load_npz(filename):
    """Load the cached (x, y) arrays from an .npz archive."""
    archive = np.load(filename)
    return archive['x'], archive['y']
def load_dataset(data, max_len=600):
    """Vectorize split *data* ('train' / 'val' / 'test') directly from the
    raw text file, bypassing the cached .npz archives.

    Fix: removed the leftover debug loop that re-printed the first ten
    vocabulary entries on every call (copy-pasted from the preprocessing
    script above).
    """
    words, word_to_id = read_vocab(os.path.join(data_root, 'train_vocab.txt'))
    categories, cat_to_id = read_category()
    x, y = process_file(os.path.join(data_root, 'cnews.{}.txt'.format(data)),
                        word_to_id, cat_to_id, max_len)
    return x, y
# Load the cached splits produced by the preprocessing step above.
x_train, y_train = load_npz(os.path.join(data_root, 'train.npz'))
x_val, y_val = load_npz(os.path.join(data_root, 'val.npz'))
x_test, y_test = load_npz(os.path.join(data_root, 'test.npz'))
print(len(x_train), len(y_train))
print(len(x_val), len(y_val))
print(len(x_test), len(y_test))
# train model
from keras.callbacks import EarlyStopping

# Halt training once validation accuracy has not improved for 8 epochs.
early_stop = keras.callbacks.EarlyStopping(
    monitor='val_acc', patience=8, verbose=0, mode='max')

history = model.fit(
    x_train,
    y_train,
    epochs=120,
    batch_size=1024,
    validation_data=(x_val, y_val),
    callbacks=[early_stop],
    verbose=1,
)
- 可视化结果
history_dict = history.history
history_dict.keys()
%matplotlib inline
import matplotlib.pyplot as plt
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
figure = plt.figure(figsize=(14, 5))
ax = figure.add_subplot(1, 2, 1)
# "bo" is for "blue dot"
ax.plot(epochs, loss, 'bo', label='Training loss')
# b is for "solid blue line"
ax.plot(epochs, val_loss, 'r', label='Validation loss')
ax.set_title('Training and validation loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
ax = figure.add_subplot(1, 2, 2)
# "bo" is for "blue dot"
ax.plot(epochs, acc, 'bo', label='Training Accuracy')
# b is for "solid blue line"
ax.plot(epochs, val_acc, 'r', label='Validation Accuracy')
ax.set_title('Training and validation accuracy')
ax.set_xlabel('Epochs')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()