demo7_cnn.py
import os
import sys
import time
from datetime import timedelta

import numpy as np
import tensorflow as tf
from sklearn import metrics

from grammer.cnn_model import *
from grammer.cnews_loader import *
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'  # silence TensorFlow INFO/WARNING log output
gpu_config = tf.ConfigProto()
gpu_config.gpu_options.allow_growth = True  # allocate GPU memory on demand
gpu_config.allow_soft_placement = True  # fall back to CPU if the pinned /gpu:0 device is unavailable
base_dir = './data/demo7_cnn'
train_dir = os.path.join(base_dir, 'cnews.train.txt')
test_dir = os.path.join(base_dir, 'cnews.test.txt')
val_dir = os.path.join(base_dir, 'cnews.val.txt')
vocab_dir = os.path.join(base_dir, 'cnews.vocab.txt')
save_dir = './data/demo7_cnn/checkpoints/textcnn'
save_path = os.path.join(save_dir, 'best_validation')
def get_time_dif(start_time):
end_time = time.time()
time_dif = end_time - start_time
return timedelta(seconds=int(round(time_dif)))
def feed_data(x_batch, y_batch, keep_prob):
feed_dict = {
model.input_x: x_batch,
model.input_y: y_batch,
model.keep_prob: keep_prob
}
return feed_dict
def evaluate(sess, x_, y_):
    """Evaluate loss and accuracy over a whole dataset, weighted by batch size."""
    data_len = len(x_)
batch_eval = batch_iter(x_, y_, 128)
total_loss = 0.0
total_acc = 0.0
for x_batch, y_batch in batch_eval:
batch_len = len(x_batch)
feed_dict = feed_data(x_batch, y_batch, 1.0)
loss, acc = sess.run([model.loss, model.acc], feed_dict=feed_dict)
total_loss += loss * batch_len
total_acc += acc * batch_len
return total_loss / data_len, total_acc / data_len
def train():
print("Configuring TensorBoard and Saver...")
tensorboard_dir = './data/demo7_cnn/tensorboard/textcnn'
if not os.path.exists(tensorboard_dir):
os.makedirs(tensorboard_dir)
tf.summary.scalar('loss', model.loss)
tf.summary.scalar('accuracy', model.acc)
merged_summary = tf.summary.merge_all()
writer = tf.summary.FileWriter(tensorboard_dir)
saver = tf.train.Saver()
if not os.path.exists(save_dir):
os.makedirs(save_dir)
print("Loading training and validation data...")
start_time = time.time()
x_train, y_train = process_file(train_dir, word_to_id, cat_to_id, config.seq_length)
x_val, y_val = process_file(val_dir, word_to_id, cat_to_id, config.seq_length)
time_dif = get_time_dif(start_time)
print("Time usage:", time_dif)
session = tf.Session(config=gpu_config)
session.run(tf.global_variables_initializer())
writer.add_graph(session.graph)
print('Training and evaluating...')
start_time = time.time()
    total_batch = 0
    best_acc_val = 0.0
    last_improved = 0  # batch index at which validation accuracy last improved
    require_improvement = 1000  # stop early after this many batches without improvement
    flag = False  # set when early stopping triggers, to break out of the epoch loop
for epoch in range(config.num_epochs):
print('Epoch:', epoch + 1)
batch_train = batch_iter(x_train, y_train, config.batch_size)
for x_batch, y_batch in batch_train:
feed_dict = feed_data(x_batch, y_batch, config.dropout_keep_prob)
if total_batch % config.save_per_batch == 0:
s = session.run(merged_summary, feed_dict=feed_dict)
writer.add_summary(s, total_batch)
if total_batch % config.print_per_batch == 0:
feed_dict[model.keep_prob] = 1.0
loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
loss_val, acc_val = evaluate(session, x_val, y_val)
if acc_val > best_acc_val:
best_acc_val = acc_val
last_improved = total_batch
saver.save(sess=session, save_path=save_path)
improved_str = '*'
else:
improved_str = ''
                time_dif = get_time_dif(start_time)
                msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                      ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
                print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))
            feed_dict[model.keep_prob] = config.dropout_keep_prob  # restore dropout for the actual training step
            session.run(model.optim, feed_dict=feed_dict)
total_batch += 1
            if (total_batch - last_improved) > require_improvement or best_acc_val > 0.98:
                # stop early when validation accuracy has not improved for a long time
                # (or is already above 98%)
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
if flag:
break
def test():
print("Loading test data...")
start_time = time.time()
x_test, y_test = process_file(test_dir, word_to_id, cat_to_id, config.seq_length)
session = tf.Session(config=gpu_config)
session.run(tf.global_variables_initializer())
saver = tf.train.Saver()
saver.restore(sess=session, save_path=save_path)
print('Testing...')
loss_test, acc_test = evaluate(session, x_test, y_test)
msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
print(msg.format(loss_test, acc_test))
batch_size = 128
data_len = len(x_test)
num_batch = int((data_len - 1) / batch_size) + 1
y_test_cls = np.argmax(y_test, 1)
y_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)
for i in range(num_batch):
start_id = i * batch_size
end_id = min((i + 1) * batch_size, data_len)
feed_dict = {
model.input_x: x_test[start_id:end_id],
model.keep_prob: 1.0
}
y_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)
print("Precision, Recall and F1-Score...")
print(metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories))
print("Confusion Matrix...")
cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
print(cm)
time_dif = get_time_dif(start_time)
print("Time usage:", time_dif)
if __name__ == '__main__':
config = TCNNConfig()
if not os.path.exists(vocab_dir):
build_vocab(train_dir, vocab_dir, config.vocab_size)
categories, cat_to_id = read_category()
words, word_to_id = read_vocab(vocab_dir)
config.vocab_size = len(words)
model = TextCNN(config)
    # pass 'train' or 'test' on the command line; defaults to 'test'
    option = sys.argv[1] if len(sys.argv) > 1 else 'test'
if option.strip() == 'train':
train()
else:
test()
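# Usage sketch (assumes the cnews data files already sit under ./data/demo7_cnn):
#   python demo7_cnn.py train   # train, checkpointing the best model on the validation set
#   python demo7_cnn.py test    # restore the best checkpoint and report test metrics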
cnn_model.py
import tensorflow as tf
class TCNNConfig(object):
    """Hyper-parameters for the TextCNN model."""
    embedding_dim = 64
seq_length = 600
num_classes = 10
num_filters = 256
kernel_size = 5
vocab_size = 5000
hidden_dim = 128
dropout_keep_prob = 0.5
learning_rate = 1e-3
batch_size = 64
num_epochs = 10
    print_per_batch = 100  # report train/val metrics every 100 batches
    save_per_batch = 10  # write a TensorBoard summary every 10 batches
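# Rough tensor shapes through TextCNN.cnn() under the default TCNNConfig above
# (a sketch for orientation only; assumes conv1d's default 'valid' padding):
#   input_x             [batch, 600]        token ids
#   embedding lookup    [batch, 600, 64]
#   conv1d (k=5, 256)   [batch, 596, 256]
#   global max pooling  [batch, 256]
#   dense + dropout     [batch, 128]
#   logits              [batch, 10]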
class TextCNN(object):
    """Character-level CNN for Chinese text classification."""
    def __init__(self, config):
        self.config = config
self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x')
self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y')
self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
self.cnn()
def cnn(self):
with tf.device('/gpu:0'):
embedding = tf.get_variable('embedding', [self.config.vocab_size, self.config.embedding_dim])
embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)
with tf.name_scope('cnn'):
            conv = tf.layers.conv1d(embedding_inputs, self.config.num_filters, self.config.kernel_size, name='conv')
            # global max pooling over the sequence dimension ('gmp')
            gmp = tf.reduce_max(conv, axis=1, name='gmp')
with tf.name_scope('score'):
fc = tf.layers.dense(gmp, self.config.hidden_dim, name='f1')
fc = tf.contrib.layers.dropout(fc, self.keep_prob)
fc = tf.nn.relu(fc)
self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)
with tf.name_scope('optimizer'):
cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(logits=self.logits, labels=self.input_y)
self.loss = tf.reduce_mean(cross_entropy)
self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)
with tf.name_scope('accuracy'):
correct_pred = tf.equal(tf.argmax(self.input_y,1), self.y_pred_cls)
self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
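# Minimal smoke test for the graph definition (a sketch, not part of the training script;
# assumes a TensorFlow 1.x environment and only checks static tensor shapes):
if __name__ == '__main__':
    cfg = TCNNConfig()
    m = TextCNN(cfg)
    print(m.input_x.shape, m.logits.shape)  # expected: (?, 600) (?, 10)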
cnews_loader.py
import sys
from collections import Counter
import numpy as np
import tensorflow.contrib.keras as kr
def read_file(filename):
    """Read a labeled file; each line is expected to be 'label<TAB>content'."""
    contents, labels = [], []
    with open(filename, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            try:
                label, content = line.strip().split('\t')
                if content:
                    contents.append(list(content))
                    labels.append(label)
            except ValueError:
                # skip malformed lines that do not split into exactly two fields
                continue
    return contents, labels
def build_vocab(train_dir, vocab_dir, vocab_size=5000):
    """Build a character-level vocabulary from the training set and save it to disk."""
    data_train, _ = read_file(train_dir)
    all_data = []
    for content in data_train:
        all_data.extend(content)
    counter = Counter(all_data)
    count_pairs = counter.most_common(vocab_size - 1)
    words, _ = list(zip(*count_pairs))
    # reserve id 0 for the padding token
    words = ['<PAD>'] + list(words)
    with open(vocab_dir, 'w', encoding='utf-8', errors='ignore') as f:
        f.write('\n'.join(words) + '\n')
def read_vocab(vocab_dir):
    """Read the vocabulary file and build a word-to-id mapping."""
    with open(vocab_dir, 'r', encoding='utf-8', errors='ignore') as fp:
        words = [line.strip() for line in fp.readlines()]
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id
def read_category():
    """Return the fixed list of news categories and their id mapping."""
categories = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']
cat_to_id = dict(zip(categories, range(0, len(categories))))
return categories, cat_to_id
def process_file(filename, word_to_id, cat_to_id, max_length=600):
    """Convert a labeled file into padded id sequences and one-hot label vectors."""
contents, labels = read_file(filename)
data_id, label_id = [], []
for i in range(len(contents)):
data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
label_id.append(cat_to_id[labels[i]])
x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)
y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))
return x_pad, y_pad
def batch_iter(x, y, batch_size=64):
    """Shuffle the data and yield it batch by batch."""
data_len = len(x)
num_batch = int((data_len - 1) / batch_size) + 1
indices = np.random.permutation(np.arange(data_len))
x_shuffle = x[indices]
y_shuffle = y[indices]
for i in range(num_batch):
start_id = i * batch_size
end_id = min((i + 1) * batch_size, data_len)
yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]
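# Sketch of the loading pipeline as demo7_cnn.py drives it (file names taken from that
# script; shown only to illustrate how these helpers fit together):
#   categories, cat_to_id = read_category()
#   words, word_to_id = read_vocab('cnews.vocab.txt')
#   x_train, y_train = process_file('cnews.train.txt', word_to_id, cat_to_id, max_length=600)
#   for x_batch, y_batch in batch_iter(x_train, y_train, batch_size=64):
#       ...  # feed each (x_batch, y_batch) pair to the model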