The code here mainly follows this blog post, which explains things quite clearly: http://blog.csdn.net/u011439796/article/details/77692621
TensorFlow version:
In [33]: tf.__version__
Out[33]: '1.2.1'
First, fetch the data:
curl -O "http://thuctc.thunlp.org/source/THUCNews.zip"
The dataset mainly consists of news on topics such as 财经 (finance), 彩票 (lottery), 房产 (real estate), 股票 (stocks), and 家居 (home).
After downloading, unpack it with unzip, then sample data for 10 topics. Following the referenced blog, 6,500 articles are sampled per topic: 5,000 for training, 1,000 for testing, and 500 for validation.
The Python code for the sampling is below (the original blog uses a shell script; I rewrote it in Python):
import os
import glob
import random
import shutil

basepath = "/Users/shuubiasahi/Desktop/THUCNews/"
newpath = "/Users/shuubiasahi/Desktop/tensorflow/text/"

# Build the list of category directories, skipping hidden entries such as .DS_Store
listpath = [basepath + x + "/" for x in os.listdir(basepath) if not x.startswith(".")]

def copy(listpath, MAXCOUNT=6500):
    for path in listpath:
        newdir = newpath + path.split("/")[-2]
        print(newdir)
        if not os.path.exists(newdir):
            os.mkdir(newdir)
        files = glob.glob(path + "*.txt")
        if len(files) < MAXCOUNT:
            # Oversample with replacement. Note that duplicate picks keep the
            # same filename, so shutil.copy simply overwrites them and such a
            # category ends up with fewer than MAXCOUNT distinct files.
            resultlist = [random.choice(files) for _ in range(MAXCOUNT)]
        else:
            resultlist = random.sample(files, MAXCOUNT)
        for file in resultlist:
            shutil.copy(file, newdir)

if __name__ == '__main__':
    copy(listpath)
    print("Extraction finished")
Next, merge the data into single files, one sample per line, in the format:
label + "\t" + text (e.g. 体育\t<article text, with newlines and tabs removed>)
The data is split into a training set, a test set, and a validation set with the following code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Merge the individual text files into train, test, and val files.
"""
import os

basepath = "/Users/shuubiasahi/Desktop/tensorflow/text/"
trainpath = "/Users/shuubiasahi/Desktop/tensorflow/train/"

def _read_file(filename):
    """Read one file, stripping newlines, tabs, and full-width spaces (\u3000)."""
    with open(filename, 'r', encoding='utf-8') as f:
        return f.read().replace('\n', '').replace('\t', '').replace('\u3000', '')

def save_file(dirname):
    """
    Merge the files under dirname into three files.
    dirname: source data directory
    Output line format: label\tcontent
    """
    f_train = open(trainpath + "cnews.train.txt", 'w', encoding='utf-8')
    f_test = open(trainpath + "cnews.test.txt", 'w', encoding='utf-8')
    f_val = open(trainpath + "cnews.val.txt", 'w', encoding='utf-8')
    for category in os.listdir(dirname):
        catdir = os.path.join(dirname, category)
        if not os.path.isdir(catdir):
            continue
        files = os.listdir(catdir)
        print(len(files))
        count = 0
        for cur_file in files:
            filename = os.path.join(catdir, cur_file)
            content = _read_file(filename)
            if count < 5000:    # first 5000 files go to the training set
                f_train.write(category + "\t" + content + "\n")
            elif count < 6000:  # next 1000 go to the test set
                f_test.write(category + "\t" + content + "\n")
            else:               # remaining 500 go to the validation set
                f_val.write(category + '\t' + content + '\n')
            count += 1
        print("finish:", category)
    f_train.close()
    f_test.close()
    f_val.close()

if __name__ == '__main__':
    save_file(basepath)
    # With 10 full categories these should print 50000, 10000, and 5000 lines
    print(len(open(trainpath + "cnews.train.txt", 'r', encoding='utf-8').readlines()))
    print(len(open(trainpath + "cnews.test.txt", 'r', encoding='utf-8').readlines()))
    print(len(open(trainpath + "cnews.val.txt", 'r', encoding='utf-8').readlines()))
Notes on the preprocessing code:
_read_file(): reads a data file generated in the previous step and returns the contents and labels separately;
build_vocab(): builds the vocabulary. No word segmentation is needed here; single characters already work well. The function writes the vocabulary to disk so it does not have to be rebuilt on every run;
_read_vocab(): reads the stored vocabulary and converts it into a {word: id} mapping;
_read_category(): fixes the category list and converts it into a {category: id} mapping;
_file_to_ids(): uses the functions above to convert a dataset from text to id representation;
to_words(): converts one id-encoded sample back into text;
preocess_file(): processes all the data in one go and returns it;
batch_iter(): prepares batches of data for training the network.
#!/usr/bin/python
# -*- coding: utf-8 -*-
from collections import Counter
import tensorflow.contrib.keras as kr
import numpy as np
import os

trainpath = "/Users/shuubiasahi/Desktop/tensorflow/train/"

def _read_file(filename):
    """Read a data file and return (contents, labels)."""
    contents = []
    labels = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            try:
                label, content = line.strip().split('\t')
                contents.append(list(content))
                labels.append(label)
            except Exception:
                pass  # skip malformed lines
    return contents, labels

def build_vocab(filename, vocab_size=5000):
    """Build the character vocabulary from the training data and store it."""
    data, _ = _read_file(filename)
    all_data = []
    for content in data:
        all_data.extend(content)
    counter = Counter(all_data)
    count_pairs = counter.most_common(vocab_size - 1)
    words, _ = list(zip(*count_pairs))
    # Add <PAD> (id 0) so that all texts can be padded to the same length.
    # The vocabulary file has one character per line, most frequent first.
    words = ['<PAD>'] + list(words)
    open(trainpath + 'vocab_cnews.txt', 'w',
         encoding='utf-8').write('\n'.join(words))
def _read_vocab(filename):
    """Read the vocabulary list."""
    words = list(map(lambda line: line.strip(),
                     open(filename, 'r', encoding='utf-8').readlines()))
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id

def _read_category():
    """Return the fixed category list and its {category: id} mapping."""
    categories = ["财经", "彩票", "房产", "股票",
                  "家居", "教育", "科技", "社会", "时尚", "体育"]
    cat_to_id = dict(zip(categories, range(len(categories))))
    return categories, cat_to_id

def to_words(content, words):
    """Convert an id-encoded sample back into text."""
    return ''.join(words[x] for x in content)

def _file_to_ids(filename, word_to_id, max_len=600):
    """Convert a data file into id representation."""
    _, cat_to_id = _read_category()
    contents, labels = _read_file(filename)
    data_id = []
    label_id = []
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        label_id.append(cat_to_id[labels[i]])
    # Use Keras' pad_sequences to pad every text to a fixed length
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_len)
    # One-hot encode the labels
    y_pad = kr.utils.to_categorical(label_id)
    return x_pad, y_pad
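# Illustration (a sketch on toy values): pad_sequences pre-pads with 0, the
# <PAD> id, so the real text always sits at the end of the sequence, e.g.
#   kr.preprocessing.sequence.pad_sequences([[3, 7, 2]], 6) -> [[0 0 0 3 7 2]]
# and to_categorical one-hot encodes the labels, e.g.
#   kr.utils.to_categorical([1], 3) -> [[0. 1. 0.]]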
def preocess_file(data_path="/Users/shuubiasahi/Desktop/tensorflow/train/", seq_length=600):
    """Load and return all of the data in one go."""
    words, word_to_id = _read_vocab(os.path.join(data_path, 'vocab_cnews.txt'))
    x_train, y_train = _file_to_ids(os.path.join(data_path,
        'cnews.train.txt'), word_to_id, seq_length)
    x_test, y_test = _file_to_ids(os.path.join(data_path,
        'cnews.test.txt'), word_to_id, seq_length)
    x_val, y_val = _file_to_ids(os.path.join(data_path,
        'cnews.val.txt'), word_to_id, seq_length)
    return x_train, y_train, x_test, y_test, x_val, y_val, words

def batch_iter(data, batch_size=64, num_epochs=5):
    """Generate batches of data, reshuffling at the start of every epoch."""
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int((data_size - 1) / batch_size) + 1
    for epoch in range(num_epochs):
        indices = np.random.permutation(np.arange(data_size))
        shuffled_data = data[indices]
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]
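# Illustration (a sketch): the last batch of an epoch may be smaller, e.g.
# with 10 samples, batch_size=4 and num_epochs=1 this yields shuffled
# batches of sizes 4, 4 and 2:
#   for batch in batch_iter(list(range(10)), batch_size=4, num_epochs=1):
#       print(len(batch))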
if __name__ == '__main__':
    if not os.path.exists(os.path.join(trainpath, 'vocab_cnews.txt')):
        build_vocab(trainpath + 'cnews.train.txt')
    x_train, y_train, x_test, y_test, x_val, y_val, words = preocess_file()
    print(x_train.shape, y_train.shape)  # (num_samples, seq_length) (num_samples, num_classes)
    print(x_test.shape, y_test.shape)
    print(x_val.shape, y_val.shape)
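Since to_words() is never exercised by the listing above, a quick round trip makes a handy sanity check (a sketch, run after preocess_file(); the leading <PAD> tokens come from pre-padding):

    # Decode the first training sample back to characters; pre-padded
    # positions show up as '<PAD>' before the article text.
    print(to_words(x_train[0], words))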
Parameter configuration for the LSTM/GRU and CNN models:
#!/usr/bin/python
# -*- coding: utf-8 -*-

class TCNNConfig(object):
    """CNN configuration parameters"""
    # Model parameters
    embedding_dim = 64      # embedding dimension
    seq_length = 600        # sequence length
    num_classes = 10        # number of classes
    num_filters = 256       # number of convolution filters
    kernel_size = 5         # convolution kernel size
    vocab_size = 5000       # vocabulary size
    hidden_dim = 128        # fully connected layer size
    dropout_keep_prob = 0.8 # dropout keep probability
    learning_rate = 1e-3    # learning rate
    batch_size = 128        # training batch size
    num_epochs = 10         # total number of epochs
    print_per_batch = 100   # print results every this many batches

class TRNNConfig(object):
    """RNN configuration parameters"""
    # Model parameters
    embedding_dim = 64      # embedding dimension
    seq_length = 600        # sequence length
    num_classes = 10        # number of classes
    vocab_size = 5000       # vocabulary size
    num_layers = 2          # number of hidden layers
    hidden_dim = 128        # hidden layer size
    rnn = 'gru'             # 'lstm' or 'gru'
    dropout_keep_prob = 0.8 # dropout keep probability
    learning_rate = 1e-3    # learning rate
    batch_size = 128        # training batch size
    num_epochs = 100        # total number of epochs
    print_per_batch = 10    # print results every this many batches
The CNN code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import tensorflow as tf

class TextCNN(object):
    """CNN model for text classification"""
    def __init__(self, config):
        self.config = config
        self.input_x = tf.placeholder(tf.int32,
            [None, self.config.seq_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32,
            [None, self.config.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        self.cnn()

    def input_embedding(self):
        """Word embedding"""
        with tf.device('/cpu:0'):
            embedding = tf.get_variable('embedding',
                [self.config.vocab_size, self.config.embedding_dim])
            _input = tf.nn.embedding_lookup(embedding, self.input_x)
        return _input

    def cnn(self):
        """The CNN model"""
        embedding_inputs = self.input_embedding()
        with tf.name_scope("cnn"):
            # 1-D convolution followed by global max pooling
            conv = tf.layers.conv1d(embedding_inputs,
                self.config.num_filters, self.config.kernel_size, name='conv')
            # global max pooling: take the maximum over the time axis
            gmp = tf.reduce_max(conv, axis=[1], name='gmp')
        with tf.name_scope("score"):
            # Fully connected layer, followed by dropout and ReLU
            fc = tf.layers.dense(gmp, self.config.hidden_dim, name='fc1')
            fc = tf.contrib.layers.dropout(fc, self.keep_prob)
            fc = tf.nn.relu(fc)
            # Classifier; pred_y is only used for prediction and accuracy,
            # since the loss below applies softmax to the logits itself
            self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
            self.pred_y = tf.nn.softmax(self.logits)
        with tf.name_scope("loss"):
            # Cross-entropy loss
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy)
        with tf.name_scope("optimize"):
            # Optimizer
            optimizer = tf.train.AdamOptimizer(
                learning_rate=self.config.learning_rate)
            self.optim = optimizer.minimize(self.loss)
        with tf.name_scope("accuracy"):
            # Accuracy
            correct_pred = tf.equal(tf.argmax(self.input_y, 1),
                                    tf.argmax(self.pred_y, 1))
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
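A quick way to check that the graph wires up correctly is to build the model with a shrunken config and inspect the output shapes. This is a minimal sketch, assuming TCNNConfig and TextCNN are importable as in the run script below; the reduced sizes are only there to keep the check fast:

    import tensorflow as tf
    config = TCNNConfig()
    config.seq_length = 20   # shrink sizes just for this check
    config.vocab_size = 100
    model = TextCNN(config)
    print(model.logits.get_shape())  # (?, 10): one raw score per class
    print(model.pred_y.get_shape())  # (?, 10): softmax probabilities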
The RNN code:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import tensorflow as tf

class TextRNN(object):
    """RNN model for text classification"""
    def __init__(self, config):
        self.config = config
        self.input_x = tf.placeholder(tf.int32,
            [None, self.config.seq_length], name='input_x')
        self.input_y = tf.placeholder(tf.float32,
            [None, self.config.num_classes], name='input_y')
        self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
        self.rnn()

    def input_embedding(self):
        """Word embedding"""
        with tf.device('/cpu:0'):
            embedding = tf.get_variable('embedding',
                [self.config.vocab_size, self.config.embedding_dim])
            _inputs = tf.nn.embedding_lookup(embedding, self.input_x)
        return _inputs

    def rnn(self):
        """The RNN model"""
        def lstm_cell():
            """LSTM cell"""
            return tf.nn.rnn_cell.BasicLSTMCell(self.config.hidden_dim,
                                                state_is_tuple=True)

        def gru_cell():
            """GRU cell"""
            return tf.nn.rnn_cell.GRUCell(self.config.hidden_dim)

        def dropout():
            """Wrap each RNN cell with a dropout layer on its output"""
            if self.config.rnn == 'lstm':
                cell = lstm_cell()
            else:
                cell = gru_cell()
            return tf.contrib.rnn.DropoutWrapper(cell,
                output_keep_prob=self.keep_prob)

        embedding_inputs = self.input_embedding()
        with tf.name_scope("rnn"):
            # Multi-layer RNN; each layer needs its own fresh cell instance
            cells = [dropout() for _ in range(self.config.num_layers)]
            rnn_cell = tf.contrib.rnn.MultiRNNCell(cells, state_is_tuple=True)
            _outputs, _ = tf.nn.dynamic_rnn(cell=rnn_cell,
                inputs=embedding_inputs, dtype=tf.float32)
            # Take the output at the last timestep as the sequence
            # representation. This works because pad_sequences pre-pads,
            # so the final timestep always holds real text.
            last = _outputs[:, -1, :]
        with tf.name_scope("score"):
            # Fully connected layer, followed by dropout and ReLU
            fc = tf.layers.dense(last, self.config.hidden_dim, name='fc1')
            fc = tf.contrib.layers.dropout(fc, self.keep_prob)
            fc = tf.nn.relu(fc)
            # Classifier
            self.logits = tf.layers.dense(fc, self.config.num_classes,
                                          name='fc2')
            self.pred_y = tf.nn.softmax(self.logits)
        with tf.name_scope("loss"):
            # Cross-entropy loss
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
                logits=self.logits, labels=self.input_y)
            self.loss = tf.reduce_mean(cross_entropy)
        with tf.name_scope("optimize"):
            # Optimizer
            optimizer = tf.train.AdamOptimizer(
                learning_rate=self.config.learning_rate)
            self.optim = optimizer.minimize(self.loss)
        with tf.name_scope("accuracy"):
            # Accuracy
            correct_pred = tf.equal(tf.argmax(self.input_y, 1),
                                    tf.argmax(self.pred_y, 1))
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
The training script:
#!/usr/bin/python
# -*- coding: utf-8 -*-
from tensorflowexcise.cnn_model import TextCNN
from tensorflowexcise.rnn_model import TextRNN
from tensorflowexcise.configuration import TCNNConfig
from tensorflowexcise.configuration import TRNNConfig
from tensorflowexcise.cnews_loader import preocess_file, batch_iter, build_vocab
import time
import tensorflow as tf
import os
from datetime import timedelta

trainpath = "/Users/shuubiasahi/Desktop/tensorflow/train/"

def run_epoch(cnn=True):
    # Load the data
    print('Loading data...')
    start_time = time.time()
    if not os.path.exists(trainpath + 'vocab_cnews.txt'):
        build_vocab(trainpath + 'cnews.train.txt')
    x_train, y_train, x_test, y_test, x_val, y_val, words = preocess_file()

    if cnn:
        print('Using CNN model...')
        config = TCNNConfig()
        config.vocab_size = len(words)
        model = TextCNN(config)
        tensorboard_dir = '/Users/shuubiasahi/Desktop/tensorflow/board.log'
    else:
        print('Using RNN model...')
        config = TRNNConfig()
        config.vocab_size = len(words)
        model = TextRNN(config)
        tensorboard_dir = '/Users/shuubiasahi/Desktop/tensorflow/board.log'

    end_time = time.time()
    time_dif = end_time - start_time
    time_dif = timedelta(seconds=int(round(time_dif)))
    print('Time usage:', time_dif)

    print('Constructing TensorFlow Graph...')
    session = tf.Session()
    session.run(tf.global_variables_initializer())

    # Configure TensorBoard
    tf.summary.scalar("loss", model.loss)
    tf.summary.scalar("accuracy", model.acc)
    if not os.path.exists(tensorboard_dir):
        os.makedirs(tensorboard_dir)
    merged_summary = tf.summary.merge_all()
    writer = tf.summary.FileWriter(tensorboard_dir)
    writer.add_graph(session.graph)

    # Generate batches
    print('Generating batch...')
    batch_train = batch_iter(list(zip(x_train, y_train)),
                             config.batch_size, config.num_epochs)

    def feed_data(batch):
        """Build the feed_dict for one batch"""
        x_batch, y_batch = zip(*batch)
        feed_dict = {
            model.input_x: x_batch,
            model.input_y: y_batch
        }
        return feed_dict, len(x_batch)

    def evaluate(x_, y_):
        """
        Evaluate the model.
        Running all the data at once would OOM, so evaluate in
        batches and aggregate the results.
        """
        batch_eval = batch_iter(list(zip(x_, y_)), 128, 1)
        total_loss = 0.0
        total_acc = 0.0
        cnt = 0
        for batch in batch_eval:
            feed_dict, cur_batch_len = feed_data(batch)
            feed_dict[model.keep_prob] = 1.0  # no dropout during evaluation
            loss, acc = session.run([model.loss, model.acc],
                                    feed_dict=feed_dict)
            total_loss += loss * cur_batch_len
            total_acc += acc * cur_batch_len
            cnt += cur_batch_len
        return total_loss / cnt, total_acc / cnt

    # Training and validation
    print('Training and evaluating...')
    start_time = time.time()
    print_per_batch = config.print_per_batch
    for i, batch in enumerate(batch_train):
        feed_dict, _ = feed_data(batch)
        feed_dict[model.keep_prob] = config.dropout_keep_prob
        if i % 5 == 0:
            # Write training summaries to the TensorBoard scalars every 5 batches
            s = session.run(merged_summary, feed_dict=feed_dict)
            writer.add_summary(s, i)
        if i % print_per_batch == print_per_batch - 1:
            # Report performance on the training and validation sets
            # every print_per_batch batches
            loss_train, acc_train = session.run([model.loss, model.acc],
                                                feed_dict=feed_dict)
            loss, acc = evaluate(x_val, y_val)
            # Elapsed time
            end_time = time.time()
            time_dif = end_time - start_time
            time_dif = timedelta(seconds=int(round(time_dif)))
            msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},'\
                + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5}'
            print(msg.format(i + 1, loss_train, acc_train, loss, acc, time_dif))
        session.run(model.optim, feed_dict=feed_dict)  # run the optimizer

    # Finally, evaluate on the test set
    print('Evaluating on test set...')
    loss_test, acc_test = evaluate(x_test, y_test)
    msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
    print(msg.format(loss_test, acc_test))
    session.close()

if __name__ == '__main__':
    run_epoch(cnn=True)
Output of the run (CNN model):
Iter: 3100, Train Loss: 0.0043, Train Acc: 100.00%, Val Loss: 0.54, Val Acc: 87.42%, Time: 1:36:34
Iter: 3200, Train Loss: 0.0057, Train Acc: 100.00%, Val Loss: 0.55, Val Acc: 87.58%, Time: 1:38:34
Iter: 3300, Train Loss: 0.0068, Train Acc: 100.00%, Val Loss: 0.57, Val Acc: 87.58%, Time: 1:40:36
Iter: 3400, Train Loss: 0.0097, Train Acc: 100.00%, Val Loss: 0.62, Val Acc: 87.16%, Time: 1:42:40
Iter: 3500, Train Loss: 0.0093, Train Acc: 100.00%, Val Loss: 0.58, Val Acc: 87.68%, Time: 1:44:43
Iter: 3600, Train Loss: 0.0047, Train Acc: 100.00%, Val Loss: 0.57, Val Acc: 88.04%, Time: 1:46:45
Iter: 3700, Train Loss: 0.0037, Train Acc: 100.00%, Val Loss: 0.6, Val Acc: 87.74%, Time: 1:48:49
Iter: 3800, Train Loss: 0.0013, Train Acc: 100.00%, Val Loss: 0.6, Val Acc: 87.18%, Time: 1:50:54
Iter: 3900, Train Loss: 0.019, Train Acc: 99.22%, Val Loss: 0.63, Val Acc: 87.08%, Time: 1:52:59
Evaluating on test set...
Test Loss: 0.48, Test Acc: 90.77%
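Once training has finished, the same graph can classify a new piece of text. The snippet below is a minimal sketch of that inference path; it assumes a live session, model, and config as inside run_epoch above, reuses the loader helpers, and `text` stands in for any raw news string:

    import tensorflow.contrib.keras as kr
    from tensorflowexcise.cnews_loader import _read_vocab, _read_category

    words, word_to_id = _read_vocab(trainpath + 'vocab_cnews.txt')
    categories, _ = _read_category()

    text = "..."  # a raw news article
    # Encode characters as ids, pad to seq_length, then take the argmax class
    data_id = [[word_to_id[x] for x in text if x in word_to_id]]
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, config.seq_length)
    pred = session.run(model.pred_y,
                       feed_dict={model.input_x: x_pad, model.keep_prob: 1.0})
    print(categories[pred.argmax(axis=1)[0]])  # predicted category name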