本文主要讲解使用tensorflow实现中文文档的自动归类,首先总结一下主要的工作流程:
(1)首先我们需要建立文字和数值之间的一一映射字典,也就是将每一个汉字映射为一个数值;同时对于label我们也同样构建一个一一映射表
(2)我们构建生成器,生成label和输入数据,
(3)对输入数据进行一个embedding编码,
(4)将embedding得到的结果输入循环神经网络lstm中
(5)输出的结果我们只取最后一个。计算loss,和acc,以及定义optimizer
(6)训练模型
具体代码实现如下:
1 导入相关库
import tensorflow as tf
import pandas as pd
from tensorflow.contrib import rnn
import numpy as np
2 以下是构建一对一映射表
# Build the char<->id and label<->id lookup tables from the corpus files.
# The files contain Chinese text, so read them with an explicit UTF-8
# encoding instead of relying on the platform default (which breaks on
# Windows, where the default is often GBK/cp1252).
with open('corpus.txt', 'r', encoding='utf-8') as f:
    corpus_lines = f.readlines()
corpus = ''.join(line.strip() for line in corpus_lines)
words = set(corpus)
# Ids start at 1 so that 0 stays free as the padding id used by transform().
word_to_int = {word: idx + 1 for idx, word in enumerate(words)}
int_to_word = {idx: word for word, idx in word_to_int.items()}

with open('labels.pkl.txt', 'r', encoding='utf-8') as f:
    label_lines = f.readlines()
labels = ''.join(line.strip() for line in label_lines)
label = set(labels)
label_to_int = {lab: idx + 1 for idx, lab in enumerate(label)}
int_to_label = {idx: lab for lab, idx in label_to_int.items()}

# Training data: expected to have 'text' and 'label' columns (see gen_batch).
train_csv = pd.read_csv('./data/train.csv')
3 设置参数
# --- Hyper-parameters ---
seq_len = 20           # every text is truncated / zero-padded to this length
batch_size = 64
# Bug fix: word ids run 1..len(int_to_word) with 0 reserved for padding,
# so the embedding table needs len+1 rows; with the previous
# vocab_size = len(int_to_word), the largest id indexed out of range
# (silently returning zeros on GPU, raising on CPU).
vocab_size = len(int_to_word) + 1
hidden_size = 300      # LSTM hidden units
embed_dim = 300        # character embedding width
#lr = 0.01
label_size = len(int_to_label)   # number of distinct classes
num_layer = 1          # stacked LSTM layers
epochs = 100
n_batch = train_csv.shape[0] // batch_size  # full batches per epoch (remainder dropped)
show_every_epoch = 10  # print metrics every N epochs
4 生成batch
def transform(value, mapping=None, max_len=None):
    """Encode a text into a fixed-length list of integer ids.

    Characters missing from the vocabulary map to 0 — the same id used
    for padding — so out-of-vocabulary input can no longer produce None
    entries (the original dict.get with no default returned None for
    unseen characters, which breaks np.array / embedding_lookup).

    Args:
        value: iterable of characters (typically a string).
        mapping: char->int dict; defaults to module-level word_to_int.
        max_len: target length; defaults to module-level seq_len.

    Returns:
        list[int] of exactly max_len ids, truncated or zero-padded.
    """
    if mapping is None:
        mapping = word_to_int
    if max_len is None:
        max_len = seq_len
    values_int = [mapping.get(ch, 0) for ch in value]
    if len(values_int) >= max_len:
        return values_int[:max_len]
    return values_int + [0] * (max_len - len(values_int))
def gen_batch(train_csv, n_batch, batch_size):
    """Yield (x_batch, y_batch) pairs of encoded texts and label ids.

    Uses positional indexing (.iloc, end-exclusive) instead of the
    original .loc label slicing, which only happens to work when the
    DataFrame carries the default 0..n-1 RangeIndex; .iloc is correct
    for any index (e.g. after shuffling or filtering rows).

    Args:
        train_csv: DataFrame with 'text' and 'label' columns.
        n_batch: number of full batches to produce.
        batch_size: rows per batch.

    Yields:
        x_batch: np.ndarray (batch_size, seq_len) of word ids.
        y_batch: np.ndarray (batch_size,) of label ids.
    """
    for i in range(n_batch):
        start = i * batch_size
        stop = start + batch_size
        x_s = train_csv.iloc[start:stop]['text']
        y_s = train_csv.iloc[start:stop]['label']
        x_batch = np.array(list(map(transform, x_s)))
        y_batch = np.array(list(map(label_to_int.get, y_s)))
        yield x_batch, y_batch
5 构建rnn网络
# Graph inputs: a batch of id-encoded texts and their sparse label ids.
# x_input: int32 [batch, time]; y_output: int32 [batch] (class ids, not one-hot).
x_input = tf.placeholder(shape=[None,None],dtype=tf.int32,name='input')
y_output = tf.placeholder(shape=[None],dtype=tf.int32,name='label')
def get_init_cell(hidden_size, batch_size, num_layer):
    """Build a stacked LSTM cell plus its all-zeros initial state.

    Returns:
        (MultiRNNCell of num_layer BasicLSTMCells, zero state for batch_size).
    """
    layers = [rnn.BasicLSTMCell(hidden_size) for _ in range(num_layer)]
    stacked_cell = rnn.MultiRNNCell(layers)
    zero_state = stacked_cell.zero_state(batch_size, tf.float32)
    return stacked_cell, zero_state
def bulit_rnn(x_input, hidden_size, batch_size, num_layer, embed_dim, vocab_size, seq_len):
    """Embed word ids and run the sequence through a static (unrolled) LSTM.

    Note: the misspelled name ("bulit") is kept because the call site below
    uses it; renaming would break callers.

    Args:
        x_input: int32 tensor [batch, seq_len] of word ids.
        hidden_size: LSTM hidden units.
        batch_size: rows per batch (fixes the initial-state shape).
        num_layer: number of stacked LSTM layers.
        embed_dim: embedding width.
        vocab_size: embedding table rows (must exceed the largest word id).
        seq_len: number of unrolled time steps.

    Returns:
        lstm_out: list of seq_len tensors [batch, hidden_size].
        final_states: final LSTM state tuple.
        embed: embedded inputs [batch, seq_len, embed_dim].
    """
    lstm_cell, init_state = get_init_cell(hidden_size, batch_size, num_layer)
    # Bug fix: the embedding width previously hard-coded hidden_size and
    # silently ignored the embed_dim parameter; use embed_dim as intended
    # (both are 300 in this script, so training behavior is unchanged).
    weight_emb = tf.Variable(tf.random_uniform([vocab_size, embed_dim], -1, 1), name='weight_emb')
    embed = tf.nn.embedding_lookup(weight_emb, x_input)
    # static_rnn expects a length-seq_len list of [batch, embed_dim] tensors.
    x = tf.unstack(embed, seq_len, 1)
    lstm_out, final_states = tf.nn.static_rnn(lstm_cell, inputs=x, initial_state=init_state, dtype=tf.float32)
    return lstm_out, final_states, embed
# Assemble the classification graph on top of the LSTM.
lstm_out,final_states,embed = bulit_rnn(x_input,hidden_size,batch_size,num_layer,embed_dim,vocab_size,seq_len)
# Classify from the LAST time step's output only (lstm_out[-1]).
logits = tf.contrib.layers.fully_connected(lstm_out[-1],label_size,activation_fn=None)
prob = tf.nn.softmax(logits)
#y_true = tf.one_hot(y_output,depth=len(label_to_int))
# sparse_* variant takes integer class ids directly — no one-hot needed.
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits,labels=y_output))
accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(prob,1),tf.cast(y_output,tf.int64)),tf.float32))
# Adam with its default learning rate (1e-3); the lr constant above is unused.
optimizer = tf.train.AdamOptimizer().minimize(loss)
# Train the classifier, reporting mean loss/accuracy every few epochs.
# (Indentation reconstructed — the original paste lost it.)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter('./graphs/class_dcument', sess.graph)
    # Create the Saver once, up front, rather than at save time: building it
    # late can miss variables added afterwards and repeats graph work.
    saver = tf.train.Saver()
    for epoch in range(epochs):
        train_loss = 0
        train_acc = 0
        for x_batch, y_batch in gen_batch(train_csv, n_batch, batch_size):
            _, tmp_acc, tmp_loss = sess.run(
                [optimizer, accuracy, loss],
                feed_dict={x_input: x_batch, y_output: y_batch})
            train_loss += tmp_loss
            train_acc += tmp_acc
        if epoch % show_every_epoch == 0:
            # Report per-batch averages for the epoch just finished.
            print('Epoch {}/{} train_loss {:.3f},train_acc {:.3f}'.format(epoch,epochs,train_loss/n_batch,\
train_acc/n_batch))
    saver.save(sess, './checkpoints/')
    writer.close()
训练结果如下:
Epoch 0/100 train_loss 0.536,train_acc 0.820
Epoch 10/100 train_loss 0.091,train_acc 0.968
Epoch 20/100 train_loss 0.044,train_acc 0.984
Epoch 30/100 train_loss 0.035,train_acc 0.987
Epoch 40/100 train_loss 0.031,train_acc 0.989
Epoch 50/100 train_loss 0.030,train_acc 0.989
Epoch 60/100 train_loss 0.028,train_acc 0.990
Epoch 70/100 train_loss 0.029,train_acc 0.990
Epoch 80/100 train_loss 0.030,train_acc 0.989
Epoch 90/100 train_loss 0.033,train_acc 0.988