Recurrent Neural Networks -- Automatic Classification of Chinese Documents (TensorFlow)

This article walks through implementing automatic classification of Chinese documents with TensorFlow. The overall workflow is:
(1) Build a one-to-one mapping between characters and integer ids, i.e. map every Chinese character to an id; build the same kind of lookup table for the labels.
(2) Build a generator that yields batches of input data and labels.
(3) Run the input ids through an embedding layer.
(4) Feed the embedded sequence into an LSTM recurrent network.
(5) Keep only the output of the last time step, compute the loss and accuracy, and define the optimizer.
(6) Train the model.
The full implementation follows:
1 Import the required libraries

import tensorflow as tf
import pandas as pd
from tensorflow.contrib import rnn
import numpy as np

2 Build the one-to-one lookup tables

# character -> id lookup; id 0 is reserved for padding, so character ids start at 1
with open('corpus.txt','r') as f:
    corpus = f.readlines()
corpus = ''.join(map(lambda x:x.strip(),corpus))
words = set(corpus)
word_to_int = {word:idx+1 for idx,word in enumerate(words)}
int_to_word = dict(zip(word_to_int.values(), word_to_int.keys()))

# label -> id lookup; label ids start at 0 so they can be fed directly to
# sparse_softmax_cross_entropy_with_logits (valid range [0, label_size))
with open('labels.pkl.txt','r') as f:
    labels = f.readlines()
labels = ''.join(map(lambda x:x.strip(),labels))
label = set(labels)
label_to_int = {lab:idx for idx,lab in enumerate(label)}
int_to_label = dict(zip(label_to_int.values(), label_to_int.keys()))

train_csv = pd.read_csv('./data/train.csv')
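
As a quick sanity check (not in the original post), the two tables should invert each other; the concrete ids depend entirely on the contents of corpus.txt and labels.pkl.txt:

ch = next(iter(words))                        # pick any character from the vocabulary
assert int_to_word[word_to_int[ch]] == ch     # round trip: char -> id -> char

lab = next(iter(label))                       # pick any label
assert int_to_label[label_to_int[lab]] == lab
print(len(word_to_int), len(label_to_int))    # vocabulary size and number of classes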

3 Set the hyperparameters

seq_len = 20                              # every document is truncated/padded to 20 characters
batch_size = 64
vocab_size = len(int_to_word) + 1         # +1 because id 0 is reserved for padding
hidden_size = 300                         # LSTM hidden state size
embed_dim = 300                           # embedding dimension
#lr = 0.01
label_size = len(int_to_label)            # number of classes
num_layer = 1                             # number of stacked LSTM layers
epochs = 100
n_batch = train_csv.shape[0]//batch_size  # number of full batches per epoch
show_every_epoch = 10

4 Generate batches

def transform(value):
    # map characters to ids; unseen characters fall back to the padding id 0
    values_int = [word_to_int.get(ch, 0) for ch in value]
    # truncate long documents and right-pad short ones to exactly seq_len ids
    if len(values_int) >= seq_len:
        return values_int[:seq_len]
    else:
        return values_int + [0]*(seq_len - len(values_int))

def gen_batch(train_csv, n_batch, batch_size):
    # yield (x_batch, y_batch) pairs of shape (batch_size, seq_len) and (batch_size,)
    for i in range(n_batch):
        x_s = train_csv.loc[i*batch_size:(i+1)*batch_size-1, 'text']
        y_s = train_csv.loc[i*batch_size:(i+1)*batch_size-1, 'label']

        x_batch = np.array(list(map(transform, x_s)))
        y_batch = np.array(list(map(label_to_int.get, y_s)))

        yield x_batch, y_batch
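
As a quick check (again, not in the original post), the generator can be peeked at like this; the shapes assume the hyperparameters above and that train.csv has 'text' and 'label' columns:

x_batch, y_batch = next(gen_batch(train_csv, n_batch, batch_size))
print(x_batch.shape)        # (64, 20): one row of seq_len character ids per document
print(y_batch.shape, y_batch[:5])   # (64,) plus the first few label ids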

5 Build the RNN

x_input = tf.placeholder(shape=[None,None], dtype=tf.int32, name='input')
y_output = tf.placeholder(shape=[None], dtype=tf.int32, name='label')

def get_init_cell(hidden_size, batch_size, num_layer):
    # stack num_layer BasicLSTMCells and create an all-zero initial state
    cells = [rnn.BasicLSTMCell(hidden_size) for _ in range(num_layer)]
    lstm_cell = rnn.MultiRNNCell(cells)
    init_state = lstm_cell.zero_state(batch_size, tf.float32)
    return lstm_cell, init_state

def build_rnn(x_input, hidden_size, batch_size, num_layer, embed_dim, vocab_size, seq_len):
    lstm_cell, init_state = get_init_cell(hidden_size, batch_size, num_layer)
    # embedding matrix: one embed_dim-dimensional vector per character id
    weight_emb = tf.Variable(tf.random_uniform([vocab_size, embed_dim], -1, 1), name='weight_emb')
    embed = tf.nn.embedding_lookup(weight_emb, x_input)
    # split the (batch, seq_len, embed_dim) tensor into a list of seq_len time steps
    x = tf.unstack(embed, seq_len, 1)

    lstm_out, final_states = tf.nn.static_rnn(lstm_cell, inputs=x, initial_state=init_state, dtype=tf.float32)
    return lstm_out, final_states, embed


lstm_out, final_states, embed = build_rnn(x_input, hidden_size, batch_size, num_layer, embed_dim, vocab_size, seq_len)

# classify using only the output of the last time step
logits = tf.contrib.layers.fully_connected(lstm_out[-1], label_size, activation_fn=None)
prob = tf.nn.softmax(logits)
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y_output))
accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.argmax(prob,1), tf.cast(y_output, tf.int64)), tf.float32))
optimizer = tf.train.AdamOptimizer().minimize(loss)
saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    writer = tf.summary.FileWriter('./graphs/class_dcument', sess.graph)

    for epoch in range(epochs):
        train_loss = 0
        train_acc = 0
        for x_batch, y_batch in gen_batch(train_csv, n_batch, batch_size):
            _, tmp_acc, tmp_loss = sess.run([optimizer, accuracy, loss],
                                            feed_dict={x_input: x_batch, y_output: y_batch})
            train_loss += tmp_loss
            train_acc += tmp_acc

        if epoch % show_every_epoch == 0:
            print('Epoch {}/{} train_loss {:.3f},train_acc {:.3f}'.format(
                epoch, epochs, train_loss/n_batch, train_acc/n_batch))

        # checkpoint once per epoch; close the writer only after training finishes
        saver.save(sess, './checkpoints/')

    writer.close()

Training results:
Epoch 0/100 train_loss 0.536,train_acc 0.820
Epoch 10/100 train_loss 0.091,train_acc 0.968
Epoch 20/100 train_loss 0.044,train_acc 0.984
Epoch 30/100 train_loss 0.035,train_acc 0.987
Epoch 40/100 train_loss 0.031,train_acc 0.989
Epoch 50/100 train_loss 0.030,train_acc 0.989
Epoch 60/100 train_loss 0.028,train_acc 0.990
Epoch 70/100 train_loss 0.029,train_acc 0.990
Epoch 80/100 train_loss 0.030,train_acc 0.989
Epoch 90/100 train_loss 0.033,train_acc 0.988
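
To actually use the trained classifier, the saved checkpoint can be restored and a new document run through the same graph. The sketch below is one possible way to do this, assuming the graph above has already been built in the same process; the example input text is purely illustrative, and since the graph was built with a fixed batch_size, the single document is tiled to fill a batch:

def classify(text, sess):
    # pad/truncate the text to seq_len ids, then repeat it to fill one batch
    ids = np.array([transform(text)] * batch_size)
    probs = sess.run(prob, feed_dict={x_input: ids})
    return int_to_label[int(np.argmax(probs[0]))]

with tf.Session() as sess:
    saver = tf.train.Saver()
    saver.restore(sess, tf.train.latest_checkpoint('./checkpoints/'))
    print(classify('这是一段待分类的中文文本', sess))   # hypothetical input document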
