LSTM 文本分类模型的实现


1 构建计算图-LSTM模型
2 训练流程代码
3 数据集封装代码
    api: next_batch(batch_size)
4 词表封装代码
    api: sentence_to_id(text_sentence):句子->id
5 类别封装代码
    api: category_to_id(category)

import tensorflow as tf
import numpy as np
import math
import os
import sys


def get_default_params():
    """Return the default hyper-parameters for the LSTM text classifier.

    Returns:
        An object exposing every hyper-parameter as an attribute, which is
        how the rest of this file reads them (``hps.num_timesteps``,
        ``hps.num_lstm_nodes[-1]``, ...).
    """
    # Imported here so the function is self-contained; a plain namespace is
    # enough because callers only use attribute access.
    from types import SimpleNamespace

    return SimpleNamespace(
        num_embedding_size=16,    # length of each word vector (alternative: 32)
        num_timesteps=50,         # LSTM step count (alternative: 600)
        num_lstm_nodes=[32, 32],  # size of each LSTM layer (alternative: [64, 64])
        num_lstm_layers=2,        # number of LSTM layers
        num_fc_nodes=32,          # units in the fully connected layer (alternative: 64)
        batch_size=100,
        clip_lstm_grads=1.0,      # bound on the size of the LSTM gradients
        learning_rate=0.001,
        num_word_threshold=10,    # frequency threshold handed to Vocab
    )
hps = get_default_params()

# File names used below for the datasets, the vocabulary and the categories.
train_file = 'cnews.train.seg.txt'
val_file = 'cnews.val.seg.txt'
test_file = 'cnews.test.seg.txt'
vocab_file = 'cnews.vocab.txt'
category_file = 'cnews.category.txt'
output_folder = 'run_text_rnn'

# Make sure the output folder exists; exist_ok avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(output_folder, exist_ok=True)



class Vocab:
    """Word-to-id mapping loaded from a tab-separated vocabulary file.

    Every line of the file is ``word<TAB>frequency``. Words whose frequency
    is below ``num_word_threshold`` are skipped; the remaining words receive
    consecutive ids in file order.
    """

    def __init__(self, filename, num_word_threshold):
        """Load the vocabulary.

        Args:
            filename: path of the UTF-8 encoded vocabulary file.
            num_word_threshold: minimum frequency for a word to be kept.
        """
        self._word_to_id = {}
        # Id returned for unknown words; stays -1 unless the file contains
        # a '<UNK>' entry that passes the threshold.
        self._unk = -1
        self._num_word_threshold = num_word_threshold
        self._read_dict(filename)

    def _read_dict(self, filename):
        """Fill the word-to-id mapping from ``filename``."""
        with open(filename, 'r', encoding='UTF-8') as f:
            for line in f:
                word, frequency = line.strip('\r\n').split('\t')
                if int(frequency) < self._num_word_threshold:
                    # Too rare: leave it out so it maps to the unknown id.
                    continue
                idx = len(self._word_to_id)
                if word == '<UNK>':
                    self._unk = idx
                self._word_to_id[word] = idx

    def word_to_id(self, word):
        """Return the id of ``word``, or the unknown id if it is not stored."""
        return self._word_to_id.get(word, self._unk)

    def unk(self):
        """Return the id used for unknown words."""
        return self._unk

    def size(self):
        """Return the number of words kept in the vocabulary."""
        return len(self._word_to_id)

    def sentence_to_id(self, sentence):
        """Convert a whitespace-separated sentence into a list of word ids."""
        return [self.word_to_id(cur_word) for cur_word in sentence.split()]
vocab = Vocab(vocab_file, hps.num_word_threshold)
vocab_size = vocab.size()
tf.logging.info('vocab_size: %d' % vocab_size)

# Quick check: encode a short sample sentence and show the resulting ids.
test_str = '的 在 了 是'
print(vocab.sentence_to_id(test_str))

INFO:tensorflow:vocab_size: 77323
[2, 4, 6, 7]

class CategeoryDict:
    """Category-name-to-id mapping loaded from a file with one name per line.

    Ids are assigned in file order, starting at 0.
    """

    def __init__(self, filename):
        """Read the category names from the UTF-8 encoded ``filename``."""
        self._categeory_to_id = {}
        with open(filename, 'r', encoding='UTF-8') as f:
            for line in f:
                categeory = line.strip('\r\n')
                idx = len(self._categeory_to_id)
                self._categeory_to_id[categeory] = idx

    def categeory_to_id(self, categeory):
        """Return the id of ``categeory``.

        Raises:
            Exception: if ``categeory`` was not listed in the category file.
        """
        if categeory not in self._categeory_to_id:
            # Format the message with the argument itself; the previous code
            # referenced an undefined name and failed with NameError instead.
            raise Exception(
                "%s is not in our categeory list" % categeory)
        return self._categeory_to_id[categeory]

    def size(self):
        """Return the number of known categories."""
        return len(self._categeory_to_id)
categeory_vocab = CategeoryDict(category_file)
# Quick check: look up one sample label and report the number of classes.
test_str = '娱乐'
num_classes = categeory_vocab.size()
tf.logging.info('label: %s,id: %d' % (test_str, categeory_vocab.categeory_to_id(test_str)))
tf.logging.info('num_classes: %d' % num_classes)

INFO:tensorflow:label: 娱乐,id: 1
INFO:tensorflow:num_classes: 10

class TextDataSet:
    """In-memory dataset of fixed-length word-id sequences and label ids.

    Every line of the data file is ``label<TAB>content``. The content is
    converted to word ids and cut or padded to ``num_timesteps`` entries.
    """

    def __init__(self, filename, vocab, categeory_vocab, num_timesteps):
        """Load ``filename`` into memory.

        Args:
            filename: path of the UTF-8 encoded data file.
            vocab: object providing ``sentence_to_id`` and ``unk``.
            categeory_vocab: object providing ``categeory_to_id``.
            num_timesteps: length every id sequence is cut or padded to.
        """
        self._vocab = vocab
        self._categeory_vocab = categeory_vocab
        self._num_timesteps = num_timesteps
        self._inputs = []   # becomes an int32 matrix, one row per example
        self._outputs = []  # becomes an int32 vector of label ids
        self._indicator = 0  # start position of the next batch
        self._parse_file(filename)

    @staticmethod
    def _fit_length(id_words, length, pad_id):
        """Cut ``id_words`` to ``length`` entries, padding with ``pad_id``."""
        clipped = id_words[:length]
        return clipped + [pad_id] * (length - len(clipped))

    def _padding_id(self):
        """Return the id used for padding (the vocabulary's unknown id)."""
        unk = self._vocab.unk
        # Accept `unk` both as a plain method and as an attribute/property,
        # so padding always receives an integer id.
        return unk() if callable(unk) else unk

    def _parse_file(self, filename):
        """Read ``filename`` and build the input and output arrays."""
        tf.logging.info('Loading data from %s', filename)
        pad_id = self._padding_id()
        inputs = []
        outputs = []
        with open(filename, 'r', encoding='UTF-8') as f:
            for line in f:
                label, content = line.strip('\r\n').split('\t')
                id_label = self._categeory_vocab.categeory_to_id(label)
                id_words = self._vocab.sentence_to_id(content)
                inputs.append(
                    self._fit_length(id_words, self._num_timesteps, pad_id))
                outputs.append(id_label)
        # Convert to numpy arrays so batches can be taken by slicing.
        self._inputs = np.asarray(inputs, dtype=np.int32)
        self._outputs = np.asarray(outputs, dtype=np.int32)
        # Shuffle once after loading so batches do not follow file order.
        self._random_shuffle()

    def _random_shuffle(self):
        """Shuffle inputs and outputs with the same random permutation."""
        p = np.random.permutation(len(self._inputs))
        self._inputs = self._inputs[p]
        self._outputs = self._outputs[p]

    def next_batch(self, batch_size):
        """Return the next ``batch_size`` examples as ``(inputs, outputs)``.

        When fewer than ``batch_size`` examples remain, the data is
        reshuffled and reading restarts from the beginning.

        Raises:
            Exception: if ``batch_size`` exceeds the number of examples.
        """
        if batch_size > len(self._inputs):
            raise Exception("batch_size: %d is too large" % batch_size)
        end_indicator = self._indicator + batch_size
        if end_indicator > len(self._inputs):
            # Start a new pass in a fresh order.
            self._random_shuffle()
            self._indicator = 0
            end_indicator = batch_size
        batch_inputs = self._inputs[self._indicator:end_indicator]
        batch_outputs = self._outputs[self._indicator:end_indicator]
        self._indicator = end_indicator
        return batch_inputs, batch_outputs

# Build the train, validation and test datasets with the shared vocabulary,
# category mapping and sequence length.
train_dataset, val_dataset, test_dataset = [
    TextDataSet(data_path, vocab, categeory_vocab, hps.num_timesteps)
    for data_path in (train_file, val_file, test_file)
]


INFO:tensorflow:Loading data from cnews.train.seg.txt
INFO:tensorflow:Loading data from cnews.val.seg.txt
INFO:tensorflow:Loading data from cnews.test.seg.txt
(array([[ 128, 14848, 11, 37, 1575, 1755, 470, 75585, 14123,
1, 37, 2, 1755, 1575, 1071, 7958, 13470, 3,
261, 27, 1, 348, 1762, 255, 863, 293, 1575,
381, 6, 7160, 1, 32, 310, 11870, 1, 226,
27, 667, 539, 122, 6787, 2, 3332, 22879, 3,
268, 225, 5961, 9578, 1170],
[45395, 37, 8057, 5227, 87, 11, 13, 7105, 2,
4060, 17, 467, 16, 4, 5533, 4478, 7105, 2,
4060, 11, 25120, 2349, 11, 0, 5, 40814, 114,
0, 1079, 11, 191, 1382, 29, 8208, 2, 4060,
1, 25120, 2, 1505, 4, 37, 2451, 16530, 1,
7810, 4194, 40, 6958, 40]]), array([4, 4]))
(array([[ 1243, 623, 9001, 70, 27738, 74690, 11411, 1141, 23344,
61, 177, 4, 1243, 623, 90, 8, 7509, 9001,
9, 1, 2889, 9098, 8, 74690, 9, 3, 11564,
153, 11499, 1, 11207, 59713, 0, 2, 4957, 772,
29521, 282, 1, 55, 9098, 2, 9001, 70, 1760,
7, 17886, 2, 1, 457],
[ 467, 11, 12987, 104, 9659, 50830, 1039, 0, 24,
1378, 2320, 11, 22177, 3753, 1221, 235, 571, 483,
321, 50, 4, 324, 20038, 25, 254, 106, 638,
1047, 13465, 254, 53153, 2228, 1, 1140, 7, 26,
2786, 2228, 14316, 2, 12481, 1, 53, 25, 54,
3817, 87, 1042, 2, 638]]), array([2, 5]))
(array([[ 0, 4590, 1307, 1534, 172, 4780, 25640, 16712, 59,
4, 1005, 11305, 2, 37, 155, 1307, 217, 335,
396, 32, 1789, 9567, 29, 1, 0, 4590, 191,
1307, 1534, 17, 1607, 282, 8, 0, 9, 16,
512, 494, 1055, 4007, 3, 795, 44, 1, 0,
4590, 1613, 19384, 0, 173],
[ 261, 27, 130, 23, 1114, 10742, 1995, 8987, 2170,
2182, 17, 36349, 0, 978, 16, 0, 4438, 0,
0, 0, 0, 18606, 0, 10782, 12732, 17260, 0,
0, 1455, 0, 0, 0, 14071, 0, 0, 0,
61689, 16564, 65709, 0, 25097, 4129, 23181, 0, 1455,
1981, 0, 200, 2125, 0]]), array([2, 4]))

def create_model(hps, vocab_size, num_classes):
    num_timesteps = hps.num_timesteps
    batch_size = hps.batch_size
    inputs = tf.placeholder(tf.int32, (batch_size, num_timesteps))
    outputs = tf.placeholder(tf.int32, (batch_size,))
    #随机失活剩下的神经单元  keep_prob = 1-dropout
    keep_prob = tf.placeholder(tf.float32, name= 'keep_prob')
    #save training_step
    global_step = tf.Variable(tf.zeros([], tf.int64), name='global_step',trainable=False)
    embedding_initializer = tf.random_uniform_initializer(-1.0, 1.0)
    with tf.variable_scope('embedding', initializer=embedding_initializer):
        embeddings = tf.get_variable('embedding',[vocab_size, hps.num_embedding_size],tf.float32)
        embed_inputs = tf.nn.embedding_lookup(embeddings, inputs)#change inputs to embedding
    scale = (1.0 / math.sqrt(hps.num_embedding_size+hps.num_lstm_nodes[-1])) * 3.0
    lstm_init = tf.random_uniform_initializer(-scale, scale)
    def _generate_params_for_lstm_cell(x_size, h_size, bias_size):
        x_w = tf.get_variable('x_weights', x_size)
        h_w = tf.get_variable('h_weights', h_size)
        b = tf.get_variable('biases', bias_size, initializer=tf.constant_initializer(0.0))
        return x_w, h_w, b
    with tf.variable_scope('lstm_nn', initializer=lstm_init):
        with tf.variable_scope('inputs'):
            ix, ih, ib = _generate_params_for_lstm_cell(
                x_size=[hps.num_embedding_size, hps.num_lstm_nodes[0]],
                h_size=[hps.num_lstm_nodes[0], hps.num_lstm_nodes[0]],
                bias_size=[1, hps.num_lstm_nodes[0]]
        with tf.variable_scope('outputs'):
            ox, oh, ob = _generate_params_for_lstm_cell(
                x_size=[hps.num_embedding_size, hps.num_lstm_nodes[0]],
                h_size=[hps.num_lstm_nodes[0], hps.num_lstm_nodes[0]],
                bias_size=[1, hps.num_lstm_nodes[0]]
        with tf.variable_scope('forget'):
            fx, fh, fb = _generate_params_for_lstm_cell(
                x_size=[hps.num_embedding_size, hps.num_lstm_nodes[0]],
                h_size=[hps.num_lstm_nodes[0], hps.num_lstm_nodes[0]],
                bias_size=[1, hps.num_lstm_nodes[0]]
        with tf.variable_scope('memory'):
            cx, ch, cb = _generate_params_for_lstm_cell(
                x_size=[hps.num_embedding_size, hps.num_lstm_nodes[0]],
                h_size=[hps.num_lstm_nodes[0], hps.num_lstm_nodes[0]],
                bias_size=[1, hps.num_lstm_nodes[0]]
        state = tf.Variable(tf.zeros([batch_size, hps.num_lstm_nodes[0]]),trainable=False)
        h = tf.Variable(tf.zeros([batch_size, hps.num_lstm_nodes[0]]),trainable=False)
        for i in range(num_timesteps):
            embed_input = embed_inputs[:, i, :]
            embed_input = tf.reshape(embed_input, [batch_size, hps.num_embedding_size])
            forget_gate = tf.sigmoid(tf.matmul(embed_input, fx) + tf.matmul(h, fh) + fb)
            input_gate = tf.sigmoid(tf.matmul(embed_input, ix) + tf.matmul(h, ih) + ib)
            output_gate = tf.sigmoid(tf.matmul(embed_input, ox) + tf.matmul(h, oh) + ob)
            mid_state = tf.tanh(tf.matmul(embed_input, cx) + tf.matmul(h, ch) + cb)
            state = mid_state * input_gate + state * forget_gate
            h = output_gate * tf.tanh(state)
            last = h
        cells = []
        for i in range(hps.num_lstm_layers):
            cell = tf.contrib.rnn.BasicLSTMCell(hps.num_lstm_nodes[i], state_is_tuple = True)
            cell = tf.contrib.rnn.DropoutWrapper(cell,output_keep_prob=keep_prob)
        cell = tf.contrib.rnn.MultiRNNCell(cells)#Cell is 多层LSTM
        initial_state = cell.zero_state(batch_size,tf.float32) #初始化隐藏状态为0
        #    一维:batch_size
        #    二维:num_timesteps
        #    三维:lstm_outputs[-1]
        rnn_outputs, _ = tf.nn.dynamic_rnn(cell,embed_inputs,initial_state=initial_state)
        last = rnn_outputs[:, -1, : ]
    fc_init = tf.uniform_unit_scaling_initializer(factor=1.0)
    with tf.variable_scope('fc', initializer=fc_init):
        fc1 = tf.layers.dense(last, hps.num_fc_nodes, activation=tf.nn.relu, name='fc1')
        fc1_dropout = tf.contrib.layers.dropout(fc1, keep_prob)
        logits = tf.layers.dense(fc1_dropout, num_classes, name='fc2')
    with tf.name_scope('metrics'):
        softmax_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=outputs)
        loss = tf.reduce_mean(softmax_loss)
        y_pred = tf.argmax(tf.nn.softmax(logits), 1, output_type=tf.int32)
        correct_pred = tf.equal(outputs, y_pred)
        accuary = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
