"""Seq2seq autoencoder over character sequences.

Word2vec embeddings of the text sequences are used as the seq2seq model's
input and output; the trained intermediate representation serves as a text
feature vector for downstream tasks such as classification. Both the
encoder and the decoder are LSTMs.
"""
import tensorflow as tf
import numpy as np
import re
from gensim.models import Word2Vec
import pandas as pd
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings("ignore")
# Load the pre-trained word2vec character-embedding model.
model = Word2Vec.load('model/daixia_w2c_char_100.model')
# Hyperparameters
num_units = 256   # LSTM hidden-state size (encoder and decoder)
input_size = 100  # embedding dimension fed to the network (see make_batch)
batch_size = 5    # samples per training batch
vocab_size = 946  # number of characters in the dictionary (incl. EOS)
# Read the character dictionary: every character occurring in the medical
# records, plus the end-of-sequence symbol EOS (expected on the first line,
# so it maps to index 0).
def get_dict(filename='data/char_dict.txt'):
    """Load the character vocabulary, one character per line.

    Args:
        filename: path to the dictionary file (UTF-8, one char per line).

    Returns:
        (dict_char, dict_id): char -> index and index -> char mappings.
    """
    dict_char = {}
    dict_id = {}
    # `with` guarantees the handle is closed; the original leaked it.
    with open(filename, 'r', encoding='utf-8') as f:
        # Iterate over every line instead of a hard-coded 946 so the
        # function works for any vocabulary size.
        for i, line in enumerate(f):
            word = line.rstrip('\n')
            dict_char[word] = i
            dict_id[i] = word
    return dict_char, dict_id
# Read the character sequences of the medical records and yield training
# batches forever (infinite generator).
def get_batches(filename, dict_char, batch_size):
    """Yield (texts, targets) batches from a CSV file, cycling endlessly.

    Args:
        filename: CSV with a 'text' column of space-separated characters
            (a 'label' column may be present but is not used here).
        dict_char: char -> vocabulary-index mapping (see get_dict).
        batch_size: number of samples per yielded batch.

    Yields:
        (texts, targets): a list of character lists and the matching list
        of vocabulary-index lists, batch_size items each.
    """
    texts = []
    targets = []
    data = pd.read_csv(filename, delimiter=',', encoding='utf-8')
    for i in range(data.shape[0]):
        char_list = data['text'].loc[i].split(' ')
        texts.append(char_list)
        targets.append([dict_char[char] for char in char_list])
    # NOTE: the original also collected per-row labels and sequence lengths
    # here but never used or returned them; that dead work was removed.
    # Infinite batch generator: restart from the beginning when the next
    # batch would run past the end of the data set.
    i = 0
    while True:
        yield texts[i:i + batch_size], targets[i:i + batch_size]
        i = i + batch_size
        if i + batch_size > len(texts):
            i = 0
# Convert a batch of sequences to a zero-padded array in time-major layout.
def make_batch(texts, isTargets=False, max_sequence_length=None):
    """Pad a batch of sequences to equal length and transpose to time-major.

    For inputs (isTargets=False) every character is replaced by its word2vec
    embedding vector; for targets (isTargets=True) every entry is kept as a
    vocabulary index. Short sequences are zero-padded on the right.

    Args:
        texts: list of sequences (chars for inputs, int indices for targets).
        isTargets: choose the int-index layout instead of embeddings.
        max_sequence_length: pad length; defaults to the longest sequence.

    Returns:
        ndarray of shape (time, batch[, input_size]) — time-major, as
        required by dynamic_rnn(time_major=True).
    """
    lengths = [len(seq) for seq in texts]
    n_sequences = len(texts)
    if max_sequence_length is None:
        max_sequence_length = max(lengths)
    if isTargets:
        batch_major = np.zeros([n_sequences, max_sequence_length], dtype=np.int32)
        for row, seq in enumerate(texts):
            for col, index in enumerate(seq):
                batch_major[row, col] = index
    else:
        batch_major = np.zeros([n_sequences, max_sequence_length, input_size], dtype=np.float32)
        for row, seq in enumerate(texts):
            for col, char in enumerate(seq):
                # Look up the pre-trained word2vec embedding for this char.
                batch_major[row, col] = model[char]
    # Swap (batch, time, ...) -> (time, batch, ...).
    return batch_major.swapaxes(0, 1)
# Build the training computation graph (TensorFlow 1.x style).
train_graph = tf.Graph()
with train_graph.as_default():
    # Time-major placeholders: encoder/decoder inputs carry embedding
    # vectors of size input_size; targets carry vocabulary indices.
    encoder_inputs = tf.placeholder(shape=[None, batch_size, input_size], dtype=tf.float32, name='encoder_inputs')
    decoder_inputs = tf.placeholder(shape=[None, batch_size, input_size], dtype=tf.float32, name='decoder_inputs')
    decoder_targets = tf.placeholder(shape=(None, None), dtype=tf.int32, name='decoder_targets')
    # LSTM encoder
    encoder_cell = tf.contrib.rnn.LSTMCell(num_units)
    encoder_outputs, encoder_final_state = tf.nn.dynamic_rnn(
        encoder_cell, encoder_inputs,
        dtype=tf.float32, time_major=True,
    )
    # LSTM decoder, initialized from the encoder's final state — that state
    # is the learned text representation this model is trained to produce.
    decoder_cell = tf.contrib.rnn.LSTMCell(num_units)
    decoder_outputs, decoder_final_state = tf.nn.dynamic_rnn(
        decoder_cell, decoder_inputs,
        initial_state=encoder_final_state,
        dtype=tf.float32, time_major=True, scope="plain_decoder",
    )
    # Output projection to vocabulary logits; argmax over axis 2 (vocab)
    # gives the predicted character index per time step.
    decoder_logits = tf.contrib.layers.linear(decoder_outputs, vocab_size)
    decoder_prediction = tf.argmax(decoder_logits, 2)
    stepwise_cross_entropy = tf.nn.softmax_cross_entropy_with_logits(
        labels=tf.one_hot(decoder_targets, depth=vocab_size, dtype=tf.float32),
        logits=decoder_logits,
    )
    # Accuracy: fraction of time steps where the predicted index matches
    # the target (padding positions are counted too).
    correct_prediction = tf.equal(decoder_prediction,
                                  tf.argmax(tf.one_hot(decoder_targets, depth=vocab_size, dtype=tf.float32), 2))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    # Cross-entropy loss averaged over all time steps and batch items.
    loss = tf.reduce_mean(stepwise_cross_entropy)
    # Optimizer (Adam with default learning rate).
    train_op = tf.train.AdamOptimizer().minimize(loss)
    # Saver for checkpointing the trained model.
    saver = tf.train.Saver()
if __name__ == '__main__':
    # Track the loss per step for plotting at the end.
    loss_track = []
    epochs = 10001
    # Load the character dictionary.
    dict_char, dict_id = get_dict()
    # Infinite batch generator over the training CSV.
    gen_batches = get_batches('data/data_char.csv', dict_char, batch_size)
    # Open a session on the training graph.
    with tf.Session(graph=train_graph) as sess:
        sess.run(tf.global_variables_initializer())
        time_start = time.time()
        for epoch in range(epochs):
            batch = next(gen_batches)
            texts = batch[0]
            targets = batch[1]
            # EOS is the end-of-sequence symbol, index 0 in the dictionary.
            # Decoder inputs are the text shifted right by a leading EOS;
            # decoder targets are the text followed by a trailing EOS (0).
            encoder_inputs_ = make_batch(texts)
            decoder_inputs_ = make_batch([['EOS'] + text for text in texts])
            decoder_targets_ = make_batch([target + [0] for target in targets], True, None)
            feed_dict = {encoder_inputs: encoder_inputs_, decoder_inputs: decoder_inputs_,
                         decoder_targets: decoder_targets_,
                         }
            _, l, acc = sess.run([train_op, loss, accuracy], feed_dict)
            loss_track.append(l)
            # Periodically show loss/accuracy and a few sample predictions.
            # NOTE(review): these extra sess.run calls recompute the batch
            # just for display; metrics from the training run are in l/acc.
            if epoch == 0 or epoch % 10 == 0:
                print('loss: {}'.format(sess.run(loss, feed_dict)))
                print('acc: {}'.format(sess.run(accuracy, feed_dict)))
                predict_ = sess.run(decoder_prediction, feed_dict)
                # predict_ is time-major; transpose to iterate per sample.
                for i, (inp, pred) in enumerate(zip(texts, predict_.T)):
                    print('input > {}'.format(inp))
                    print('predicted > {}'.format([dict_id[id] for id in pred]))
                    # Only show the first three samples of the batch.
                    if i >= 2:
                        break
        time_span = time.time() - time_start
        print('训练花费了{}'.format(time_span))
        # Save the trained model checkpoint.
        saver.save(sess, 'model/dl/model.ckpt')
    # Plot the loss curve.
    plt.plot(loss_track)
    plt.show()