词汇表中所有单词的词向量矩阵维度:[VOCAB_SIZE,EMB_SIZE]
每个batch中的输入维度input_data:[batch_size,num_steps]
经tf.nn.embedding_lookup提取词向量后,输入维度input_embedding:[batch_size,num_steps,EMB_SIZE]
在训练的每一个时刻,输入的维度[batch_size,EMB_SIZE],输出的维度[batch_size,hidden_size]
共有num_steps个时刻
经reshape后循环模型的输出维度[batch_size*num_steps,hidden_size]
在Softmax层,输出维度[batch_size*num_steps,VOCAB_SIZE]
具体代码如下:
import numpy as np
import tensorflow as tf
# Data files for training, validation and testing.
TRAIN_DATA = "ptb.train"
EVAL_DATA = "ptb.valid"
TEST_DATA = "ptb.test"

# Network structure.
HIDDEN_SIZE = 300  # size of the hidden layer
NUM_LAYERS = 2  # number of stacked LSTM layers in the deep RNN
VOCAB_SIZE = 10000  # vocabulary size
SHARE_EMB_AND_SOFTMAX = True  # share parameters between the softmax layer and the embedding layer

# Regularisation and optimisation.
LSTM_KEEP_PROB = 0.9  # probability that an LSTM node is NOT dropped out
EMBEDDING_KEEP_PROB = 0.9  # probability that a word vector is NOT dropped out
MAX_GRAD_NORM = 5  # upper bound on the gradient norm, used to control exploding gradients

# Batch geometry for training and for evaluation.
TRAIN_BATCH_SIZE = 20
TRAIN_NUM_STEP = 35
EVAL_BATCH_SIZE = 1
EVAL_NUM_STEP = 1

NUM_EPOCH = 5
class PTBModel():
    """LSTM language model whose graph is unrolled for a fixed number of steps.

    Constructing an instance builds the placeholders, a NUM_LAYERS-deep LSTM,
    the softmax layer and the loss in the current default graph and variable
    scope. In training mode it additionally builds ``train_op``.

    Attributes set by the constructor:
        batch_size, num_steps: the values passed in.
        input_data, targets: int32 placeholders, both [batch_size, num_steps].
        initial_state: all-zero state of the stacked LSTM.
        final_state: state of the stacked LSTM after the last time step.
        cost: summed cross entropy of the batch divided by batch_size.
        train_op: one gradient-descent update with clipped gradients
            (only present when ``is_training`` is true).
    """

    def __init__(self, is_training, batch_size, num_steps):
        """Build the model graph.

        Args:
            is_training: when true, dropout is enabled and ``train_op`` is built.
            batch_size: number of sequences per batch.
            num_steps: number of time steps the LSTM is unrolled for.
        """
        self.batch_size = batch_size
        self.num_steps = num_steps

        # Input word ids for every step and the ids expected as output.
        self.input_data = tf.placeholder(tf.int32, [batch_size, num_steps])
        self.targets = tf.placeholder(tf.int32, [batch_size, num_steps])

        # Dropout on the LSTM outputs is only active while training.
        if is_training:
            lstm_keep_prob = LSTM_KEEP_PROB
        else:
            lstm_keep_prob = 1.0
        layers = []
        for _ in range(NUM_LAYERS):
            base_cell = tf.nn.rnn_cell.BasicLSTMCell(HIDDEN_SIZE)
            layers.append(tf.nn.rnn_cell.DropoutWrapper(
                base_cell, output_keep_prob=lstm_keep_prob))
        cell = tf.nn.rnn_cell.MultiRNNCell(layers)

        # The state fed to the first step is an all-zero vector.
        self.initial_state = cell.zero_state(batch_size, tf.float32)

        # Embedding matrix: one HIDDEN_SIZE-wide vector per vocabulary entry.
        embedding = tf.get_variable("embedding", [VOCAB_SIZE, HIDDEN_SIZE])

        # Word ids -> word vectors, shape [batch_size, num_steps, HIDDEN_SIZE].
        embedded = tf.nn.embedding_lookup(embedding, self.input_data)
        if is_training:
            embedded = tf.nn.dropout(embedded, EMBEDDING_KEEP_PROB)

        # Run the cell one step at a time, collecting num_steps outputs of
        # shape [batch_size, HIDDEN_SIZE].
        step_outputs = []
        state = self.initial_state
        with tf.variable_scope("RNN"):
            for t in range(num_steps):
                # The LSTM variables are declared at the first step; every
                # later step must reuse them.
                if t > 0:
                    tf.get_variable_scope().reuse_variables()
                step_output, state = cell(embedded[:, t, :], state)
                step_outputs.append(step_output)
        self.final_state = state

        # Flatten the collected outputs to [batch_size * num_steps, HIDDEN_SIZE].
        flat_output = tf.reshape(tf.concat(step_outputs, 1), [-1, HIDDEN_SIZE])

        # Softmax layer; its weights are either the transposed embedding
        # matrix or a separate variable.
        weight = (tf.transpose(embedding) if SHARE_EMB_AND_SOFTMAX
                  else tf.get_variable("weight", [HIDDEN_SIZE, VOCAB_SIZE]))
        bias = tf.get_variable("bias", [VOCAB_SIZE])
        # logits: [batch_size * num_steps, VOCAB_SIZE]
        logits = tf.matmul(flat_output, weight) + bias

        flat_targets = tf.reshape(self.targets, [-1])
        token_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=flat_targets, logits=logits)
        # The summed loss is divided by batch_size only; run_epoch divides the
        # accumulated cost by the number of steps seen, which yields the
        # average log(perplexity).
        self.cost = tf.reduce_sum(token_loss) / batch_size

        if is_training:
            # Gradients are computed and clipped by hand, so only the "apply"
            # half of Optimizer.minimize is needed afterwards.
            variables = tf.trainable_variables()
            raw_grads = tf.gradients(self.cost, variables)
            clipped_grads, _ = tf.clip_by_global_norm(raw_grads, MAX_GRAD_NORM)
            sgd = tf.train.GradientDescentOptimizer(learning_rate=1.0)
            self.train_op = sgd.apply_gradients(zip(clipped_grads, variables))
#返回全部数据上的perplexity
def run_epoch(session, model, batches, train_op, output_log, step):
    """Run `model` once over all `batches` and return the resulting perplexity.

    Args:
        session: object whose ``run(fetches, feed_dict)`` evaluates the graph.
        model: provides ``cost``, ``initial_state``, ``final_state``,
            ``input_data``, ``targets`` and ``num_steps``.
        batches: iterable of ``(inputs, targets)`` pairs.
        train_op: operation run together with the cost on every batch.
        output_log: when true, print the running perplexity whenever the
            step counter is a multiple of 100.
        step: step counter value before the first batch.

    Returns:
        ``(step, perplexity)``: the counter advanced by one per batch and
        ``exp(accumulated cost / accumulated num_steps)``.
    """
    fetches = [model.cost, model.final_state, train_op]
    cost_sum = 0.0
    steps_seen = 0
    # Start from the model's initial state; every batch is then fed the final
    # state produced by the batch before it.
    carried_state = session.run(model.initial_state)
    for inputs, targets in batches:
        feed = {
            model.input_data: inputs,
            model.targets: targets,
            model.initial_state: carried_state,
        }
        batch_cost, carried_state, _ = session.run(fetches, feed)
        cost_sum += batch_cost
        steps_seen += model.num_steps
        if output_log and step % 100 == 0:
            # np.exp turns the average log(perplexity) into a perplexity.
            print("After %d steps,perplexity is %.3f" %
                  (step, np.exp(cost_sum / steps_seen)))
        step += 1
    return step, np.exp(cost_sum / steps_seen)
def read_data(file_path):
    """Read a text file of whitespace-separated integers into one flat list.

    Line breaks are treated like any other whitespace, so the ids of all
    lines end up in a single list in file order.
    """
    with open(file_path, "r") as fin:
        tokens = fin.read().split()
    return list(map(int, tokens))
def make_batches(id_list, batch_size, num_step):
    """Cut a flat list of word ids into (input, label) batches.

    The ids are laid out as ``batch_size`` rows and split column-wise into
    pieces of ``num_step`` columns. Labels are the same ids shifted by one
    position, which is why one extra id beyond the last input is required.

    Args:
        id_list: flat sequence of word ids.
        batch_size: number of rows in every batch.
        num_step: number of columns (time steps) in every batch.

    Returns:
        A list of ``(data, label)`` pairs of arrays, each of shape
        [batch_size, num_step].

    Raises:
        ValueError: if ``batch_size`` or ``num_step`` is not positive, or if
            ``id_list`` is too short to form a single batch plus its label.
    """
    if batch_size <= 0 or num_step <= 0:
        raise ValueError(
            "batch_size and num_step must be positive, got %r and %r"
            % (batch_size, num_step))

    num_batches = (len(id_list) - 1) // (batch_size * num_step)
    if num_batches < 1:
        # Without this check np.split fails with an opaque error.
        raise ValueError(
            "need at least %d ids to build one batch, got %d"
            % (batch_size * num_step + 1, len(id_list)))

    span = num_batches * batch_size * num_step

    def _to_batches(start):
        # Reshape span ids beginning at `start` into rows, then split columns.
        block = np.array(id_list[start:start + span])
        block = np.reshape(block, [batch_size, num_batches * num_step])
        return np.split(block, num_batches, axis=1)

    # Labels are the inputs shifted one position to the right.
    return list(zip(_to_batches(0), _to_batches(1)))
def main():
    """Train the language model and print train, validation and test perplexity.

    Two PTBModel instances are built under the same "language_model" variable
    scope so that the evaluation model reuses the training model's variables.
    The training model is run for NUM_EPOCH epochs, with a validation pass
    after each epoch; the test set is evaluated once after training.
    """
    initializer = tf.random_uniform_initializer(-0.05, 0.05)
    with tf.variable_scope("language_model", reuse=None, initializer=initializer):
        train_model = PTBModel(True, TRAIN_BATCH_SIZE, TRAIN_NUM_STEP)
    # reuse=True makes the evaluation model share the variables created above.
    with tf.variable_scope("language_model", reuse=True, initializer=initializer):
        eval_model = PTBModel(False, EVAL_BATCH_SIZE, EVAL_NUM_STEP)

    with tf.Session() as session:
        tf.global_variables_initializer().run()
        train_batches = make_batches(
            read_data(TRAIN_DATA), TRAIN_BATCH_SIZE, TRAIN_NUM_STEP)
        eval_batches = make_batches(
            read_data(EVAL_DATA), EVAL_BATCH_SIZE, EVAL_NUM_STEP)
        print(len(eval_batches))
        test_batches = make_batches(
            read_data(TEST_DATA), EVAL_BATCH_SIZE, EVAL_NUM_STEP)
        print(len(test_batches))

        # tf.no_op does nothing; it stands in for the training op during
        # evaluation. Created once so the graph does not grow every epoch.
        eval_op = tf.no_op()

        step = 0
        for i in range(NUM_EPOCH):
            print("In iteration: %d" % (i + 1))
            step, train_pplx = run_epoch(
                session, train_model, train_batches,
                train_model.train_op, True, step)
            print("Epoch: %d Train Perplexity: %.3f " % (i + 1, train_pplx))
            _, eval_pplx = run_epoch(
                session, eval_model, eval_batches, eval_op, False, 0)
            print("Epoch: %d Eval Perplexity: %.3f " % (i + 1, eval_pplx))

        # Evaluate on the test set once, after training has finished. The
        # format string has a single placeholder, so only the perplexity is
        # passed (passing a two-element tuple raises TypeError).
        _, test_pplx = run_epoch(
            session, eval_model, test_batches, eval_op, False, 0)
        print("Test Perplexity: %.3f " % test_pplx)


if __name__ == "__main__":
    main()