textCNN
数据占位符
# Input placeholders: token-id sequences, one-hot labels, and the dropout keep probability.
self.input_x = tf.placeholder(tf.int32, [None, self.config.seq_length], name='input_x')
# One-hot labels; the loss and accuracy ops read self.input_y, so it must be declared here.
self.input_y = tf.placeholder(tf.float32, [None, self.config.num_classes], name='input_y')
self.keep_prob = tf.placeholder(tf.float32, name='keep_prob')
使用CPU设备
# Keep the embedding table and the lookup op on the CPU.
with tf.device('/cpu:0'):
    # The embedding table has shape (vocab_size, embedding_dim).
    embedding_shape = [self.config.vocab_size, self.config.embedding_dim]
    embedding = tf.get_variable('embedding', embedding_shape)
    # input_x is (None, seq_length); the lookup yields (None, seq_length, embedding_dim).
    embedding_inputs = tf.nn.embedding_lookup(embedding, self.input_x)
定义命名空间
with tf.name_scope("cnn"):
    # 1-D convolution over the embedded sequence.
    # The original note records an example shape of (None, 596, 128); the
    # actual shape depends on seq_length, kernel_size and num_filters.
    conv = tf.layers.conv1d(embedding_inputs, self.config.num_filters,
                            self.config.kernel_size, name='conv')
    # Global max pooling over the sequence axis -> (None, num_filters).
    gmp = tf.reduce_max(conv, axis=1, name='gmp')

# Fully connected layer followed by dropout and ReLU.
fc = tf.layers.dense(gmp, self.config.hidden_dim, name='fc1')
fc = tf.contrib.layers.dropout(fc, self.keep_prob)
fc = tf.nn.relu(fc)

# Classifier: one logit per class, shape (?, num_classes).
self.logits = tf.layers.dense(fc, self.config.num_classes, name='fc2')
# Predicted class index per example, shape (?,).
self.y_pred_cls = tf.argmax(tf.nn.softmax(self.logits), 1)

# Per-example cross entropy, shape (?,), averaged into a scalar loss.
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits, labels=self.input_y)
self.loss = tf.reduce_mean(cross_entropy)
# Optimizer. This statement was fused into the preceding comment in the
# original notes, which left self.optim undefined.
self.optim = tf.train.AdamOptimizer(learning_rate=self.config.learning_rate).minimize(self.loss)

# Accuracy: fraction of examples whose predicted class matches the label.
correct_pred = tf.equal(tf.argmax(self.input_y, 1), self.y_pred_cls)
self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
# Concatenate the outputs of several convolution kernels along axis 1.
# NOTE(review): (?, 512) is the shape recorded in the original note; convData is
# defined outside this excerpt (presumably a list of pooled conv outputs) — confirm.
gmp = tf.concat(convData,axis=1)
tensorboard
# Register the scalars to track, then merge every summary into a single op.
for tag, tensor in (("loss", model.loss), ("accuracy", model.acc)):
    tf.summary.scalar(tag, tensor)
merged_summary = tf.summary.merge_all()
writer = tf.summary.FileWriter(tensorboard_dir)

# Saver used to checkpoint the model.
saver = tf.train.Saver()
借助tf.keras把数据补齐到固定长度,转换为one-hot表达格式
# Pad every id sequence to a fixed length with Keras' pad_sequences.
x_pad = kr.preprocessing.sequence.pad_sequences(data_id, maxlen=max_length)
# One-hot encode the labels, one column per known category.
n_categories = len(cat_to_id)
y_pad = kr.utils.to_categorical(label_id, num_classes=n_categories)
`timedelta(seconds=int(round(time_dif)))`:计算耗费多长时间
# Create the session, initialise all variables, and record the graph for TensorBoard.
session = tf.Session()
init_op = tf.global_variables_initializer()
session.run(init_op)
writer.add_graph(session.graph)
打乱数据顺序 操作的都是nparray
# Draw one random permutation and apply it to both arrays so that samples
# and labels stay aligned. x and y are indexed as numpy arrays here.
positions = np.arange(data_len)
indices = np.random.permutation(positions)
x_shuffle, y_shuffle = x[indices], y[indices]
生成器弹出
yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]
写入tensorboard
# Every config.save_per_batch batches, write the merged training summaries to TensorBoard.
if total_batch % config.save_per_batch == 0:
s = session.run(merged_summary, feed_dict=feed_dict)
writer.add_summary(s, total_batch)
# Periodically report performance on the training and validation sets.
# NOTE(review): the condition guarding this step is not shown in this excerpt.
feed_dict[model.keep_prob] = 1.0 # keep_prob is 1.0 when evaluating; units are only dropped during training
loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
# Save the best model.
# NOTE(review): the "best so far" check is not shown in this excerpt — confirm against the full training loop.
saver.save(sess=session, save_path=save_path)
# Load the saved model for evaluation: open a fresh session, initialise the
# variables, then overwrite them with the values stored at save_path.
session = tf.Session()
init_op = tf.global_variables_initializer()
session.run(init_op)
saver = tf.train.Saver()
saver.restore(sess=session, save_path=save_path)
sklearn的矩阵评估参数
# Evaluation: per-class precision, recall and F1 via sklearn.
print("Precision, Recall and F1-Score...")
categories = ['体育', '财经', '房产', '家居', '教育', '科技', '时尚', '时政', '游戏', '娱乐']
report = metrics.classification_report(y_test_cls, y_pred_cls, target_names=categories)
print(report)

# Confusion matrix of true versus predicted classes.
print("Confusion Matrix...")
cm = metrics.confusion_matrix(y_test_cls, y_pred_cls)
print(cm)
textRNN
def lstm_cell():
    """Return a basic LSTM cell with ``self.config.hidden_dim`` units."""
    num_units = self.config.hidden_dim
    return tf.contrib.rnn.BasicLSTMCell(num_units, state_is_tuple=True)
def gru_cell():
    """Return a GRU cell with ``self.config.hidden_dim`` units."""
    num_units = self.config.hidden_dim
    return tf.contrib.rnn.GRUCell(num_units)
def dropout():
    """Return one RNN cell wrapped in a dropout layer on its outputs.

    The cell is an LSTM when ``self.config.rnn == 'lstm'`` and a GRU otherwise.
    """
    make_cell = lstm_cell if self.config.rnn == 'lstm' else gru_cell
    return tf.contrib.rnn.DropoutWrapper(make_cell(), output_keep_prob=self.keep_prob)
# Stack several dropout-wrapped RNN cells into one multi-layer network.
with tf.name_scope("rnn"):
    # One dropout-wrapped cell per layer.
    stacked_layers = []
    for _ in range(self.config.num_layers):
        stacked_layers.append(dropout())
    multi_cell = tf.contrib.rnn.MultiRNNCell(stacked_layers, state_is_tuple=True)

    # The original note records _outputs as (?, 600, 128): 600 is the sequence
    # length and 128 the size of each cell. The second return value is unused.
    _outputs, _ = tf.nn.dynamic_rnn(cell=multi_cell, inputs=embedding_inputs, dtype=tf.float32)
    # Use the output at the last time step as the sequence representation.
    last = _outputs[:, -1, :]