Reference: Implementing a CNN for Text Classification in TensorFlow
Dataset and Preprocessing
The dataset contains 10,662 review sentences, half positive and half negative, with a vocabulary of around 20k words. Since the dataset is quite small, a powerful model is likely to overfit. The data also ships without an official train/test split, so we hold out 10% of it as a validation set; the original paper reported results from 10-fold cross-validation.
1. Load the positive and negative sentences from the raw data files.
2. Clean the text data with clean_str().
3. Pad each sentence to the maximum sentence length, which is 76 here; <PAD> tokens are appended to all shorter sentences, which lets us batch the data efficiently.
4. Build a vocabulary index and map each word to an integer between 0 and the vocabulary size (12,763 in this run); each sentence becomes a vector of integers (see the toy sketch after this list).
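A toy sketch of steps 3 and 4, using two hypothetical sentences (the real run pads to 76 and indexes 12,763 words):

sentences = [["the", "movie", "was", "great"], ["boring"]]
max_len = max(len(s) for s in sentences)                      # 4 in this toy example
padded = [s + ["<PAD>"] * (max_len - len(s)) for s in sentences]
vocab = {"<PAD>": 0}                                          # reserve index 0 for padding
for sent in padded:
    for w in sent:
        vocab.setdefault(w, len(vocab))                       # index words by first appearance
x = [[vocab[w] for w in sent] for sent in padded]
print(x)                                                      # [[1, 2, 3, 4], [5, 0, 0, 0]]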
The preprocessing code lives in process_data.py; here we look at how the data is read from file:
import numpy as np

def load_data_and_labels(positive_data_file, negative_data_file):
    # Read the positive and negative reviews into lists, one review per line
    positive_examples = list(open(positive_data_file, "r", encoding='Windows-1252').readlines())
    # Strip leading/trailing whitespace
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open(negative_data_file, "r", encoding='Windows-1252').readlines())
    negative_examples = [s.strip() for s in negative_examples]
    # Clean each review with clean_str(), splitting on words, and collect everything in x_text
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    # Attach a one-hot label to every review and stack the labels into y
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]
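A quick usage sketch (the file names follow the rt-polarity data of the reference implementation; adjust the paths to your local layout):

x_text, y = load_data_and_labels("rt-polarity.pos", "rt-polarity.neg")
print(len(x_text))   # 10662 cleaned reviews
print(y.shape)       # (10662, 2) one-hot labels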
Processing the samples yields the vocabulary. We take the maximum sentence length (76), pad every sentence to it, and convert each sentence into a row vector of 76 word indices:
# Load the experiment data
print("Load data...")
x_text, y = process_data.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)
# Maximum sentence length, used for padding; 76 for this dataset
max_document_length = max([len(x.split(" ")) for x in x_text])
# VocabularyProcessor (from tf.contrib.learn) scans x_text, builds the vocabulary in order
# of first appearance, and assigns every word an index. It returns a nested list x such as
# [[1,2,3,4,...], ..., [55,66,777,...]]: one inner list per sentence, holding the vocabulary
# index of each word, so x is 10662 x 76.
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))
# Shuffle the data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
# Split into training and validation sets; with dev_sample_percentage = 0.1 and
# len(y) = 10662, dev_sample_index = -1066, so the last 1066 samples form the dev set
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
del x, y, x_shuffled, y_shuffled
return x_train, y_train, vocab_processor, x_dev, y_dev
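To make the fit_transform() step concrete, a toy sketch of VocabularyProcessor's behavior (index 0 is reserved for unknown/padding, and words are indexed in order of first appearance):

from tensorflow.contrib import learn
import numpy as np

docs = ["a great movie", "a boring movie"]
vp = learn.preprocessing.VocabularyProcessor(max_document_length=4)
x = np.array(list(vp.fit_transform(docs)))
print(x)                    # [[1 2 3 0]
                            #  [1 4 3 0]]  shorter sentences are padded with 0
print(len(vp.vocabulary_))  # 5: four distinct words plus the reserved index 0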
Building the CNN
The code is in text_cnn.py; the TextCNN class assembles the network in its __init__() method, defining the CNN architecture from the hyperparameters and the input:
sequence_length: sentence length, 76
num_classes: number of classes, 2
vocab_size: vocabulary size, 12,763
embedding_size: embedding dimension, 128
filter_sizes: filter sizes, [3, 4, 5]
num_filters: number of filters per size, 128
l2_reg_lambda: L2 regularization strength, 0
class TextCNN(object):
    def __init__(
            self, sequence_length, num_classes, vocab_size,
            embedding_size, filter_sizes, num_filters, l2_reg_lambda=0.0):
        # Placeholders for input, output and dropout keep probability
        self.input_x = tf.placeholder(tf.int32, [None, sequence_length], name="input_x")
        self.input_y = tf.placeholder(tf.float32, [None, num_classes], name="input_y")
        self.dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prob")
        # Keep track of the L2 regularization loss (optional)
        l2_loss = tf.constant(0.0)
        # Embedding layer: look up the word vector for every index in input_x.
        # W is randomly initialized here rather than loaded from pre-trained word2vec.
        with tf.device('/cpu:0'), tf.name_scope("embedding"):
            self.W = tf.Variable(
                tf.random_uniform([vocab_size, embedding_size], -1.0, 1.0),
                name="W")
            self.embedded_chars = tf.nn.embedding_lookup(self.W, self.input_x)
            self.embedded_chars_expanded = tf.expand_dims(self.embedded_chars, -1)
        # Create a convolution + max-pooling layer for each filter size,
        # then merge the results into one large feature vector
        pooled_outputs = []
        for i, filter_size in enumerate(filter_sizes):
            with tf.name_scope("conv-maxpool-%s" % filter_size):
                # Filter shape; input and output channels are 1 and num_filters
                filter_shape = [filter_size, embedding_size, 1, num_filters]
                W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[num_filters]), name="b")
                conv = tf.nn.conv2d(
                    self.embedded_chars_expanded,
                    W,
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="conv")
                # Nonlinearity: relu(W*x + b)
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                # Max-pooling keeps the largest value of the conv output;
                # pooled has shape [None, 1, 1, 128] (one value per filter)
                pooled = tf.nn.max_pool(
                    h,
                    ksize=[1, sequence_length - filter_size + 1, 1, 1],
                    strides=[1, 1, 1, 1],
                    padding='VALID',
                    name="pool")
                # pooled_outputs ends up as a list of 3 tensors, each [None, 1, 1, 128]
                pooled_outputs.append(pooled)
        # Combine all the pooled features
        num_filters_total = num_filters * len(filter_sizes)
        # Concatenate pooled_outputs along the 4th dimension into a [None, 1, 1, 384] tensor
        self.h_pool = tf.concat(pooled_outputs, 3)
        # Flatten into a 2-D tensor [None, 384]
        self.h_pool_flat = tf.reshape(self.h_pool, [-1, num_filters_total])
        # Add dropout
        with tf.name_scope("dropout"):
            self.h_drop = tf.nn.dropout(self.h_pool_flat, self.dropout_keep_prob)
        # Fully connected output (W*h + b); the largest entry of the scores vector is the prediction
        with tf.name_scope("output"):
            W = tf.get_variable(
                "W",
                shape=[num_filters_total, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())
            b = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="b")
            l2_loss += tf.nn.l2_loss(W)
            l2_loss += tf.nn.l2_loss(b)
            self.scores = tf.nn.xw_plus_b(self.h_drop, W, b, name="scores")
            self.predictions = tf.argmax(self.scores, 1, name="predictions")
        # Cross-entropy loss between scores and input_y, plus the L2 penalty
        with tf.name_scope("loss"):
            losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.scores, labels=self.input_y)
            self.loss = tf.reduce_mean(losses) + l2_reg_lambda * l2_loss
        # Accuracy
        with tf.name_scope("accuracy"):
            correct_predictions = tf.equal(self.predictions, tf.argmax(self.input_y, 1))
            self.accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"), name="accuracy")
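To keep the dimensions straight, here is how the tensor shapes evolve through one forward pass with the hyperparameters above (batch dimension written as None):
input_x: [None, 76] → embedded_chars_expanded: [None, 76, 128, 1] → conv (filter_size 3): [None, 74, 1, 128] → pooled: [None, 1, 1, 128] → h_pool (all three filter sizes concatenated): [None, 1, 1, 384] → h_pool_flat: [None, 384] → scores: [None, 2]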
A summary of the key functions used in the code above:
1. tf.nn.conv2d(input, filter, strides, padding, use_cudnn_on_gpu=None, data_format=None, name=None)
input: the text to convolve over, a 4-D Tensor of shape [batch_size, in_height, in_width, in_channels]; for this model that is [sentences per training batch, sentence length, word-vector dimension, number of text channels]. The dtype must be float32 or float64.
filter: the convolution kernels, a Tensor of shape [filter_height, filter_width, in_channels, out_channels], i.e. [kernel height, kernel width, text channels, number of kernels]; its third dimension, in_channels, must match the fourth dimension of input.
strides: a 1-D vector of length 4 giving the sliding-window stride along each of the four dimensions of input.
padding: a string, either "SAME" or "VALID"; "SAME" zero-pads the input so the kernel can overlap the edges, while "VALID" applies no padding, so the output shrinks.
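As a sanity check, a minimal sketch for one filter size in this model, confirming that VALID padding leaves 76 - 3 + 1 = 74 output positions:

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 76, 128, 1])   # [batch, sentence length, embedding, channels]
W = tf.Variable(tf.truncated_normal([3, 128, 1, 128], stddev=0.1))
conv = tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='VALID')
print(conv.shape)                                    # (?, 74, 1, 128)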
2. As a primer on how a model is defined and trained in TensorFlow, here is a simple linear regression:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

# Prepare the training data: y = 2x + 10 plus Gaussian noise
train_X = np.linspace(-1, 1, 100)
train_Y = 2 * train_X + np.random.randn(*train_X.shape) * 0.33 + 10
# Define the model
X = tf.placeholder("float")
Y = tf.placeholder("float")
w = tf.Variable(0.0, name="weight")
b = tf.Variable(0.0, name="bias")
loss = tf.square(Y - X * w - b)
# Optimizer
train_op = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
# Create the session and train
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    epoch = 1
    for i in range(10):
        for (x, y) in zip(train_X, train_Y):
            _, w_value, b_value = sess.run([train_op, w, b], feed_dict={X: x, Y: y})
        print("Epoch: {}, w: {}, b: {}".format(epoch, w_value, b_value))
        epoch += 1
# Plot the fitted line against the data
plt.plot(train_X, train_Y, "+")
plt.plot(train_X, train_X * w_value + b_value)
plt.show()
The Training Process
The training code lives in train.py: the preprocessed data is fed into the CNN and the model is trained step by step. The FLAGS.* values used below are command-line flags defined in train.py.
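The flag definitions themselves are not shown in this post; a minimal sketch, using the flag names referenced throughout and the values quoted above (defaults not stated here, such as the dropout keep probability and the training schedule, follow the reference implementation this post is based on):

import tensorflow as tf

# Data parameters (the paths are placeholders)
tf.flags.DEFINE_string("positive_data_file", "./rt-polarity.pos", "File with the positive reviews")
tf.flags.DEFINE_string("negative_data_file", "./rt-polarity.neg", "File with the negative reviews")
tf.flags.DEFINE_float("dev_sample_percentage", 0.1, "Fraction of the data held out for validation")
# Model hyperparameters
tf.flags.DEFINE_integer("embedding_dim", 128, "Dimensionality of the word embeddings")
tf.flags.DEFINE_string("filter_sizes", "3,4,5", "Comma-separated filter sizes")
tf.flags.DEFINE_integer("num_filters", 128, "Number of filters per filter size")
tf.flags.DEFINE_float("dropout_keep_prob", 0.5, "Dropout keep probability")
tf.flags.DEFINE_float("l2_reg_lambda", 0.0, "L2 regularization lambda")
# Training schedule
tf.flags.DEFINE_integer("batch_size", 64, "Batch size")
tf.flags.DEFINE_integer("num_epochs", 200, "Number of training epochs")
tf.flags.DEFINE_integer("evaluate_every", 100, "Evaluate on the dev set every this many steps")
tf.flags.DEFINE_integer("checkpoint_every", 100, "Save a checkpoint every this many steps")
tf.flags.DEFINE_integer("num_checkpoints", 5, "Number of checkpoints to keep")
# Session behavior
tf.flags.DEFINE_boolean("allow_soft_placement", True, "Fall back to a supported device when needed")
tf.flags.DEFINE_boolean("log_device_placement", False, "Log which device each op is placed on")
FLAGS = tf.flags.FLAGS

The main training script then builds the graph and runs the loop: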
with tf.Graph().as_default():
    # allow_soft_placement lets TensorFlow fall back to a supported device when the
    # requested one is unavailable; log_device_placement logs where each op is placed
    session_conf = tf.ConfigProto(
        allow_soft_placement=FLAGS.allow_soft_placement,
        log_device_placement=FLAGS.log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # Build the CNN from the hyperparameters and the input:
        # sentence length 76, 2 classes, vocabulary size 12763,
        # embedding dimension 128, filter sizes [3, 4, 5],
        # 128 filters per size, L2 regularization 0
        cnn = TextCNN(
            sequence_length=x_train.shape[1],
            num_classes=y_train.shape[1],
            vocab_size=len(vocab_processor.vocabulary_),
            embedding_size=FLAGS.embedding_dim,
            filter_sizes=list(map(int, FLAGS.filter_sizes.split(","))),
            num_filters=FLAGS.num_filters,
            l2_reg_lambda=FLAGS.l2_reg_lambda)
        # Define the training procedure.
        # trainable=False marks global_step as a Variable that is not a network
        # parameter: no gradient is computed for it and the optimizer never updates it
        global_step = tf.Variable(0, name="global_step", trainable=False)
        optimizer = tf.train.AdamOptimizer(1e-3)
        grads_and_vars = optimizer.compute_gradients(cnn.loss)
        # Compute the gradients and update the network parameters with Adam
        train_op = optimizer.apply_gradients(grads_and_vars, global_step=global_step)
        # Track gradient values and sparsity (optional):
        # record the parameters we want to watch in TensorBoard as summaries
        grad_summaries = []
        for g, v in grads_and_vars:
            if g is not None:
                grad_hist_summary = tf.summary.histogram("{}/grad/hist".format(v.name), g)
                sparsity_summary = tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
                grad_summaries.append(grad_hist_summary)
                grad_summaries.append(sparsity_summary)
        grad_summaries_merged = tf.summary.merge(grad_summaries)
        # Output directory for models and summaries
        timestamp = str(int(time.time()))
        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
        print("Writing to {}\n".format(out_dir))
        # Summaries for loss and accuracy
        loss_summary = tf.summary.scalar("loss", cnn.loss)
        acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)
        # Train summaries
        train_summary_op = tf.summary.merge([loss_summary, acc_summary, grad_summaries_merged])
        train_summary_dir = os.path.join(out_dir, "summaries", "train")
        train_summary_writer = tf.summary.FileWriter(train_summary_dir, sess.graph)
        # Dev summaries
        dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
        dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
        dev_summary_writer = tf.summary.FileWriter(dev_summary_dir, sess.graph)
        # Checkpoint directory: TensorFlow assumes it already exists, so we create it
        checkpoint_dir = os.path.abspath(os.path.join(out_dir, "checkpoints"))
        checkpoint_prefix = os.path.join(checkpoint_dir, "model")
        if not os.path.exists(checkpoint_dir):
            os.makedirs(checkpoint_dir)
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=FLAGS.num_checkpoints)
        # Save the vocabulary
        vocab_processor.save(os.path.join(out_dir, "vocab"))
        # Initialize all variables
        sess.run(tf.global_variables_initializer())

        def train_step(x_batch, y_batch):
            """A single training step"""
            # Data to feed into the graph
            feed_dict = {
                cnn.input_x: x_batch,
                cnn.input_y: y_batch,
                cnn.dropout_keep_prob: FLAGS.dropout_keep_prob
            }
            # Run one optimization step and fetch the diagnostics
            _, step, summaries, loss, accuracy = sess.run(
                [train_op, global_step, train_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            train_summary_writer.add_summary(summaries, step)

        # Check performance on the validation set
        def dev_step(x_batch, y_batch, writer=None):
            """Evaluates the model on a dev set (dropout disabled)"""
            feed_dict = {
                cnn.input_x: x_batch,
                cnn.input_y: y_batch,
                cnn.dropout_keep_prob: 1.0
            }
            step, summaries, loss, accuracy = sess.run(
                [global_step, dev_summary_op, cnn.loss, cnn.accuracy],
                feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}, acc {:g}".format(time_str, step, loss, accuracy))
            if writer:
                writer.add_summary(summaries, step)

        # Generate batches
        batches = process_data.batch_iter(
            list(zip(x_train, y_train)), FLAGS.batch_size, FLAGS.num_epochs)
        # Train on one batch at a time (batch_size = 64 (x, y) pairs)
        for batch in batches:
            x_batch, y_batch = zip(*batch)
            train_step(x_batch, y_batch)
            current_step = tf.train.global_step(sess, global_step)
            if current_step % FLAGS.evaluate_every == 0:
                print("\nEvaluation:")
                dev_step(x_dev, y_dev, writer=dev_summary_writer)
                print("")
            if current_step % FLAGS.checkpoint_every == 0:
                path = saver.save(sess, checkpoint_prefix, global_step=current_step)
                print("Saved model checkpoint to {}\n".format(path))