1 文本数据介绍及来源
本文使用的数据集为网站爬取的短文本评论,其中由于数据量较少,不区分验证集与测试集,并按照4:1的比例将原始有标签数据分为训练集与测试集。
原始数据文件压缩包网盘链接:https://pan.baidu.com/s/13vwd3lfKWfXlD1a8uB6ngg
提取码:urj2
注:解压后的text_data文件夹放置在与程序的同级目录下。
原始数据以txt文本格式保存,标签分为pos与neg,因此预处理主要是将标签与数据分开对应保存,并将数据分为训练集与测试集。
2 模型网络结构
![](https://i-blog.csdnimg.cn/blog_migrate/a84a1857dcece5b737e6174d87314ea8.jpeg)
如上图所示为LSTM进行本文所示的短文本情感分类的原理示意图。
其中,对于每一条短文本样本而言,里面每一个字Wt的矩阵形状为(None, 1, 60),其中第一个维度None表示样本数(批大小),第二个维度值为1表示一个字,第三个维度用长为60的向量表示这个字Wt对应的embedding_vector。
在网络超参数中,num_hidden_units 即LSTM细胞隐藏层大小设置为256,对应图中使用黄色矩形代表的LSTM隐藏层,具体的输入输出关系如图所示。最终我们获得所有cell的输出Ot以及最后一个cell的state(此state后续并未使用)。
3 数据预处理及字库构建
import os

# Gather every raw-corpus file directly under text_data/ (each line begins
# with a 3-character label, "pos" or "neg", followed by the review text).
root, _dirs, file_names = next(os.walk(r"text_data"))
path_list = [os.path.join(root, name) for name in file_names]  # portable join, not "\\"

# Split each file's non-empty lines into train / test sets and write data
# and numeric labels ("1" = pos, "0" = neg) to parallel files.  All output
# handles are managed by `with` so they are flushed and closed before the
# vocabulary pass below re-reads train_data.txt / test_data.txt — the
# original left them open, risking an incomplete read from unflushed buffers.
with open("train_data.txt", "w", encoding="utf-8") as train_data, \
     open("train_label.txt", "w", encoding="utf-8") as train_label, \
     open("test_data.txt", "w", encoding="utf-8") as test_data, \
     open("test_label.txt", "w", encoding="utf-8") as test_label:
    for every_path in path_list:
        with open(every_path, "r", encoding="utf-8") as temp_file:
            corpus = [line for line in temp_file if len(line.strip("\n")) != 0]
        limit_hi = len(corpus) * 0.9
        limit_lo = len(corpus) * 0.1
        for i, line in enumerate(corpus):
            # Middle 80% of each file -> training set; the first and last
            # 10% tails -> test set (per-file 4:1 split).
            if limit_lo < i < limit_hi:
                data_file, label_file = train_data, train_label
            else:
                data_file, label_file = test_data, test_label
            data_file.write(line[3:])  # text after the 3-char label prefix
            label_file.write(("1" if line[:3] == "pos" else "0") + "\n")

# Build the character vocabulary from all train + test text and save it for
# later use.  Sorting makes the char->id mapping reproducible across runs
# (raw `set` order varies under hash randomization).
with open("test_data.txt", "r", encoding="utf-8") as file1:
    char_set = set(file1.read())
with open("train_data.txt", "r", encoding="utf-8") as file2:
    char_set.update(file2.read())
with open("vocabulary.txt", "w", encoding="utf-8") as file3:
    for word in sorted(char_set):
        file3.write(word + "\n")
4 tensorflow + LSTM实现文本情感二分类
按照上图所示模型,具体程序如下所示。
# ---- Load training text, character vocabulary, and labels ----
with open(r"train_data.txt", "r", encoding="utf-8") as file1:
    corpus = [eve.strip("\n") for eve in file1]
with open(r"vocabulary.txt", "r", encoding="utf-8") as file2:
    vocabulary = [word.strip("\n") for word in file2]
with open(r"train_label.txt", "r", encoding="utf-8") as file3:
    label_list = [int(eve.strip("\n")) for eve in file3]
assert len(label_list) == len(corpus)

# Map each character to an integer id and encode every sentence.
word2id = {word: id_ for id_, word in enumerate(vocabulary)}
line2id = lambda line: [word2id[word] for word in line]  # sentence -> list of char ids
train_list = [line2id(line) for line in corpus]

import tensorflow.contrib.keras as kr
train_x = kr.preprocessing.sequence.pad_sequences(train_list, 100)  # pad/truncate to length 100
train_y = kr.utils.to_categorical(label_list, num_classes=2)        # one-hot labels

import tensorflow as tf
tf.compat.v1.reset_default_graph()
X_holder = tf.compat.v1.placeholder(tf.int32, [None, 100])   # (batch, seq_len) char ids
Y_holder = tf.compat.v1.placeholder(tf.float32, [None, 2])   # (batch, 2) one-hot labels

# Embedding table sized to the actual vocabulary.  The original hard-coded
# 2775 rows ("must be >= actual data"), which silently breaks as soon as the
# corpus contains more distinct characters than that.
embedding = tf.compat.v1.get_variable('embedding', [len(vocabulary), 60])
embedding_inputs = tf.nn.embedding_lookup(embedding, X_holder)

# Hyper-parameters
num_hidden_units = 256          # LSTM cell hidden size
num_fc1_units = 64              # first fully-connected layer size
dropout_keep_probability = 0.5  # dropout keep ratio
learning_rate = 1e-2

# Network: input -> LSTM -> FC -> dropout -> ReLU -> FC -> 2-class softmax
lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_hidden_units)  # a GRU cell could be swapped in here
outputs, _ = tf.nn.dynamic_rnn(lstm_cell,
                               embedding_inputs,
                               dtype=tf.float32)  # "_" is the final cell state (unused)
last_cell = outputs[:, -1, :]  # many-to-one: keep only the last timestep's output
full_connect1 = tf.layers.dense(last_cell,
                                num_fc1_units)
full_connect1_dropout = tf.contrib.layers.dropout(full_connect1,
                                                  dropout_keep_probability)
full_connect1_activate = tf.nn.relu(full_connect1_dropout)
full_connect2 = tf.layers.dense(full_connect1_activate, 2)  # binary-classifier logits
predict_Y = tf.nn.softmax(full_connect2)  # probabilities for reporting accuracy
cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=Y_holder, logits=full_connect2)
loss = tf.reduce_mean(cross_entropy)              # mean cross-entropy over the batch
optimizer = tf.train.AdamOptimizer(learning_rate)
train = optimizer.minimize(loss)                  # hook optimizer to the loss
isCorrect = tf.equal(tf.argmax(Y_holder, 1), tf.argmax(predict_Y, 1))
accuracy = tf.reduce_mean(tf.cast(isCorrect, tf.float32))
init = tf.global_variables_initializer()
session = tf.Session()
session.run(init)

# ---- Load the held-out test set ----
with open(r"test_data.txt", "r", encoding="utf-8") as file4:
    corpus_ = [eve.strip("\n") for eve in file4]
with open(r"test_label.txt", "r", encoding="utf-8") as file5:
    label_list_ = [int(eve.strip("\n")) for eve in file5]
assert len(label_list_) == len(corpus_)
test_list = [line2id(line) for line in corpus_]
test_x = kr.preprocessing.sequence.pad_sequences(test_list, 100)  # same length-100 padding
test_y = kr.utils.to_categorical(label_list_, num_classes=2)

# ---- Mini-batch training; evaluate on a random test batch every 100 steps ----
import random
for i in range(3000):
    selected_index = random.sample(list(range(len(train_y))), k=60)  # random batch of 60
    batch_X = train_x[selected_index]
    batch_Y = train_y[selected_index]
    session.run(train, {X_holder: batch_X, Y_holder: batch_Y})
    step = i + 1
    if step % 100 == 0:
        selected_index = random.sample(list(range(len(test_y))), k=150)
        batch_X = test_x[selected_index]
        batch_Y = test_y[selected_index]
        loss_value, accuracy_value = session.run([loss, accuracy],
                                                 {X_holder: batch_X, Y_holder: batch_Y})
        print('step:%d loss:%.4f accuracy:%.4f' % (step, loss_value, accuracy_value))
5 运行结果
step:100 loss:0.4196 accuracy:0.8400
step:200 loss:0.2480 accuracy:0.9000
step:300 loss:0.2244 accuracy:0.9200
step:400 loss:0.1739 accuracy:0.9333
step:500 loss:0.3256 accuracy:0.9000
step:600 loss:0.3013 accuracy:0.9000
step:700 loss:0.1742 accuracy:0.9333
step:800 loss:0.3055 accuracy:0.9000
step:900 loss:0.2212 accuracy:0.9067
step:1000 loss:0.3092 accuracy:0.8800
step:1100 loss:0.2734 accuracy:0.9000
step:1200 loss:0.2690 accuracy:0.8733
step:1300 loss:0.2409 accuracy:0.9067
step:1400 loss:0.3447 accuracy:0.8800
step:1500 loss:0.2810 accuracy:0.8933
step:1600 loss:0.2851 accuracy:0.9200
step:1700 loss:0.2294 accuracy:0.9333
step:1800 loss:0.2828 accuracy:0.8933
step:1900 loss:0.3330 accuracy:0.8933
step:2000 loss:0.3764 accuracy:0.9200
step:2100 loss:0.1498 accuracy:0.9200
step:2200 loss:0.1861 accuracy:0.9467
step:2300 loss:0.2425 accuracy:0.9133
step:2400 loss:0.2472 accuracy:0.9333
step:2500 loss:0.3103 accuracy:0.8933
step:2600 loss:0.2019 accuracy:0.9200
step:2700 loss:0.1626 accuracy:0.9333
step:2800 loss:0.5135 accuracy:0.9200
step:2900 loss:0.3478 accuracy:0.9000
step:3000 loss:0.3352 accuracy:0.9133