TensorFlow + LSTM for Short-Text Comment Sentiment Classification

1 Dataset Description and Source

The dataset used in this post consists of short-text comments crawled from the web. Because the dataset is small, no separate validation set is kept; the labeled data is simply split 4:1 into a training set and a test set.
Baidu Netdisk link to the compressed raw data: https://pan.baidu.com/s/13vwd3lfKWfXlD1a8uB6ngg
Extraction code: urj2
Note: place the extracted text_data folder in the same directory as the scripts.
The raw data is stored in txt files with labels pos and neg, so preprocessing mainly consists of separating the labels from the text into corresponding files and splitting the data into training and test sets.
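Each line of the raw files carries its label as a three-character prefix glued directly to the comment text. An illustrative example of the expected layout (made-up lines, not actual data from the archive):

pos这家店的菜很好吃,服务也很周到
neg等了一个小时才上菜,不会再来了

The preprocessing code in section 3 relies on exactly this layout when it slices corpus[i][:3] and corpus[i][3:].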

2 Network Architecture

The figure above is a schematic of how the LSTM carries out the short-text sentiment classification described in this post.
For each short-text sample, every character W_t enters the network with shape (None, 1, 60): the first dimension is the number of samples, the second dimension of 1 stands for a single character, and the third dimension is the 60-dimensional embedding_vector of that character W_t.
Among the hyperparameters, num_hidden_units, the hidden size of the LSTM cell, is set to 256; the exact input-output relations are as drawn in the figure. In the end we obtain the outputs O_t of all the cells as well as the state of the last cell (this state is not used afterwards).
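To make these shapes concrete, here is a minimal sketch of the recurrent part alone (TF 1.x APIs, matching the code in section 4; sequence length 100, embedding size 60, and hidden size 256 are the values assumed above):

import tensorflow as tf

tf.compat.v1.reset_default_graph()
# a batch of padded comments: (batch, 100 characters, 60-dim embedding each)
inputs = tf.compat.v1.placeholder(tf.float32, [None, 100, 60])
cell = tf.contrib.rnn.BasicLSTMCell(256)  # num_hidden_units = 256
outputs, state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
print(outputs.shape)  # (?, 100, 256): one 256-dim output O_t per character
print(state.h.shape)  # (?, 256): hidden state of the last cell (unused later)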

3 Preprocessing and Vocabulary Construction

import os
# Collect the paths of all files under the text_data folder
temp_list = list(os.walk(r"text_data"))
original = temp_list[0][0]
file_name = temp_list[0][2]
path_list = [os.path.join(original, eve_name) for eve_name in file_name]
# Create the output files (vocabulary.txt is created separately below)
train_data = open(r"train_data.txt", "w", encoding="utf-8")
train_label = open(r"train_label.txt", "w", encoding="utf-8")
test_data = open(r"test_data.txt", "w", encoding="utf-8")
test_label = open(r"test_label.txt", "w", encoding="utf-8")
# Separate labels from text and split into training / test sets
for every_path in path_list:
    with open(every_path, "r", encoding="utf-8") as temp_file:
        corpus = [eve for eve in temp_file if len(eve.strip("\n")) != 0]
        limit1 = len(corpus) * 0.9
        limit2 = len(corpus) * 0.1
        for i in range(len(corpus)):
            # middle 80% -> training set, first and last 10% -> test set (the 4:1 split)
            if limit2 < i < limit1:
                if corpus[i][:3] == "pos":
                    train_data.write(corpus[i][3:])
                    train_label.write("1" + "\n")
                else:
                    train_data.write(corpus[i][3:])
                    train_label.write("0" + "\n")
            else:
                if corpus[i][:3] == "pos":
                    test_data.write(corpus[i][3:])
                    test_label.write("1" + "\n")
                else:
                    test_data.write(corpus[i][3:])
                    test_label.write("0" + "\n")
# Close the writers so buffered lines are flushed before the files are re-read below
train_data.close()
train_label.close()
test_data.close()
test_label.close()
# Build the vocabulary: every distinct character in the data, written to vocabulary.txt
with open(r"test_data.txt", "r", encoding="utf-8") as file1:
    corpus1 = [eve for eve in file1]
with open(r"train_data.txt", "r", encoding="utf-8") as file2:
    corpus2 = [eve for eve in file2]
with open(r"vocabulary.txt", "w", encoding="utf-8") as file3:
    word_list = []
    corpus = corpus1 + corpus2
    for line in corpus:
        for word in line.strip("\n"):  # strip the newline so it does not become a vocabulary entry
            word_list.append(word)
    word_list = list(set(word_list))
    for word in word_list:
        file3.write(word + "\n")
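Since the embedding table in the next section hard-codes 2775 rows, it is worth verifying that the vocabulary actually fits; a quick sanity check (the number 2775 is taken from the model code in section 4):

with open(r"vocabulary.txt", "r", encoding="utf-8") as f:
    vocab_size = len([line for line in f if line.strip("\n")])
print("vocabulary size:", vocab_size)  # must not exceed the embedding rows
assert vocab_size <= 2775, "enlarge the embedding table in section 4"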

4 Binary Sentiment Classification with TensorFlow + LSTM

Following the model shown in the figure above, the full program is as follows.


with open(r"train_data.txt", "r", encoding="utf-8") as file1:
    corpus = [eve.strip("\n") for eve in file1]
with open(r"vocabulary.txt", "r", encoding="utf-8") as file2:
    vocabulary = [word.strip("\n") for word in file2]
with open(r"train_label.txt", "r", encoding="utf-8") as file3:
    label_list = [int(eve.strip("\n")) for eve in file3]
assert len(label_list) == len(corpus)

word2id = {word:id_ for id_, word in enumerate(vocabulary)}
line2id = lambda line: [word2id[word] for word in line]  # convert a line of text into a list of character ids
train_list = [line2id(line) for line in corpus]

import tensorflow.contrib.keras as kr
train_x = kr.preprocessing.sequence.pad_sequences(train_list, 100)  # pad to length 100 -> train_x
train_y = kr.utils.to_categorical(label_list, num_classes=2)
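# A quick illustration of the two helpers above (toy values, not real data):
#   pad_sequences([[3, 1, 2]], 100) left-pads with 0 up to length 100 and
#   truncates longer sequences from the front (the Keras defaults);
#   to_categorical([0, 1], num_classes=2) -> [[1., 0.], [0., 1.]] (one-hot labels).
# Note: padding id 0 is also the id of one vocabulary character, a known quirk
# of this simple id scheme.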
import tensorflow as tf
tf.compat.v1.reset_default_graph()
X_holder = tf.compat.v1.placeholder(tf.int32, [None, 100])  # placeholder for padded id sequences
Y_holder = tf.compat.v1.placeholder(tf.float32, [None, 2])  # placeholder for one-hot labels
# Character embedding: the table has 2775 rows, which must be at least the vocabulary size
embedding = tf.compat.v1.get_variable('embedding', [2775, 60])  # randomly initialized embedding matrix
embedding_inputs = tf.nn.embedding_lookup(embedding, X_holder)
num_hidden_units = 256  # hidden size of the LSTM cell
num_fc1_units = 64  # size of the first fully connected layer
dropout_keep_probability = 0.5  # dropout keep probability
learning_rate = 1e-2  # learning rate
# Overall structure: input - LSTM - fully connected - dropout - fully connected - binary softmax
lstm_cell = tf.contrib.rnn.BasicLSTMCell(num_hidden_units)  # an LSTM cell; a GRU cell could be dropped in here instead
outputs, _ = tf.nn.dynamic_rnn(lstm_cell,
                               embedding_inputs,
                               dtype=tf.float32)  # _ is the state of the last cell
last_cell = outputs[:, -1, :]  # keep only the last cell's output: many-to-one
full_connect1 = tf.layers.dense(last_cell,
                                num_fc1_units)  # first fully connected layer
full_connect1_dropout = tf.contrib.layers.dropout(full_connect1,
                                                  dropout_keep_probability)  # contrib dropout takes the keep probability
full_connect1_activate = tf.nn.relu(full_connect1_dropout)  # ReLU activation
full_connect2 = tf.layers.dense(full_connect1_activate, 2)  # logits of the binary classifier
predict_Y = tf.nn.softmax(full_connect2)  # softmax: predicted class probabilities

cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(labels=Y_holder, logits=full_connect2)  # fed the logits, not predict_Y
loss = tf.reduce_mean(cross_entropy)  # mean cross entropy
optimizer = tf.train.AdamOptimizer(learning_rate)  # optimizer
train = optimizer.minimize(loss)  # hook the optimizer to the loss

isCorrect = tf.equal(tf.argmax(Y_holder, 1), tf.argmax(predict_Y, 1))  # per-sample correctness
accuracy = tf.reduce_mean(tf.cast(isCorrect, tf.float32))  # accuracy

init = tf.global_variables_initializer()  # initialize all variables
session = tf.Session()
session.run(init)

# Load the test set for evaluation
with open(r"test_data.txt", "r", encoding="utf-8") as file4:
    corpus_ = [eve.strip("\n") for eve in file4]
with open(r"test_label.txt", "r", encoding="utf-8") as file5:
    label_list_ = [int(eve.strip("\n")) for eve in file5]
assert len(label_list_) == len(corpus_)
test_list = [line2id(line) for line in corpus_]
test_x = kr.preprocessing.sequence.pad_sequences(test_list, 100)  # pad to length 100 -> test_x
test_y = kr.utils.to_categorical(label_list_, num_classes=2)
import random
for i in range(3000):
    selected_index = random.sample(list(range(len(train_y))), k=60)  # sample a mini-batch of 60 training examples
    batch_X = train_x[selected_index]
    batch_Y = train_y[selected_index]
    session.run(train, {X_holder:batch_X, Y_holder:batch_Y})
    step = i + 1
    if step % 100 == 0:
        selected_index = random.sample(list(range(len(test_y))), k=150)  # evaluate on 150 random test samples
        batch_X = test_x[selected_index]
        batch_Y = test_y[selected_index]
        loss_value, accuracy_value = session.run([loss, accuracy], {X_holder:batch_X, Y_holder:batch_Y})
        print('step:%d loss:%.4f accuracy:%.4f' %(step, loss_value, accuracy_value))
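Once training finishes, the same session can score a new comment. A minimal sketch (the sentence is made up; characters missing from vocabulary.txt are simply skipped here, whereas a real deployment would reserve a dedicated unknown-character id):

new_text = "东西很好,下次还来"  # hypothetical comment, not from the dataset
new_ids = [word2id[w] for w in new_text if w in word2id]  # drop unknown characters
new_x = kr.preprocessing.sequence.pad_sequences([new_ids], 100)
prob = session.run(predict_Y, {X_holder: new_x})
print("P(neg) = %.4f, P(pos) = %.4f" % (prob[0][0], prob[0][1]))  # label 1 = pos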

5 Results

step:100 loss:0.4196 accuracy:0.8400
step:200 loss:0.2480 accuracy:0.9000
step:300 loss:0.2244 accuracy:0.9200
step:400 loss:0.1739 accuracy:0.9333
step:500 loss:0.3256 accuracy:0.9000
step:600 loss:0.3013 accuracy:0.9000
step:700 loss:0.1742 accuracy:0.9333
step:800 loss:0.3055 accuracy:0.9000
step:900 loss:0.2212 accuracy:0.9067
step:1000 loss:0.3092 accuracy:0.8800
step:1100 loss:0.2734 accuracy:0.9000
step:1200 loss:0.2690 accuracy:0.8733
step:1300 loss:0.2409 accuracy:0.9067
step:1400 loss:0.3447 accuracy:0.8800
step:1500 loss:0.2810 accuracy:0.8933
step:1600 loss:0.2851 accuracy:0.9200
step:1700 loss:0.2294 accuracy:0.9333
step:1800 loss:0.2828 accuracy:0.8933
step:1900 loss:0.3330 accuracy:0.8933
step:2000 loss:0.3764 accuracy:0.9200
step:2100 loss:0.1498 accuracy:0.9200
step:2200 loss:0.1861 accuracy:0.9467
step:2300 loss:0.2425 accuracy:0.9133
step:2400 loss:0.2472 accuracy:0.9333
step:2500 loss:0.3103 accuracy:0.8933
step:2600 loss:0.2019 accuracy:0.9200
step:2700 loss:0.1626 accuracy:0.9333
step:2800 loss:0.5135 accuracy:0.9200
step:2900 loss:0.3478 accuracy:0.9000
step:3000 loss:0.3352 accuracy:0.9133