# 文本分类RNN
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
import numpy as np
import re
from tensorflow.contrib import learn
from tensorflow.contrib.layers import fully_connected
import os
# NOTE: the `print(help(tf.contrib.layers))` call that used to sit here was
# removed. help() opens an interactive pager when run on a terminal, which
# stalls the script until the user quits it, and printing its return value
# only adds a stray "None".

# Optional switches, disabled by default:
# os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'  # silence TensorFlow C++ logging
# tf.set_random_seed(777)  # fix the graph-level random seed

# Training configuration and input locations.
learning_rate = 0.001  # optimizer learning rate
training_epochs = 10  # total number of training epochs
dev_sample_percentage = 0.1  # fraction of the data meant for the test set
pos_data_file = './rt-polarity.pos'
neg_data_file = './rt-polarity.neg'

# Read every line of each corpus file. The context managers close the file
# handles as soon as the lines are in memory (the bare open() calls leaked
# them). Lines are kept unstripped, so each one still ends with its newline.
with open(pos_data_file, 'r', encoding='utf8') as pos_file:
    positive_examples = pos_file.readlines()
with open(neg_data_file, 'r', encoding='utf-8') as neg_file:
    negative_examples = neg_file.readlines()
print(len(positive_examples), len(negative_examples))
print(positive_examples[:1], negative_examples[:1])

# Full corpus: all positive lines followed by all negative lines.
x_text = positive_examples + negative_examples
print(len(x_text))
# Build the one-hot labels, in the same order as x_text (positives first).
import numpy as np

# Positive examples are encoded as [0, 1], negative examples as [1, 0].
positive_labels = [[0, 1] for _ in range(len(positive_examples))]
negative_labels = [[1, 0] for _ in range(len(negative_examples))]
print(np.array(positive_labels).shape)

# Stack both label lists into a single array, one row per example.
y_data = np.array([*positive_labels, *negative_labels])
print(y_data.shape)
# Longest document, measured in space-separated tokens. It is handed to the
# VocabularyProcessor as the fixed length of every encoded document.
max_document_length = max(len(sentence.split(' ')) for sentence in x_text)
print('一个句子最大的单词数:', max_document_length)  # temporary debug output
print("*****************************")
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length=max_document_length)
print(">>>>>>>>>>>>>>>")
print(x_text[0])
print(">>>>>>>>>>>>>>>")

# Fit the vocabulary and encode the corpus exactly once. Earlier this ran
# fit_transform four separate times just to feed the debug prints below,
# re-tokenising the whole corpus on every call.
encoded_iter = vocab_processor.fit_transform(x_text)
print(type(encoded_iter))
encoded_docs = list(encoded_iter)
print(encoded_docs[2])
print(len(x_text[2].split(' ')))
print(len(encoded_docs[2]))

# Matrix of word ids, one row per document.
x = np.array(encoded_docs)
print('句子的编码前',x_text[:2])
print('单词总数:', len(vocab_processor.vocabulary_))  # temporary debug output
print('句子的编码后', x[:2])  # temporary debug output
# Shuffle the encoded corpus and hold out 20% of it for evaluation.
# NOTE(review): test_size is hard-coded to 0.2 while dev_sample_percentage is
# 0.1 -- confirm which one is intended. `x_text_data` holds the held-out
# inputs; the name is presumably a typo for `x_test_data`, kept unchanged
# because the evaluation code refers to it.
from sklearn.model_selection import train_test_split

print(x.shape, y_data.shape, sep='\n')
x_trn_data, x_text_data, y_trn_data, y_test_data = train_test_split(
    x,
    y_data,
    shuffle=True,
    test_size=0.2,
)
print(x_trn_data.shape, y_trn_data.shape, sep='\n')
g_b = 0  # cursor: index of the first training sample not yet handed out


# Hand-written next_batch helper: returns one batch of data per call.
def next_batch(size):
    """Return the next `size` training samples and advance the cursor.

    Slices the module-level `x_trn_data` and `y_trn_data` starting at the
    global cursor `g_b`, then moves the cursor forward by `size`. The cursor
    is never wrapped or reset here; the caller has to rewind `g_b` itself.

    Args:
        size: Number of samples to take.

    Returns:
        A `(features, labels)` tuple covering the same index range.
    """
    global g_b
    start, stop = g_b, g_b + size
    batch = (x_trn_data[start:stop], y_trn_data[start:stop])
    g_b = stop
    return batch
# Model hyper-parameters.
n_neurons = 128  # hidden units per LSTM layer
n_outputs = 2  # two output classes
n_layers = 3  # number of stacked LSTM layers
embedding_size = 100  # dimensionality of the word vectors
batch_size = 64  # samples per mini-batch
n_steps = max_document_length  # time steps (sequence length)
n_inputs = embedding_size  # input features per time step
import tensorflow as tf

# Graph inputs: word-id sequences and their one-hot labels.
X = tf.placeholder(tf.int32, [None, max_document_length])
Y = tf.placeholder(tf.int32, [None, 2])

# Embedding layer: one trainable vector per vocabulary entry.
vocab_size = len(vocab_processor.vocabulary_)
print(">>>>>>>>>>> vocabulary size: ", vocab_size)
W = tf.Variable(tf.random_normal([vocab_size, embedding_size]))
X_data = tf.nn.embedding_lookup(W, X)
print(X_data)

# Stacked LSTM run over the embedded sequence.
# NOTE: the `print(help(tf.nn.dynamic_rnn))` debug call was removed; help()
# opens an interactive pager on a terminal and blocks the script.
cells = [tf.contrib.rnn.LSTMCell(n_neurons) for _ in range(n_layers)]
multi_cell = tf.contrib.rnn.MultiRNNCell(cells)
outputs, states = tf.nn.dynamic_rnn(multi_cell, X_data, dtype=tf.float32)
print(">>>>>>>>>>>>>>>>>>>>>")
print(outputs.shape)

# Classification head on the output of the last time step.
# fully_connected applies ReLU unless told otherwise, which would clip the
# logits to be non-negative before the softmax loss. activation_fn=None keeps
# the layer linear, equivalent to tf.matmul(outputs[:, -1], W_full) + b.
logits = fully_connected(outputs[:, -1], n_outputs, activation_fn=None)

# Loss: mean softmax cross-entropy against the one-hot labels.
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)  # training op
# Training and evaluation.
# (TensorBoard summary logging was sketched here earlier and is disabled.)

total = x_trn_data.shape[0]
# Number of full mini-batches per epoch, computed once. A trailing partial
# batch is not used.
total_batch = total // batch_size

# Accuracy: fraction of samples whose arg-max prediction matches the label.
correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
test_feed = {X: x_text_data, Y: y_test_data}  # held-out split

# The context manager closes the session when training is done, releasing
# the resources it holds (the session was previously never closed).
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())  # initialise all variables
    print('开始学习...')
    for epoch in range(training_epochs):
        avg_cost = 0
        g_b = 0  # rewind the next_batch cursor to the start of the data
        for _ in range(total_batch):
            batch_xs, batch_ys = next_batch(batch_size)
            feed_dict = {X: batch_xs, Y: batch_ys}
            c, _ = sess.run([cost, optimizer], feed_dict=feed_dict)
            avg_cost += c / total_batch  # running mean of the batch losses
        # Evaluate on the held-out split after every epoch.
        acc = sess.run(accuracy, feed_dict=test_feed)
        print('Epoch:', (epoch + 1), 'cost =', avg_cost, 'acc=', acc)
    print('学习完成')
    # Final accuracy of the trained model on the held-out split.
    print('Accuracy:', sess.run(accuracy, feed_dict=test_feed))
# Text classification with an RNN
# (Source blog page footer: "latest recommended article published 2024-07-23 14:36:35")