Text Classification with an RNN

# Text classification with an RNN
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
import numpy as np
import re
from tensorflow.contrib import learn
from tensorflow.contrib.layers import fully_connected
import os

# print(help(tf.contrib.layers))


# os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'
# tf.set_random_seed(777)  # set the random seed


learning_rate = 0.001  # learning rate
training_epochs = 10  # total training epochs

dev_sample_percentage = 0.1  # fraction of the data held out as the test set

pos_data_file = './rt-polarity.pos'
neg_data_file = './rt-polarity.neg'


with open(pos_data_file, 'r', encoding='utf-8') as f:
    positive_examples = f.readlines()
with open(neg_data_file, 'r', encoding='utf-8') as f:
    negative_examples = f.readlines()
# positive_examples = [s.strip() for s in positive_examples]
# negative_examples = [s.strip() for s in negative_examples]
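# The `re` import above hints at a text-cleaning pass that never made it into this
# listing. A minimal hedged sketch of such a cleaner (the exact regex rules are an
# assumption, modeled on common rt-polarity preprocessing; left disabled):
def clean_str(s):
    s = re.sub(r"[^a-z0-9(),!?'`]", " ", s.lower())  # keep letters, digits, basic punctuation
    return re.sub(r"\s+", " ", s).strip()            # collapse runs of whitespace

# positive_examples = [clean_str(s) for s in positive_examples]
# negative_examples = [clean_str(s) for s in negative_examples]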

print(len(positive_examples), len(negative_examples))
print(positive_examples[:1], negative_examples[:1])

x_text = positive_examples + negative_examples  # concatenate all positive and negative texts

print(len(x_text))

# Generate labels
positive_labels = [[0, 1] for _ in positive_examples]  # one-hot: positive
negative_labels = [[1, 0] for _ in negative_examples]  # one-hot: negative

print(np.array(positive_labels).shape)

# y_data = np.concatenate([positive_labels, negative_labels], axis=0)

y_data = np.array(positive_labels + negative_labels)

print(y_data.shape)

# print([sentence.split(' ') for sentence in x_text][0])

max_document_length = max([len(sentence.split(' ')) for sentence in x_text])

print('Max number of words in a sentence:', max_document_length)  # (debug check)

print("*****************************")
# print(help(learn.preprocessing.VocabularyProcessor))
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length=max_document_length)

print(">>>>>>>>>>>>>>>")
print(x_text[0])
print(">>>>>>>>>>>>>>>")
print(type(vocab_processor.fit_transform(x_text)))  # a generator of padded id arrays

# note: each fit_transform call re-fits the vocabulary from scratch
print(list(vocab_processor.fit_transform(x_text))[2])
print(len(x_text[2].split(' ')))
print(len(list(vocab_processor.fit_transform(x_text))[2]))  # padded length == max_document_length

# print(vocab_processor.vocabulary_._mapping)

x = np.array(list(vocab_processor.fit_transform(x_text)))  # encode every sentence as a padded vector of word ids

print('Sentences before encoding:', x_text[:2])
print('Vocabulary size:', len(vocab_processor.vocabulary_))  # (debug check)
print('Sentences after encoding:', x[:2])  # (debug check)
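# A minimal, self-contained sketch of what VocabularyProcessor is doing here
# (toy sentences, assumed values): it builds a word->id mapping and pads every
# sentence with id 0 up to max_document_length.
toy_proc = learn.preprocessing.VocabularyProcessor(max_document_length=6)
toy_ids = np.array(list(toy_proc.fit_transform(['the movie was great', 'the movie was bad'])))
print(toy_ids)                    # e.g. [[1 2 3 4 0 0] [1 2 3 5 0 0]]
print(len(toy_proc.vocabulary_))  # vocabulary size, including the reserved id-0 token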



from sklearn.model_selection import train_test_split

print(x.shape)
print(y_data.shape)

x_trn_data, x_test_data, y_trn_data, y_test_data = train_test_split(x, y_data, test_size=dev_sample_percentage, shuffle=True)

print(x_trn_data.shape)
print(y_trn_data.shape)


g_b = 0
# Simple sequential batcher: each call returns the next `size` training examples
def next_batch(size):
    global g_b
    xb = x_trn_data[g_b:g_b+size]
    yb = y_trn_data[g_b:g_b+size]
    g_b = g_b + size
    return xb, yb
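# A hedged alternative to next_batch (not used below): reshuffle the training set
# each epoch so batches differ between epochs; the simple slicer above always
# replays the same order.
def shuffled_batches(x_data, y_data, size):
    idx = np.random.permutation(len(x_data))  # new random order per call
    for start in range(0, len(x_data) - size + 1, size):
        sel = idx[start:start + size]
        yield x_data[sel], y_data[sel]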


n_neurons = 128  # hidden state size (units per layer)
n_outputs = 2  # two output classes
n_layers = 3  # number of stacked RNN layers

embedding_size = 100  # word-vector dimensionality
batch_size = 64  # examples per batch
n_steps = max_document_length  # number of time steps (sequence length)
n_inputs = embedding_size  # input size per time step

X = tf.placeholder(tf.int32, [None, max_document_length])
Y = tf.placeholder(tf.float32, [None, 2])  # one-hot labels as float32 for the softmax cross-entropy

# Embedding layer: map each word id to a dense vector
print(">>>>>>>>>>> vocabulary size: ", len(vocab_processor.vocabulary_))
# W = tf.Variable(tf.random_uniform([len(vocab_processor.vocabulary_), embedding_size], -1, 1))
W = tf.Variable(tf.random_normal([len(vocab_processor.vocabulary_), embedding_size]))
X_data = tf.nn.embedding_lookup(W, X)  # shape: (batch, max_document_length, embedding_size)
print(X_data)
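# Toy illustration of embedding_lookup shapes (assumed values): a [vocab=5, dim=3]
# table gathered with a [batch=2, steps=4] id matrix yields a [2, 4, 3] tensor,
# i.e. one embedding vector per word position.
toy_table = tf.constant(np.arange(15, dtype=np.float32).reshape(5, 3))
toy_lookup_ids = tf.constant([[0, 2, 4, 1], [3, 3, 0, 2]])
with tf.Session() as toy_sess:
    print(toy_sess.run(tf.nn.embedding_lookup(toy_table, toy_lookup_ids)).shape)  # (2, 4, 3)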

# print(help(tf.contrib.rnn))
cells = [tf.contrib.rnn.LSTMCell(n_neurons) for _ in range(n_layers)]  # one LSTM cell per layer
# print(help(tf.nn.dynamic_rnn))

multi_cell = tf.contrib.rnn.MultiRNNCell(cells)  # stack the LSTM layers

outputs, states = tf.nn.dynamic_rnn(multi_cell, X_data, dtype=tf.float32)  # unroll over n_steps

print(">>>>>>>>>>>>>>>>>>>>>")
print(outputs.shape)
# print(states.shape)


logits = fully_connected(outputs[:, -1], n_outputs, activation_fn=None)  # last time step -> raw class scores (the default relu would distort the logits)


# print("full connected??? ", outputs.shape[-1].value)
# print(type(outputs))
# W_full = tf.Variable(tf.random_normal([outputs.shape[-1].value, n_outputs]))
# b = tf.Variable(tf.random_normal([n_outputs]))
# logits = tf.matmul(outputs[:, -1], W_full) + b

# from tensorflow.python.framework.tensor_shape.Dimension

# Cost/loss function
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=Y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)  # optimizer
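# What the loss computes, checked by hand in plain numpy (illustrative toy values):
# softmax cross entropy of logits [2.0, 0.5] against the one-hot label [0, 1].
z = np.array([2.0, 0.5])
p = np.exp(z) / np.exp(z).sum()  # softmax probabilities
print(-np.log(p[1]))             # per-example loss, ~1.70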

# tf.summary.scalar("loss", cost)
# summary = tf.summary.merge_all()
# global_step = 0
# Create a session
sess = tf.Session()
sess.run(tf.global_variables_initializer())  # initialize global variables
# writer = tf.summary.FileWriter(TB_SUMMARY_DIR, sess.graph)

total = x_trn_data.shape[0]  # number of training examples

correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(Y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print('Starting training...')
for epoch in range(training_epochs):
    avg_cost = 0
    total_batch = int(total / batch_size)  # number of batches per epoch
    g_b = 0  # reset the sequential batch pointer
    for i in range(total_batch):
        batch_xs, batch_ys = next_batch(batch_size)
        feed_dict = {X: batch_xs, Y: batch_ys}
        # s, c, _ = sess.run([summary, cost, optimizer], feed_dict=feed_dict)
        c, _ = sess.run([cost, optimizer], feed_dict=feed_dict)
        avg_cost += c / total_batch
        # writer.add_summary(s, global_step=global_step)
        # global_step = global_step + 1
    acc = sess.run(accuracy, feed_dict={X: x_test_data, Y: y_test_data})
    print('Epoch:', (epoch + 1), 'cost =', avg_cost, 'acc =', acc)
print('Training finished')

# Evaluate the trained model on the held-out test set
print('Accuracy:', sess.run(accuracy, feed_dict={X: x_test_data, Y: y_test_data}))
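# A hedged sketch of scoring a new review with the trained graph. The sentence is
# made up; vocab_processor.transform reuses the already-fitted vocabulary, and
# [0, 1] was defined as the positive label above, so argmax == 1 means positive.
new_ids = np.array(list(vocab_processor.transform(['a touching and well acted film'])))
pred = sess.run(tf.argmax(logits, 1), feed_dict={X: new_ids})
print('positive' if pred[0] == 1 else 'negative')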
