[TensorFlow深度学习深入]实战三·使用Word2Vec与RNN(LSTM)做文本情感分析(机器如何读懂人心)
用到了 Word2Vec 词嵌入与 RNN(LSTM)网络,不太清楚的读者可以回顾我们之前的博文。
本文分别使用全连接网络、卷积神经网络与循环神经网络实现了文本情感分析。
代码部分:
1.全连接实现
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import numpy as np
import pandas as pd
import pickle
import time
import tensorflow as tf
import collections
from tensorflow import keras
# Load the raw movie reviews and their sentiment labels (one record per
# row, no header) and extract them as numpy arrays of shape (n_samples, 1).
reviews = pd.read_csv('./2RNN/txt_deal/reviews.txt', header=None)
labels = pd.read_csv('./2RNN/txt_deal/labels.txt', header=None)
reviews_datas = reviews.values
labels_datas = labels.values
# Tokens to discard while tokenizing: empty strings produced by consecutive
# spaces (a literal " " can never result from split(" "); kept for safety).
chars = ["", " "]

def get_words(npll):
    """Flatten a (n, 1) array/list of review strings into one word list.

    Each row's first column is split on single spaces; tokens listed in
    `chars` (empty strings from repeated spaces) are dropped.

    Args:
        npll: iterable of rows whose element [0] is a review string.
    Returns:
        list of word tokens, in document order.
    """
    return [w for row in npll for w in row[0].split(" ") if w not in chars]
# Build the vocabulary: keep the (vocab_size - 1) most frequent words and
# reserve id 0 for the "<PAD>" token (also used below for out-of-vocabulary
# words).
words = get_words(reviews_datas)
vocab_size = 10000
vocab = collections.Counter(words).most_common(vocab_size-1)
#print((vocab))
count = [["<PAD>", 0]]
count.extend(vocab)
#print(count[:10])
# word <-> id lookup tables; ids follow frequency rank
# (0 == "<PAD>", 1 == most frequent word, ...).
word2id = {}
id2word = {}
for i, w in enumerate(count):
    word2id[w[0]] = i
    id2word[i] = w[0]
print(id2word[100], word2id['i'])
# Encode every review as a fixed-length (seq_len) sequence of word ids:
# long reviews are truncated, short ones right-padded with 0 ("<PAD>").
reviews_seq = [seq[0].split(" ") for seq in reviews_datas]
reviews_list = []
seq_len = 256
for seq in reviews_seq:
    # NOTE(review): every sequence is prefixed with the literal id 1, i.e.
    # the most frequent vocabulary word — presumably intended as a <START>
    # marker; confirm, since id 1 is also a real word.
    l = [1]
    for s in seq:
        # Out-of-vocabulary words fall back to "<PAD>" (id 0), so padding
        # and unknown words are conflated — a dedicated <UNK> id would be
        # cleaner.
        if s in word2id:
            pass
        else:
            s = "<PAD>"
        l.append(word2id[s])
    if(len(l)>=seq_len):
        l=l[:seq_len]
    while(len(l)<seq_len):
        l.append(0)
    reviews_list.append(l)
reviews_list = np.array(reviews_list)
# One-hot encode the labels and hold out the first 5000 samples for
# validation; the remainder is the training split.
labels_list = pd.get_dummies(labels).values
x_val = reviews_list[:5000]
partial_x_train = reviews_list[5000:]
y_val = labels_list[:5000]
partial_y_train = labels_list[5000:]
# NOTE(review): this rebinds `labels` (previously the DataFrame) to integer
# class indices; it is only used by the debug prints below.
labels = np.argmax(labels_list,axis=1)
print(reviews_list[0],labels[0])
print(reviews_list[0],len(reviews_list))
# train_data / train_labels are assigned but not used by the training loop.
train_data = reviews_list
train_labels = labels
# Hyperparameters.
train_rate=0.0001    # unused: AdamOptimizer below runs with its default lr
train_step=20        # number of epochs
batch_size=500
embed_size = 32      # word-embedding dimension
sequence_length = 256
n_classes = 2
h1_num = 32          # dense-layer sizes
h2_num = 16
h3_num = 2
# TF1-style graph: embedding -> global average pooling -> two dense layers.
x = tf.placeholder(tf.int32,shape=[None,sequence_length],name="inputx")
y = tf.placeholder(dtype=tf.float32,shape=[None,2],name="expected_y")
print(y)
# Trainable embedding table of shape (vocab_size, embed_size).
embeddings = tf.Variable(tf.random_normal([vocab_size, embed_size]))
x_1 = tf.nn.embedding_lookup(embeddings,x)
#(-1,256)-->(-1,256,32)
# Average the word embeddings over the time axis.
h1 = tf.keras.layers.GlobalAveragePooling1D()(x_1)
#(-1,256,32)-->(-1,32)
weights2 = tf.Variable(tf.random_normal(shape=[h1_num,h2_num]))
bias2 = tf.Variable(tf.fill([h2_num],0.1))
#(-1,32)-->(-1,16)
h2 = tf.nn.relu(tf.matmul(h1,weights2)+bias2)
#(-1,16)-->(-1,2)
#y_ = tf.nn.softmax(tf.matmul(h3,weights4)+bias4)
weights3 = tf.Variable(tf.random_normal(shape=[h2_num,h3_num]))
bias3 = tf.Variable(tf.fill([h3_num],0.1))
# Raw logits; softmax is applied inside the loss op below.
predy = (tf.matmul(h2,weights3)+bias3)
cost=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y,logits=predy))
opt=tf.train.AdamOptimizer().minimize(cost)
correct_pred=tf.equal(tf.argmax(predy,1),tf.argmax(y,1))
accuracy=tf.reduce_mean(tf.cast(correct_pred,tf.float32))
# Train for `train_step` epochs over mini-batches; print the just-trained
# batch's loss/accuracy each step and validation metrics once per epoch.
with tf.Session() as sess:
    saver = tf.train.Saver()
    srun = sess.run
    init = tf.global_variables_initializer()
    srun(init)
    for e in range(train_step):
        for t in range(20000//batch_size):
            ts = int(t*batch_size)
            batch_x,batch_y = partial_x_train[ts:ts+batch_size],partial_y_train[ts:ts+batch_size]
            srun(opt,{x:batch_x,y:batch_y})
            if(t%1==0):
                # Metrics are evaluated on the same batch just trained on.
                accuracy_val, cost_val = srun([accuracy,cost],{x:batch_x,y:batch_y})
                print(e,t,cost_val,accuracy_val)
        # NOTE(review): global_step=t uses the last inner-loop index, so the
        # checkpoint name repeats every epoch — presumably `e` was intended.
        saver.save(sess,'./2RNN/3_1Word2Vec/txt/saver/model.ckpt',global_step=t)
        accuracy_val, cost_val = srun([accuracy,cost],{x:x_val,y:y_val})
        print(e,cost_val,accuracy_val)
输出结果
。。。
19 37 0.29615197 0.906
19 38 0.31939483 0.87
19 39 0.4328907 0.81
19 0.42973673 0.8094
2.CNN实现
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import numpy as np
import pandas as pd
import pickle
import time
import tensorflow as tf
import collections
from tensorflow import keras
# Load raw reviews and labels (one record per row, no header) as
# (n_samples, 1) numpy arrays.
reviews = pd.read_csv('./2RNN/txt_deal/reviews.txt', header=None)
labels = pd.read_csv('./2RNN/txt_deal/labels.txt', header=None)
reviews_datas = reviews.values
labels_datas = labels.values
# Tokens to skip while tokenizing: empty strings left by repeated spaces
# (the literal " " cannot appear after split(" "); kept defensively).
chars = ["", " "]

def get_words(npll):
    """Collect every word from a (n, 1) array/list of review strings.

    Splits each row's text on single spaces and drops any token listed
    in `chars`.
    """
    words = []
    for row in npll:
        tokens = row[0].split(" ")
        words.extend(tok for tok in tokens if tok not in chars)
    return words
# Build the vocabulary from the (vocab_size - 1) most frequent words;
# id 0 is reserved for "<PAD>" (also used for out-of-vocabulary words).
words = get_words(reviews_datas)
vocab_size = 10000
vocab = collections.Counter(words).most_common(vocab_size-1)
#print((vocab))
count = [["<PAD>", 0]]
count.extend(vocab)
#print(count[:10])
# word <-> id lookup tables; ids follow frequency rank.
word2id = {}
id2word = {}
for i, w in enumerate(count):
    word2id[w[0]] = i
    id2word[i] = w[0]
print(id2word[100], word2id['i'])
# Encode reviews as fixed-length (seq_len) id sequences: truncate long
# reviews, right-pad short ones with 0 ("<PAD>").
reviews_seq = [seq[0].split(" ") for seq in reviews_datas]
reviews_list = []
seq_len = 256
for seq in reviews_seq:
    # NOTE(review): sequences start with literal id 1 (the most frequent
    # word) — presumably a <START> marker; confirm.
    l = [1]
    for s in seq:
        # Unknown words fall back to "<PAD>" (id 0), conflating padding
        # and out-of-vocabulary tokens.
        if s in word2id:
            pass
        else:
            s = "<PAD>"
        l.append(word2id[s])
    if(len(l)>=seq_len):
        l=l[:seq_len]
    while(len(l)<seq_len):
        l.append(0)
    reviews_list.append(l)
reviews_list = np.array(reviews_list)
# One-hot labels; first 5000 samples are held out for validation.
labels_list = pd.get_dummies(labels).values
x_val = reviews_list[:5000]
partial_x_train = reviews_list[5000:]
y_val = labels_list[:5000]
partial_y_train = labels_list[5000:]
# Hyperparameters.
train_rate=0.0001    # unused: AdamOptimizer runs with its default lr
train_step=50        # number of epochs
batch_size=500
embed_size = 16      # word-embedding dimension (16 here, unlike the FC demo)
sequence_length = 256
n_classes = 2
h1_num = 32
h2_num = 16
h3_num = 2
#(-1,256)
x = tf.placeholder(tf.int32,shape=[None,sequence_length],name="inputx")
# Trainable word-embedding table (vocab_size, embed_size).
embeddings = tf.Variable(tf.random_normal([vocab_size, embed_size]))
#(-1,256)->#(-1,256,32)
# NOTE(review): with embed_size = 16 the embedded shape is actually
# (-1, 256, 16); the shape comment above is stale.
x_1 = tf.nn.embedding_lookup(embeddings,x)
y=tf.placeholder(dtype=tf.float32,shape=[None,h3_num],name="expected_y")
def CNN(x):
    """Two conv/pool stages followed by two dense layers; returns logits.

    Args:
        x: embedded input of shape (-1, sequence_length, embed_size).
    Returns:
        (-1, 2) logits tensor (softmax is applied in the loss).
    """
    # Reshape the input to what conv2d expects:
    # (batch_size, sequence_length, frame_size, depth).
    #(-1,256,16)->(-1,256,16,1)
    x = tf.reshape(x,[-1,sequence_length,embed_size,1])
    # Initial 2x2 max-pool halves both spatial dims.
    #(-1,256,16,1)->(-1,128,8,1)
    pool0 = tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    # Layer 1: convolution.
    #(-1,128,8,1)->(-1,128,8,32)
    conv1_weights = tf.get_variable("conv1_weights", [5, 5, 1, 32], initializer=tf.truncated_normal_initializer(stddev=0.1)) # 5x5 filter, input depth 1, output depth 32
    conv1_biases = tf.get_variable("conv1_biases", [32], initializer=tf.constant_initializer(0.0))
    conv1 = tf.nn.conv2d(pool0, conv1_weights, strides=[1, 1, 1, 1], padding='SAME') # stride 1, zero ('SAME') padding
    relu1 = tf.nn.relu(tf.nn.bias_add(conv1, conv1_biases)) # ReLU non-linearity
    # Layer 2: max pooling, 2x2 window, stride 2, zero padding.
    #(-1,128,8,32)->(-1,64,4,32)
    pool1 = tf.nn.max_pool(relu1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    # Layer 3: convolution.
    #(-1,64,4,32)->(-1,64,4,64)
    conv2_weights = tf.get_variable("conv2_weights", [3, 3, 32, 64], initializer=tf.truncated_normal_initializer(stddev=0.1)) # 3x3 filter, input depth 32, output depth 64
    conv2_biases = tf.get_variable("conv2_biases", [64], initializer=tf.constant_initializer(0.0))
    conv2 = tf.nn.conv2d(pool1, conv2_weights, strides=[1, 1, 1, 1], padding='SAME') # stride 1, zero padding
    relu2 = tf.nn.relu( tf.nn.bias_add(conv2, conv2_biases) )
    # Layer 4: max pooling, 2x2 window, stride 2, zero padding.
    #(-1,64,4,64)->(-1,32,2,64)
    pool2 = tf.nn.max_pool(relu2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')
    # Layer 5: fully connected — flatten the feature map into a vector.
    fc1_weights = tf.get_variable("fc1_weights", [32 * 2 * 64, 256], initializer=tf.truncated_normal_initializer(stddev=0.1)) # 32*2*64 = 4096 input features
    fc1_baises = tf.get_variable("fc1_baises", [256], initializer=tf.constant_initializer(0.1))
    #(-1,32,2,64)->(-1,32*2*64)
    pool2_vector = tf.reshape(pool2, [-1, 32 * 2 * 64])
    #(-1,32*2*64)->(-1,256)
    fc1 = tf.nn.relu(tf.matmul(pool2_vector, fc1_weights) + fc1_baises)
    # Output layer: 256 -> 2 logits.
    fc2_weights = tf.get_variable("fc2_weights", [256, 2], initializer=tf.truncated_normal_initializer(stddev=0.1))
    fc2_baises = tf.get_variable("fc2_baises", [2], initializer=tf.constant_initializer(0.1))
    #(-1,256)->(-1,2)
    h2 = tf.matmul(fc1, fc2_weights) + fc2_baises
    return (h2)
# NOTE(review): the original comment says "add a Dropout layer to reduce
# overfitting", but no dropout is actually applied anywhere in the graph.
predy = CNN(x_1)
cost=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y,logits=predy))
opt=tf.train.AdamOptimizer().minimize(cost)
correct_pred=tf.equal(tf.argmax(predy,1),tf.argmax(y,1))
accuracy=tf.reduce_mean(tf.cast(correct_pred,tf.float32))
# Train for `train_step` epochs over mini-batches; print batch metrics each
# step and validation metrics once per epoch.
with tf.Session() as sess:
    saver = tf.train.Saver()
    srun = sess.run
    init = tf.global_variables_initializer()
    srun(init)
    for e in range(train_step):
        for t in range(20000//batch_size):
            ts = int(t*batch_size)
            batch_x,batch_y = partial_x_train[ts:ts+batch_size],partial_y_train[ts:ts+batch_size]
            srun(opt,{x:batch_x,y:batch_y})
            if(t%1==0):
                # Metrics are evaluated on the same batch just trained on.
                accuracy_val, cost_val = srun([accuracy,cost],{x:batch_x,y:batch_y})
                print(e,t,cost_val,accuracy_val)
        # NOTE(review): absolute, machine-specific checkpoint path — works
        # only on the author's machine.
        saver.save(sess,'/Users/yss/YSSFiles/TFAPP/2RNN/txt_deal/saver/model',global_step=e)
        accuracy_val, cost_val = srun([accuracy,cost],{x:x_val,y:y_val})
        print(e,cost_val,accuracy_val)
输出结果
。。。
39 37 0.11854752 0.974
39 38 0.05035739 0.994
39 39 0.025472356 1.0
39 1.1234461 0.657
3.RNN实现
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import numpy as np
import pandas as pd
import pickle
import time
import tensorflow as tf
import collections
from tensorflow import keras
# Load raw reviews and labels (one record per row, no header) as
# (n_samples, 1) numpy arrays.
reviews = pd.read_csv('./2RNN/txt_deal/reviews.txt', header=None)
labels = pd.read_csv('./2RNN/txt_deal/labels.txt', header=None)
reviews_datas = reviews.values
labels_datas = labels.values
# Tokens to discard while tokenizing: empty strings produced by consecutive
# spaces (a literal " " can never result from split(" "); kept for safety).
chars = ["", " "]

def get_words(npll):
    """Flatten a (n, 1) array/list of review strings into one word list.

    Each row's first column is split on single spaces; tokens listed in
    `chars` (empty strings from repeated spaces) are dropped.

    Args:
        npll: iterable of rows whose element [0] is a review string.
    Returns:
        list of word tokens, in document order.
    """
    return [w for row in npll for w in row[0].split(" ") if w not in chars]
# Build the vocabulary from the (vocab_size - 1) most frequent words;
# id 0 is reserved for "<PAD>" (also used for out-of-vocabulary words).
words = get_words(reviews_datas)
vocab_size = 10000
vocab = collections.Counter(words).most_common(vocab_size-1)
#print((vocab))
count = [["<PAD>", 0]]
count.extend(vocab)
#print(count[:10])
# word <-> id lookup tables; ids follow frequency rank.
word2id = {}
id2word = {}
for i, w in enumerate(count):
    word2id[w[0]] = i
    id2word[i] = w[0]
print(id2word[100], word2id['i'])
# Encode reviews as fixed-length (seq_len) id sequences: truncate long
# reviews, right-pad short ones with 0 ("<PAD>").
reviews_seq = [seq[0].split(" ") for seq in reviews_datas]
reviews_list = []
seq_len = 256
for seq in reviews_seq:
    # NOTE(review): sequences start with literal id 1 (the most frequent
    # word) — presumably a <START> marker; confirm.
    l = [1]
    for s in seq:
        # Unknown words fall back to "<PAD>" (id 0), conflating padding
        # and out-of-vocabulary tokens.
        if s in word2id:
            pass
        else:
            s = "<PAD>"
        l.append(word2id[s])
    if(len(l)>=seq_len):
        l=l[:seq_len]
    while(len(l)<seq_len):
        l.append(0)
    reviews_list.append(l)
reviews_list = np.array(reviews_list)
# One-hot labels; first 5000 samples are held out for validation.
labels_list = pd.get_dummies(labels).values
x_val = reviews_list[:5000]
partial_x_train = reviews_list[5000:]
y_val = labels_list[:5000]
partial_y_train = labels_list[5000:]
# Hyperparameters.
train_rate=0.0001    # unused: AdamOptimizer runs with its default lr
train_step=50        # number of epochs
batch_size=500
embed_size = 32      # word-embedding dimension
sequence_length = 256
n_classes = 2
h1_num = 32          # LSTM hidden-state size
h2_num = 16
h3_num = 2
#(-1,256)
x = tf.placeholder(tf.int32,shape=[None,sequence_length],name="inputx")
# Trainable word-embedding table (vocab_size, embed_size).
embeddings = tf.Variable(tf.random_normal([vocab_size, embed_size]))
#(-1,256)->#(-1,256,32)
x_1 = tf.nn.embedding_lookup(embeddings,x)
y=tf.placeholder(dtype=tf.float32,shape=[None,h3_num],name="expected_y")
# Dense-layer parameters used after the LSTM:
#   "h1": (h1_num, h2_num) projection of the last LSTM output,
#   "h2": (h2_num, h3_num) output layer producing the 2-class logits.
weights={
    "h1":tf.Variable(tf.random_normal(shape=[h1_num,h2_num])),
    "h2":tf.Variable(tf.random_normal(shape=[h2_num,h3_num])),
}
bias={
    "h1":tf.Variable(tf.fill([h2_num],0.1)),
    "h2":tf.Variable(tf.fill([h3_num],0.1)),
}
def RNN(x,weights,bias):
    """Run a single-layer LSTM over x and project its last timestep.

    Args:
        x: embedded input, shape (batch_size, sequence_length, frame_size)
           — the layout dynamic_rnn expects.
        weights: (h1_num, h2_num) projection matrix.
        bias: (h2_num,) bias vector.
    Returns:
        (batch_size, h2_num) pre-activation tensor.
    """
    rnn_cell=tf.nn.rnn_cell.BasicLSTMCell(h1_num)
    # `output` has shape (batch_size, sequence_length, rnn_cell.output_size).
    output,states=tf.nn.dynamic_rnn(rnn_cell,x,dtype=tf.float32)
    # Keep only the final timestep's output and apply the dense projection.
    h = tf.matmul(output[:,-1,:],weights)+bias
    return (h)
# ReLU on the projected last LSTM output, then the final linear layer.
h2 = tf.nn.relu(RNN(x_1,weights["h1"],bias["h1"]))
# Raw logits; softmax is applied inside the loss op below.
predy = tf.matmul(h2,weights["h2"])+bias["h2"]
cost=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y,logits=predy))
opt=tf.train.AdamOptimizer().minimize(cost)
correct_pred=tf.equal(tf.argmax(predy,1),tf.argmax(y,1))
accuracy=tf.reduce_mean(tf.cast(correct_pred,tf.float32))
# Train for `train_step` epochs over mini-batches; print batch metrics each
# step and validation metrics once per epoch.
with tf.Session() as sess:
    saver = tf.train.Saver()
    srun = sess.run
    init = tf.global_variables_initializer()
    srun(init)
    for e in range(train_step):
        for t in range(20000//batch_size):
            ts = int(t*batch_size)
            batch_x,batch_y = partial_x_train[ts:ts+batch_size],partial_y_train[ts:ts+batch_size]
            srun(opt,{x:batch_x,y:batch_y})
            if(t%1==0):
                # Metrics are evaluated on the same batch just trained on.
                accuracy_val, cost_val = srun([accuracy,cost],{x:batch_x,y:batch_y})
                print(e,t,cost_val,accuracy_val)
        # NOTE(review): absolute, machine-specific checkpoint path — works
        # only on the author's machine.
        saver.save(sess,'/Users/yss/YSSFiles/TFAPP/2RNN/txt_deal/saver/model',global_step=e)
        accuracy_val, cost_val = srun([accuracy,cost],{x:x_val,y:y_val})
        print(e,cost_val,accuracy_val)
输出结果
。。。
49 37 0.21631543 0.92
49 38 0.21078381 0.924
49 39 0.36801508 0.854
49 0.82634455 0.7292