TensorFlow Twitter-RNN-评论情感分析(L2正则)-CSDN博客

github： github.com/yangjinghit…

import pandas as pd
import numpy as np
复制代码

# Load the tweets dataset from 'Tweets.csv' in the working directory.
data = pd.read_csv('Tweets.csv')

复制代码

data.head(2)
复制代码

	tweet_id	airline_sentiment	airline_sentiment_confidence	negativereason	negativereason_confidence	airline	airline_sentiment_gold	name	negativereason_gold	retweet_count	text	tweet_coord	tweet_created	tweet_location	user_timezone
0	570306133677760513	neutral	1.0000	NaN	NaN	Virgin America	NaN	cairdin	NaN	0	@VirginAmerica What @dhepburn said.	NaN	2015-02-24 11:35:52 -0800	NaN	Eastern Time (US & Canada)
1	570301130888122368	positive	0.3486	NaN	0.0	Virgin America	NaN	jnardino	NaN	0	@VirginAmerica plus you've added commercials t...	NaN	2015-02-24 11:15:59 -0800	NaN	Pacific Time (US & Canada)

# Keep only the sentiment label and the raw tweet text; every other column is dropped.
data = data[['airline_sentiment', 'text']]
复制代码

# Dump every tweet, followed by a newline, into the plain-text corpus file
# 'twee' that word2vec is trained on.
# The file is opened with 'w' rather than 'a' so that re-running this cell
# rewrites the corpus instead of appending a second copy of every tweet.
with open('twee', 'w', encoding='utf-8') as f:
    for string in data.text:
        # write() is the right call for a single string; writelines() would
        # iterate over it character by character.
        f.write(string + '\n')
复制代码

from gensim.models import word2vec
复制代码

# Train 300-dimensional word2vec embeddings on the dumped tweets.
# LineSentence yields one sentence per line of the file, so each line written
# to 'twee' is its own training sentence. Text8Corpus treats the file as one
# continuous word stream, which lets context windows span unrelated tweets.
sentences = word2vec.LineSentence("twee")
model = word2vec.Word2Vec(sentences, size=300)
复制代码

# Keep only the trained word vectors and drop the reference to the full model.
word_vectors = model.wv
del model
复制代码

def _tweet_to_vectors(text):
    """Return the embeddings of the whitespace-separated tokens of *text*.

    Tokens that are not present in ``word_vectors`` are skipped.
    """
    return [word_vectors[token] for token in text.split() if token in word_vectors]


# One list of word vectors per tweet.
data['vec'] = data['text'].apply(_tweet_to_vectors)
复制代码

# Keep only tweets with more than five in-vocabulary tokens.
enough_tokens = data['vec'].apply(len) > 5
data = data[enough_tokens]
复制代码

data.head(3)
复制代码

	airline_sentiment	text	vec
1	positive	@VirginAmerica plus you've added commercials t...	[[2.2402475, 0.15890086, -0.082046695, 0.80472...
2	neutral	@VirginAmerica I didn't today... Must mean I n...	[[2.2402475, 0.15890086, -0.082046695, 0.80472...
3	negative	@VirginAmerica it's really aggressive to blast...	[[2.2402475, 0.15890086, -0.082046695, 0.80472...

# The raw text is no longer needed once the 'vec' column exists.
del data['text']
复制代码

data.airline_sentiment.unique()
复制代码

array(['positive', 'neutral', 'negative'], dtype=object)
复制代码

data.airline_sentiment.value_counts()
复制代码

negative    9007
neutral     2789
positive    2013
Name: airline_sentiment, dtype: int64
复制代码

# One-hot encoding of the three sentiment labels:
# index 0 = neutral, index 1 = positive, index 2 = negative.
dic = {
    sentiment: np.array(one_hot)
    for sentiment, one_hot in (
        ('neutral', [1, 0, 0]),
        ('positive', [0, 1, 0]),
        ('negative', [0, 0, 1]),
    )
}
复制代码

# Replace each sentiment string by its one-hot array from `dic`.
data['cat'] = data.airline_sentiment.map(dic)
复制代码

# The string label is redundant now that the one-hot 'cat' column exists.
del data['airline_sentiment']
复制代码

data.columns
复制代码

Index(['vec', 'cat'], dtype='object')
复制代码

# Renumber the rows 0..n-1 after filtering, discarding the old index
# instead of keeping it as an 'index' column.
data = data.reset_index(drop=True)
复制代码

# Longest tweet, measured in word vectors; used as the padded sequence length.
maxlength = max(map(len, data.vec))
复制代码

maxlength
复制代码

36
复制代码

data.head(2)
复制代码

	vec	cat
0	[[2.2402475, 0.15890086, -0.082046695, 0.80472...	[0, 1, 0]
1	[[2.2402475, 0.15890086, -0.082046695, 0.80472...	[1, 0, 0]

def pad(x, max_len=None, dim=300):
    """Zero-pad a sequence of word vectors to a fixed number of rows.

    Args:
        x: Sequence of word vectors, each of length ``dim``. May be empty.
        max_len: Number of rows in the result. When omitted, the module-level
            ``maxlength`` is used, as in the original one-argument form.
        dim: Length of each word vector. Defaults to 300, the embedding size
            used when training word2vec in this script.

    Returns:
        A ``(max_len, dim)`` array whose first ``len(x)`` rows hold ``x`` and
        whose remaining rows are zero.

    Raises:
        ValueError: Raised by NumPy when ``x`` has more than ``max_len`` rows
            or its rows do not have length ``dim``.
    """
    if max_len is None:
        max_len = maxlength
    xl = np.zeros((max_len, dim))
    n_rows = len(x)
    # Skip the assignment for an empty sequence: an empty list cannot be
    # broadcast into a (0, dim) slice.
    if n_rows:
        xl[:n_rows] = x
    return xl
复制代码

# Zero-pad every tweet's list of word vectors with `pad`.
dataset = data.vec.apply(pad)
复制代码

dataset.head(2)
复制代码

0    [[2.2402474880218506, 0.15890085697174072, -0....
1    [[2.2402474880218506, 0.15890085697174072, -0....
Name: vec, dtype: object
复制代码

len(dataset)
复制代码

13809
复制代码

# Join the per-row one-hot arrays end to end, then fold them back into
# one row per example.
flat_labels = np.concatenate(data.cat)
labels = flat_labels.reshape(len(data.cat), -1)

复制代码

np.shape(labels)
复制代码

(13809, 3)
复制代码

# Join the padded per-tweet matrices along the first axis, then reshape into
# a single (examples, maxlength, 300) array.
stacked_rows = np.concatenate(dataset)
data_ = stacked_rows.reshape(len(dataset), maxlength, 300)
复制代码

np.shape(data_)
复制代码

(13809, 36, 300)
复制代码

# Random ordering of the row positions, used to shuffle features and labels together.
index = np.random.permutation(int(len(data)))
复制代码

# Apply the same permutation to labels and features so the rows stay aligned.
label = labels[index]
dataset = data_[index]
复制代码

# The first 12000 shuffled rows are the training set; the rest is held out for testing.
label_train, label_test = label[:12000], label[12000:]
dataset_train, dataset_test = dataset[:12000], dataset[12000:]
复制代码

import tensorflow as tf
复制代码

/anaconda3/envs/py35/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: compiletime version 3.6 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.5
  return f(*args, **kwds)
/anaconda3/envs/py35/lib/python3.5/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
复制代码

# Hyper-parameters of the model and the training run.
learning_rate = 0.005  # step size handed to the Adam optimizer
batch_size = 300  # examples per training mini-batch
n_input = 300  # features per time step (matches the word-vector size)
n_steps = maxlength  # time steps per example (the padded tweet length)
n_hidden = 128  # number of units in the GRU cell
n_classes = 3  # width of the one-hot label vectors
复制代码

# Graph inputs: padded word-vector sequences, one-hot labels, and the keep
# probability for dropout on the RNN outputs.
x = tf.placeholder(tf.float32, shape=[None, n_steps, n_input])
y = tf.placeholder(tf.float32, shape=[None, n_classes])
output_keep_prob = tf.placeholder(tf.float32)
复制代码

# L2 regularizer with scale 0.01; it is applied to the trainable variables further down.
reg = tf.contrib.layers.l2_regularizer(scale=0.01)

复制代码

def length(shuju):
    """Return the number of non-padding time steps of each sequence in a batch.

    A time step counts as used when at least one of its features is non-zero,
    so all-zero padding rows contribute nothing to the count.

    Args:
        shuju: Tensor indexed as (batch, time, feature), e.g. the input
            placeholder ``x``.

    Returns:
        A tensor with one entry per sequence holding the count of used time
        steps. It is the sum of ``tf.sign`` flags, so it is not cast to an
        integer type here.
    """
    # Largest absolute feature per time step: zero only for all-zero steps.
    # `axis` replaces the deprecated `reduction_indices` keyword.
    step_magnitude = tf.reduce_max(tf.abs(shuju), axis=2)
    # 1.0 for used steps, 0.0 for padding.
    used = tf.sign(step_magnitude)
    return tf.reduce_sum(used, axis=1)
复制代码

# GRU cell whose kernel and bias start from a very narrow truncated normal,
# wrapped so that dropout is applied to its outputs.
_gru_cell = tf.contrib.rnn.GRUCell(
    n_hidden,
    kernel_initializer=tf.truncated_normal_initializer(stddev=0.0001),
    bias_initializer=tf.truncated_normal_initializer(stddev=0.0001))
cell = tf.contrib.rnn.DropoutWrapper(_gru_cell, output_keep_prob=output_keep_prob)

复制代码

# Unroll the GRU over the padded input, passing the per-example length
# computed by `length` as the sequence length.
output, _ = tf.nn.dynamic_rnn(
    cell=cell,
    inputs=x,
    sequence_length=length(x),
    dtype=tf.float32)
复制代码

output.get_shape()
复制代码

TensorShape([Dimension(None), Dimension(36), Dimension(128)])
复制代码

# Pick, for every example, the RNN output at its last non-padding time step.
# The batch dimension is read from the tensor at run time instead of using the
# training `batch_size` constant, so the same graph also accepts batches of
# any other size (for example the held-out test set).
batch_dim = tf.shape(output)[0]
seq_len = tf.cast(length(x), tf.int32)
# Position of each example's last used step in the flattened (batch * time) output.
index = tf.range(0, batch_dim) * n_steps + (seq_len - 1)
flat = tf.reshape(output, [-1, int(output.get_shape()[2])])
last = tf.gather(flat, index)
复制代码

# Keep probability for dropout after the dense layer (fed at run time).
keep_prob = tf.placeholder(tf.float32)
# 64-unit ReLU layer on top of the last relevant RNN output.
fc_1 = tf.contrib.layers.fully_connected(
    inputs=last,
    num_outputs=64,
    activation_fn=tf.nn.relu,
    weights_initializer=tf.truncated_normal_initializer(stddev=0.01))
fc1_drop = tf.nn.dropout(fc_1, keep_prob=keep_prob)
复制代码

# Output layer: linear projection of the 64 dense features onto the classes,
# followed by a softmax.
weight = tf.Variable(tf.truncated_normal([64, n_classes], stddev=0.001))
bias = tf.Variable(tf.constant(0.1, shape=[n_classes]))
logits = tf.matmul(fc1_drop, weight) + bias
prediction = tf.nn.softmax(logits)
复制代码

# Cross-entropy summed (not averaged) over the batch.
# The softmax output is clipped away from zero before taking the log, so a
# saturated prediction cannot produce log(0) = -inf and turn the loss and
# gradients into NaN.
cross_entropy = -tf.reduce_sum(y * tf.log(tf.clip_by_value(prediction, 1e-10, 1.0)))
复制代码

# Every trainable variable created so far in the graph; all of them are regularized below.
weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
复制代码

# Build the L2 penalty over the collected variables. The returned tensor is
# not kept here; the penalty is read back from the REGULARIZATION_LOSSES
# collection below.
# NOTE(review): this relies on apply_regularization registering its result in
# that collection - confirm against the TensorFlow version in use.
tf.contrib.layers.apply_regularization(reg, weights_list=weights)
复制代码

<tf.Tensor 'get_regularization_penalty:0' shape=() dtype=float32>
复制代码

# Regularization terms registered in the graph; they are summed into the training loss.
reg_ws = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
复制代码

# Adam on cross-entropy plus the L2 penalty, with every gradient clipped to a
# norm of at most 5 before it is applied.
optimizer = tf.train.AdamOptimizer(learning_rate, beta1=0.9)
total_loss = cross_entropy + tf.reduce_sum(reg_ws)
grads = [
    # Variables without a gradient are passed through untouched.
    (tf.clip_by_norm(grad, 5), var) if grad is not None else (grad, var)
    for grad, var in optimizer.compute_gradients(total_loss)
]
train_op = optimizer.apply_gradients(grads)
复制代码

/anaconda3/envs/py35/lib/python3.5/site-packages/tensorflow/python/ops/gradients_impl.py:97: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


WARNING:tensorflow:From /anaconda3/envs/py35/lib/python3.5/site-packages/tensorflow/python/ops/clip_ops.py:110: calling reduce_sum (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
复制代码

# Fraction of examples whose most probable class matches the one-hot label.
predicted_class = tf.argmax(prediction, 1)
true_class = tf.argmax(y, 1)
correct_pred = tf.equal(predicted_class, true_class)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

复制代码

def generatebatch(X, Y, n_examples, batch_size):
    """Yield consecutive, equally sized mini-batches of features and labels.

    Only complete batches are produced: the trailing
    ``n_examples % batch_size`` examples are never yielded.

    Args:
        X: Sliceable collection of features.
        Y: Sliceable collection of labels, aligned with ``X``.
        n_examples: Number of leading examples to draw batches from.
        batch_size: Number of examples per batch.

    Yields:
        ``(X[start:end], Y[start:end])`` pairs, in order.
    """
    full_batches = n_examples // batch_size
    for lo in range(0, full_batches * batch_size, batch_size):
        hi = lo + batch_size
        yield X[lo:hi], Y[lo:hi]
复制代码

# Session in which the graph built above is initialized and trained.
sess = tf.Session()

复制代码

# Give every variable its initial value before training starts.
init = tf.global_variables_initializer()
sess.run(init)
复制代码

# Saver used by the training loop to write a checkpoint after every pass.
saver = tf.train.Saver()
复制代码

# Train for 18 passes over the training set. Each pass reshuffles the data,
# runs one optimizer step per full mini-batch, then reports metrics and
# writes a checkpoint.
for step in range(18):
    # Reshuffle features and labels with the same permutation.
    index_ = np.random.permutation(int(len(dataset_train)))
    dataset_train = dataset_train[index_]
    label_train = label_train[index_]
    for batch_x, batch_y in generatebatch(dataset_train, label_train, len(label_train), batch_size):
        sess.run(train_op, feed_dict={x: batch_x, y: batch_y, keep_prob: 0.5, output_keep_prob: 0.5})
    # Metrics are measured on the last mini-batch of the pass with dropout
    # disabled. Both tensors are fetched in one run, so the batch goes through
    # the network once instead of twice.
    acc, loss = sess.run(
        [accuracy, cross_entropy],
        feed_dict={x: batch_x, y: batch_y, keep_prob: 1, output_keep_prob: 1})
    saver.save(sess, './lesson0', global_step=step)
    print("Iter" + str(step) + "MiniBatch Loss =" + "{:.6f}".format(loss) + ", Training Accuracy = " + "{:.5f}".format(acc))
print("Optimization Finished!")
复制代码

Iter0MiniBatch Loss =214.256958, Training Accuracy = 0.66667
Iter1MiniBatch Loss =173.106171, Training Accuracy = 0.76333
Iter2MiniBatch Loss =163.925598, Training Accuracy = 0.80333
Iter3MiniBatch Loss =158.836716, Training Accuracy = 0.77667
Iter4MiniBatch Loss =155.008820, Training Accuracy = 0.79667
Iter5MiniBatch Loss =131.040298, Training Accuracy = 0.83667
Iter6MiniBatch Loss =133.507889, Training Accuracy = 0.80667
Iter7MiniBatch Loss =114.443909, Training Accuracy = 0.86333
Iter8MiniBatch Loss =103.080223, Training Accuracy = 0.86333
Iter9MiniBatch Loss =99.932602, Training Accuracy = 0.90000
Iter10MiniBatch Loss =93.207428, Training Accuracy = 0.86000
Iter11MiniBatch Loss =67.471329, Training Accuracy = 0.93000
Iter12MiniBatch Loss =62.449608, Training Accuracy = 0.92333
Iter13MiniBatch Loss =50.676277, Training Accuracy = 0.93000
Iter14MiniBatch Loss =55.832417, Training Accuracy = 0.92333
Iter15MiniBatch Loss =44.194443, Training Accuracy = 0.96333
Iter16MiniBatch Loss =30.585236, Training Accuracy = 0.95667
Iter17MiniBatch Loss =48.206429, Training Accuracy = 0.94333
Optimization Finished!
复制代码