TensorFlow Twitter RNN: Tweet Sentiment Analysis (with L2 Regularization)

github: github.com/yangjinghit…

import pandas as pd
import numpy as np
data = pd.read_csv('Tweets.csv')

data.head(2)
| | tweet_id | airline_sentiment | airline_sentiment_confidence | negativereason | negativereason_confidence | airline | airline_sentiment_gold | name | negativereason_gold | retweet_count | text | tweet_coord | tweet_created | tweet_location | user_timezone |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 570306133677760513 | neutral | 1.0000 | NaN | NaN | Virgin America | NaN | cairdin | NaN | 0 | @VirginAmerica What @dhepburn said. | NaN | 2015-02-24 11:35:52 -0800 | NaN | Eastern Time (US & Canada) |
| 1 | 570301130888122368 | positive | 0.3486 | NaN | 0.0 | Virgin America | NaN | jnardino | NaN | 0 | @VirginAmerica plus you've added commercials t... | NaN | 2015-02-24 11:15:59 -0800 | NaN | Pacific Time (US & Canada) |
# keep only the label column and the raw tweet text
data = data[['airline_sentiment', 'text']]

# write one tweet per line as a plain-text corpus for word2vec
# ('w' rather than 'a', so re-running does not append duplicate lines)
with open('twee', 'w', encoding='utf-8') as f:
    for string in data.text:
        f.write(string + '\n')
from gensim.models import word2vec
# Text8Corpus simply splits the file on whitespace, so it doubles as a tokenizer
sentences = word2vec.Text8Corpus("twee")
model = word2vec.Word2Vec(sentences, size=300)  # 300-dimensional vectors (gensim < 4.0 API)
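Note: gensim 4.0 renamed `size` to `vector_size` (and `iter` to `epochs`); on a newer gensim the equivalent call would be:

# gensim >= 4.0 equivalent (sketch): `size` became `vector_size`
model = word2vec.Word2Vec(sentences, vector_size=300)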
word_vectors = model.wv  # keep only the lightweight KeyedVectors
del model                # free the full model; the vectors are all we need
# map each tweet to a list of word vectors; words missing from the
# word2vec vocabulary are silently dropped
data['vec'] = data.text.apply(lambda x: [word_vectors[w] for w in x.split() if w in word_vectors])
# keep only tweets with more than 5 in-vocabulary tokens
data = data[data['vec'].apply(lambda x: len(x) > 5)]
data.head(3)
| | airline_sentiment | text | vec |
|---|---|---|---|
| 1 | positive | @VirginAmerica plus you've added commercials t... | [[2.2402475, 0.15890086, -0.082046695, 0.80472... |
| 2 | neutral | @VirginAmerica I didn't today... Must mean I n... | [[2.2402475, 0.15890086, -0.082046695, 0.80472... |
| 3 | negative | @VirginAmerica it's really aggressive to blast... | [[2.2402475, 0.15890086, -0.082046695, 0.80472... |
del data['text']
data.airline_sentiment.unique()
array(['positive', 'neutral', 'negative'], dtype=object)
data.airline_sentiment.value_counts()
negative    9007
neutral     2789
positive    2013
Name: airline_sentiment, dtype: int64
# one-hot encode the three sentiment classes
dic = {'neutral': np.array([1, 0, 0]), 'positive': np.array([0, 1, 0]), 'negative': np.array([0, 0, 1])}
data['cat'] = data.airline_sentiment.map(dic)
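The same one-hot encoding can also come from pandas; a minimal sketch (the explicit column order matches the dict above):

onehot = pd.get_dummies(data.airline_sentiment)[['neutral', 'positive', 'negative']]
print(onehot.head(2).values)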
del data['airline_sentiment']
data.columns
Index(['vec', 'cat'], dtype='object')
data = data.reset_index(drop=True)  # re-number rows after the filtering above
maxlength = max(len(x) for x in data.vec)  # token count of the longest remaining tweet
maxlength
36
data.head(2)
| | vec | cat |
|---|---|---|
| 0 | [[2.2402475, 0.15890086, -0.082046695, 0.80472... | [0, 1, 0] |
| 1 | [[2.2402475, 0.15890086, -0.082046695, 0.80472... | [1, 0, 0] |
def pad(x):
    # zero-pad each sequence of word vectors up to the common shape (36, 300)
    xl = np.zeros((maxlength, 300))
    xl[:len(x)] = x
    return xl
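A quick sanity check of pad() on a toy input (illustrative only):

toy = np.ones((2, 300))          # a "tweet" of two word vectors
padded = pad(toy)
assert padded.shape == (maxlength, 300)
assert np.all(padded[2:] == 0)   # everything past the real tokens is zero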
dataset = data.vec.apply(pad)  # every entry is now a (36, 300) array
dataset.head(2)
0    [[2.2402474880218506, 0.15890085697174072, -0....
1    [[2.2402474880218506, 0.15890085697174072, -0....
Name: vec, dtype: object
len(dataset)
13809
# stack the per-row one-hot vectors into a (n_samples, 3) label matrix
labels = np.concatenate(data.cat).reshape(len(data.cat), -1)

np.shape(labels)
(13809, 3)
# stack the padded sequences into a (n_samples, 36, 300) array
data_ = np.concatenate(dataset).reshape(len(dataset), maxlength, 300)
np.shape(data_)
(13809, 36, 300)
# shuffle examples and labels with the same permutation
index = np.random.permutation(len(data))
label = labels[index]
dataset = data_[index]
# hold out everything after the first 12000 shuffled examples for testing
label_train = label[:12000]
dataset_train = dataset[:12000]
label_test = label[12000:]
dataset_test = dataset[12000:]
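An equivalent split via scikit-learn (a sketch; `train_test_split` shuffles internally, so the manual permutation above would not be needed):

from sklearn.model_selection import train_test_split
dataset_train, dataset_test, label_train, label_test = train_test_split(
    data_, labels, test_size=1809, random_state=42)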
import tensorflow as tf
/anaconda3/envs/py35/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: compiletime version 3.6 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.5
  return f(*args, **kwds)
/anaconda3/envs/py35/lib/python3.5/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
learning_rate = 0.005
batch_size = 300
n_input = 300        # word-vector dimensionality
n_steps = maxlength  # sequence length after padding (36)
n_hidden = 128       # GRU state size
n_classes = 3
x = tf.placeholder(tf.float32, [None, n_steps, n_input])
y = tf.placeholder(tf.float32, [None, n_classes])
output_keep_prob = tf.placeholder(tf.float32)  # dropout keep-probability for the GRU outputs
# L2 penalty: each weight tensor w contributes scale * sum(w**2) / 2
reg = tf.contrib.layers.l2_regularizer(scale=0.01)

def length(shuju):
    # infer each sequence's true length from the zero padding: a timestep
    # counts as real if any of its 300 dimensions is non-zero
    return tf.reduce_sum(tf.sign(tf.reduce_max(tf.abs(shuju), axis=2)), axis=1)
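A toy check of what length() computes (illustrative values only):

# 1 sample, 3 timesteps, 2 dims; only the first timestep is non-zero
demo = tf.constant([[[1.0, 2.0], [0.0, 0.0], [0.0, 0.0]]])
with tf.Session() as s:
    print(s.run(length(demo)))  # -> [1.]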
# a single GRU cell with dropout applied to its outputs; the near-zero
# truncated-normal initializers keep early training gentle
cell = tf.contrib.rnn.DropoutWrapper(
    tf.contrib.rnn.GRUCell(n_hidden,
                           kernel_initializer=tf.truncated_normal_initializer(stddev=0.0001),
                           bias_initializer=tf.truncated_normal_initializer(stddev=0.0001)),
    output_keep_prob=output_keep_prob)
# unroll the GRU over time; sequence_length stops the recurrence at each
# sequence's true length, so outputs past it are zeros
output, _ = tf.nn.dynamic_rnn(
    cell,
    x,
    dtype=tf.float32,
    sequence_length=length(x))
output.get_shape()
TensorShape([Dimension(None), Dimension(36), Dimension(128)])
# pick out the output at the last real (non-padded) timestep of each sequence:
# flatten (batch, steps, hidden) to (batch*steps, hidden), then gather row
# batch_i * n_steps + (length_i - 1) for each sample
index = tf.range(0, batch_size) * n_steps + (tf.cast(length(x), tf.int32) - 1)
flat = tf.reshape(output, [-1, int(output.get_shape()[2])])
last = tf.gather(flat, index)
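A NumPy toy check of the flat-index trick (values are illustrative):

outs = np.arange(6).reshape(2, 3, 1)      # fake outputs: batch 2, 3 steps, 1 unit
lengths = np.array([2, 3])                # true lengths of the two sequences
idx = np.arange(2) * 3 + (lengths - 1)    # -> [1, 5]
print(outs.reshape(-1, 1)[idx].ravel())   # -> [1 5], the last real output of each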
# a 64-unit ReLU layer on top of the last RNN output, followed by dropout
fc_1 = tf.contrib.layers.fully_connected(
    last,
    64,
    weights_initializer=tf.truncated_normal_initializer(stddev=0.01),
    activation_fn=tf.nn.relu)
keep_prob = tf.placeholder(tf.float32)
fc1_drop = tf.nn.dropout(fc_1, keep_prob)
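The same layer with the core (non-contrib) API would look like this sketch (`fc_1_alt` is a hypothetical name; note the keyword there is `kernel_initializer`):

fc_1_alt = tf.layers.dense(
    last, 64, activation=tf.nn.relu,
    kernel_initializer=tf.truncated_normal_initializer(stddev=0.01))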
# final softmax layer over the three sentiment classes
weight = tf.Variable(tf.truncated_normal([64, n_classes], stddev=0.001))
bias = tf.Variable(tf.constant(0.1, shape=[n_classes]))
prediction = tf.nn.softmax(tf.matmul(fc1_drop, weight) + bias)
# total cross-entropy over the batch; note tf.log(softmax) can underflow
# numerically (see the logits-based variant below)
cross_entropy = -tf.reduce_sum(y * tf.log(prediction))
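A numerically safer formulation (a sketch, not what the run above used) feeds raw logits to TensorFlow's fused op:

logits = tf.matmul(fc1_drop, weight) + bias
cross_entropy_stable = tf.reduce_sum(
    tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits))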
# collect every trainable variable and register its L2 penalty; the call both
# returns the summed penalty and adds it to GraphKeys.REGULARIZATION_LOSSES
weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
tf.contrib.layers.apply_regularization(reg, weights_list=weights)
<tf.Tensor 'get_regularization_penalty:0' shape=() dtype=float32>
# fetch the penalty registered above so it can be added to the training loss
reg_ws = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
optimizer = tf.train.AdamOptimizer(learning_rate, beta1=0.9)
# loss = cross-entropy + L2 penalty; clip each gradient tensor's norm at 5
grads = optimizer.compute_gradients(cross_entropy + tf.reduce_sum(reg_ws))
for i, (g, v) in enumerate(grads):
    if g is not None:
        grads[i] = (tf.clip_by_norm(g, 5), v)
train_op = optimizer.apply_gradients(grads)
/anaconda3/envs/py35/lib/python3.5/site-packages/tensorflow/python/ops/gradients_impl.py:97: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


WARNING:tensorflow:From /anaconda3/envs/py35/lib/python3.5/site-packages/tensorflow/python/ops/clip_ops.py:110: calling reduce_sum (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
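Per-tensor clipping is one choice; a global-norm variant (a sketch, `train_op_alt` is a hypothetical name) scales all gradients jointly instead:

grads_and_vars = optimizer.compute_gradients(cross_entropy + tf.reduce_sum(reg_ws))
gs, vs = zip(*[(g, v) for g, v in grads_and_vars if g is not None])
clipped, _ = tf.clip_by_global_norm(list(gs), 5.0)
train_op_alt = optimizer.apply_gradients(zip(clipped, vs))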
# fraction of examples whose argmax prediction matches the one-hot label
correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

def generatebatch(X, Y, n_examples, batch_size):
    # yield full batches only; a remainder smaller than batch_size is dropped,
    # which matches the batch_size hardcoded into the gather index above
    for batch_i in range(n_examples // batch_size):
        start = batch_i * batch_size
        end = start + batch_size
        batch_xs = X[start:end]
        batch_ys = Y[start:end]
        yield batch_xs, batch_ys
sess = tf.Session()

init = tf.global_variables_initializer()
sess.run(init)
saver = tf.train.Saver()
for step in range(18):
    # reshuffle the training set each epoch
    index_ = np.random.permutation(len(dataset_train))
    dataset_train = dataset_train[index_]
    label_train = label_train[index_]
    for batch_x, batch_y in generatebatch(dataset_train, label_train, len(label_train), batch_size):
        sess.run(train_op, feed_dict={x:batch_x, y:batch_y, keep_prob:0.5, output_keep_prob:0.5})
    # note: accuracy/loss are measured on the last mini-batch of the epoch only
    acc = sess.run(accuracy, feed_dict={x:batch_x, y:batch_y, keep_prob:1, output_keep_prob:1})
    loss = sess.run(cross_entropy, feed_dict={x:batch_x, y:batch_y, keep_prob:1, output_keep_prob:1})
    saver.save(sess, './lesson0', global_step=step)
    print("Iter" + str(step) + "MiniBatch Loss =" + "{:.6f}".format(loss) + ", Training Accuracy = " + "{:.5f}".format(acc))
print("Optimization Finished!")
Iter0MiniBatch Loss =214.256958, Training Accuracy = 0.66667
Iter1MiniBatch Loss =173.106171, Training Accuracy = 0.76333
Iter2MiniBatch Loss =163.925598, Training Accuracy = 0.80333
Iter3MiniBatch Loss =158.836716, Training Accuracy = 0.77667
Iter4MiniBatch Loss =155.008820, Training Accuracy = 0.79667
Iter5MiniBatch Loss =131.040298, Training Accuracy = 0.83667
Iter6MiniBatch Loss =133.507889, Training Accuracy = 0.80667
Iter7MiniBatch Loss =114.443909, Training Accuracy = 0.86333
Iter8MiniBatch Loss =103.080223, Training Accuracy = 0.86333
Iter9MiniBatch Loss =99.932602, Training Accuracy = 0.90000
Iter10MiniBatch Loss =93.207428, Training Accuracy = 0.86000
Iter11MiniBatch Loss =67.471329, Training Accuracy = 0.93000
Iter12MiniBatch Loss =62.449608, Training Accuracy = 0.92333
Iter13MiniBatch Loss =50.676277, Training Accuracy = 0.93000
Iter14MiniBatch Loss =55.832417, Training Accuracy = 0.92333
Iter15MiniBatch Loss =44.194443, Training Accuracy = 0.96333
Iter16MiniBatch Loss =30.585236, Training Accuracy = 0.95667
Iter17MiniBatch Loss =48.206429, Training Accuracy = 0.94333
Optimization Finished!
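The held-out split is never evaluated above. A minimal evaluation sketch (names like `accs` are assumptions): the gather index in the graph assumes batches of exactly batch_size, so generatebatch is reused and the remainder (1809 % 300 = 9 examples) is dropped:

accs = []
for bx, by in generatebatch(dataset_test, label_test, len(label_test), batch_size):
    # no dropout at evaluation time: both keep-probabilities are 1
    accs.append(sess.run(accuracy, feed_dict={x: bx, y: by, keep_prob: 1, output_keep_prob: 1}))
print("Test Accuracy = {:.5f}".format(np.mean(accs)))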