github: github.com/yangjinghit…
import pandas as pd
import numpy as np
复制代码
data = pd.read_csv('Tweets.csv')
复制代码
data.head(2)
复制代码
| tweet_id | airline_sentiment | airline_sentiment_confidence | negativereason | negativereason_confidence | airline | airline_sentiment_gold | name | negativereason_gold | retweet_count | text | tweet_coord | tweet_created | tweet_location | user_timezone |
---|
0 | 570306133677760513 | neutral | 1.0000 | NaN | NaN | Virgin America | NaN | cairdin | NaN | 0 | @VirginAmerica What @dhepburn said. | NaN | 2015-02-24 11:35:52 -0800 | NaN | Eastern Time (US & Canada) |
---|
1 | 570301130888122368 | positive | 0.3486 | NaN | 0.0 | Virgin America | NaN | jnardino | NaN | 0 | @VirginAmerica plus you've added commercials t... | NaN | 2015-02-24 11:15:59 -0800 | NaN | Pacific Time (US & Canada) |
---|
data = data[['airline_sentiment', 'text']]
复制代码
# Dump the tweets, one per line, into the plain-text corpus used to train
# word2vec below.
# Mode 'w' (not 'a'): with append mode every re-run of this cell adds another
# full copy of the tweets to the file and silently skews the corpus.
with open('twee', 'w', encoding='utf-8') as f:
    # writelines() is handed an iterable of complete lines; passing it a
    # single str (as before) makes it iterate character by character.
    f.writelines(string + '\n' for string in data.text)
复制代码
from gensim.models import word2vec
复制代码
# The corpus file holds one tweet per line, so read it with LineSentence
# (one sentence per line, whitespace-separated tokens). Text8Corpus treats the
# file as a single unbroken word stream cut into fixed-size chunks, which lets
# context windows run across unrelated tweets.
sentences = word2vec.LineSentence("twee")
# 300-dimensional embeddings; `size` is the keyword this gensim release uses.
model = word2vec.Word2Vec(sentences, size=300)
复制代码
word_vectors = model.wv
del model
复制代码
data['vec'] = data.text.apply(lambda x : [word_vectors[w] for w in x.split() if w in word_vectors])
复制代码
data = data[data['vec'].apply(lambda x : len(x)>5)]
复制代码
data.head(3)
复制代码
| airline_sentiment | text | vec |
---|
1 | positive | @VirginAmerica plus you've added commercials t... | [[2.2402475, 0.15890086, -0.082046695, 0.80472... |
---|
2 | neutral | @VirginAmerica I didn't today... Must mean I n... | [[2.2402475, 0.15890086, -0.082046695, 0.80472... |
---|
3 | negative | @VirginAmerica it's really aggressive to blast... | [[2.2402475, 0.15890086, -0.082046695, 0.80472... |
---|
del data['text']
复制代码
data.airline_sentiment.unique()
复制代码
array(['positive', 'neutral', 'negative'], dtype=object)
复制代码
data.airline_sentiment.value_counts()
复制代码
negative 9007
neutral 2789
positive 2013
Name: airline_sentiment, dtype: int64
复制代码
dic = {'neutral':np.array([1,0,0]), 'positive':np.array([0,1,0]), 'negative':np.array([0,0,1])}
复制代码
data['cat'] = data.airline_sentiment.map(dic)
复制代码
del data['airline_sentiment']
复制代码
data.columns
复制代码
Index(['vec', 'cat'], dtype='object')
复制代码
# Renumber the rows 0..n-1 after the filtering above. drop=True discards the
# old index directly instead of turning it into an 'index' column that then
# has to be deleted again.
data = data.reset_index(drop=True)
复制代码
maxlength = max(len(x) for x in data.vec)
复制代码
maxlength
复制代码
36
复制代码
data.head(2)
复制代码
| vec | cat |
---|
0 | [[2.2402475, 0.15890086, -0.082046695, 0.80472... | [0, 1, 0] |
---|
1 | [[2.2402475, 0.15890086, -0.082046695, 0.80472... | [1, 0, 0] |
---|
def pad(x, length=None, dim=300):
    """Zero-pad a sequence of word vectors to a fixed number of time steps.

    Parameters
    ----------
    x : sequence of vectors
        The word vectors of one sample, each of size ``dim``.
    length : int, optional
        Number of rows in the result. Defaults to the module-level
        ``maxlength`` when omitted.
    dim : int, optional
        Size of each vector (300 by default).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(length, dim)`` whose first ``len(x)`` rows are ``x``
        and whose remaining rows are zeros.

    Raises
    ------
    ValueError
        If ``x`` has more than ``length`` rows or its rows are not of size
        ``dim`` (raised by NumPy on the slice assignment).
    """
    if length is None:
        length = maxlength
    xl = np.zeros((length, dim))
    # An empty sequence cannot be broadcast into a (0, dim) slice, so leave
    # the all-zero array untouched in that case.
    if len(x):
        xl[:len(x)] = x
    return xl
复制代码
dataset = data.vec.apply(pad)
复制代码
dataset.head(2)
复制代码
0 [[2.2402474880218506, 0.15890085697174072, -0....
1 [[2.2402474880218506, 0.15890085697174072, -0....
Name: vec, dtype: object
复制代码
len(dataset)
复制代码
13809
复制代码
labels = np.concatenate(data.cat).reshape(len(data.cat), -1)
复制代码
np.shape(labels)
复制代码
(13809, 3)
复制代码
data_ = np.concatenate(dataset).reshape(len(dataset), maxlength, 300)
复制代码
np.shape(data_)
复制代码
(13809, 36, 300)
复制代码
index = np.random.permutation(int(len(data)))
复制代码
label = labels[index]
dataset = data_[index]
复制代码
label_train = label[:12000]
dataset_train = dataset[:12000]
label_test = label[12000:]
dataset_test = dataset[12000:]
复制代码
import tensorflow as tf
复制代码
/anaconda3/envs/py35/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: compiletime version 3.6 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.5
return f(*args, **kwds)
/anaconda3/envs/py35/lib/python3.5/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
from ._conv import register_converters as _register_converters
复制代码
learning_rate = 0.005
batch_size = 300
n_input = 300
n_steps = maxlength
n_hidden = 128
n_classes = 3
复制代码
x = tf.placeholder(tf.float32, [None, n_steps, n_input])
y = tf.placeholder(tf.float32, [None, n_classes])
output_keep_prob = tf.placeholder("float")
复制代码
reg = tf.contrib.layers.l2_regularizer(scale=0.01)
复制代码
def length(shuju):
    """Count the non-zero time steps of every sequence in a batch.

    A time step is counted when at least one of its features is non-zero, so
    all-zero (padding) steps contribute nothing.

    Parameters
    ----------
    shuju : tf.Tensor
        Batch of sequences; reduced over axis 2 (features) and then axis 1
        (time steps).

    Returns
    -------
    tf.Tensor
        One count per sequence, in the dtype of ``shuju``.
    """
    # sign(max |feature|) is 1 for a step with any non-zero feature, else 0.
    # `axis` replaces the deprecated `reduction_indices` alias.
    used = tf.sign(tf.reduce_max(tf.abs(shuju), axis=2))
    return tf.reduce_sum(used, axis=1)
复制代码
# Recurrent cell: a GRU with n_hidden units whose kernel and bias start from
# truncated-normal values with a very small spread (stddev 1e-4).
gru = tf.contrib.rnn.GRUCell(
    n_hidden,
    kernel_initializer=tf.truncated_normal_initializer(stddev=0.0001),
    bias_initializer=tf.truncated_normal_initializer(stddev=0.0001))
# Dropout is applied to the cell outputs only; the keep probability is fed at
# run time through the `output_keep_prob` placeholder.
cell = tf.contrib.rnn.DropoutWrapper(gru, output_keep_prob=output_keep_prob)
复制代码
output, _ = tf.nn.dynamic_rnn(
cell,
x,
dtype=tf.float32,
sequence_length= length(x))
复制代码
output.get_shape()
复制代码
TensorShape([Dimension(None), Dimension(36), Dimension(128)])
复制代码
# Select, for every sequence, the RNN output at its last non-padded step.
# The outputs are flattened to (batch * n_steps, hidden) and row
# i * n_steps + (len_i - 1) is gathered for sample i.
# The batch dimension is read from the tensor at run time instead of the
# `batch_size` constant, so the graph also works for batches of any other
# size (e.g. when evaluating the whole test split at once).
seq_len = tf.cast(length(x), tf.int32)
index = tf.range(0, tf.shape(output)[0]) * n_steps + (seq_len - 1)
flat = tf.reshape(output, [-1, int(output.get_shape()[2])])
last = tf.gather(flat, index)
复制代码
fc_1 = tf.contrib.layers.fully_connected(
last,
64,
weights_initializer = tf.truncated_normal_initializer(stddev=0.01),
activation_fn = tf.nn.relu)
keep_prob = tf.placeholder("float")
fc1_drop = tf.nn.dropout(fc_1, keep_prob)
复制代码
weight = tf.Variable(tf.truncated_normal([64, n_classes],stddev=0.001))
bias = tf.Variable(tf.constant(0.1, shape=[n_classes]))
prediction = tf.nn.softmax(tf.matmul(fc1_drop, weight) + bias)
复制代码
# Cross-entropy summed over the batch. The probabilities are clipped away from
# zero first: if a softmax output underflows to exactly 0, log(0) = -inf and
# 0 * -inf = NaN would poison the loss and every gradient.
cross_entropy = -tf.reduce_sum(y * tf.log(tf.clip_by_value(prediction, 1e-10, 1.0)))
复制代码
weights = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES)
复制代码
tf.contrib.layers.apply_regularization(reg, weights_list=weights)
复制代码
<tf.Tensor 'get_regularization_penalty:0' shape=() dtype=float32>
复制代码
reg_ws = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
复制代码
# Adam on (cross-entropy + summed regularization losses).
optimizer = tf.train.AdamOptimizer(learning_rate, beta1=0.9)
grads = optimizer.compute_gradients(cross_entropy + tf.reduce_sum(reg_ws))
# Clip each gradient to an L2 norm of at most 5 to keep updates bounded.
# Variables that do not influence the loss have a None gradient and are
# passed through unchanged.
grads = [(g, v) if g is None else (tf.clip_by_norm(g, 5), v) for g, v in grads]
train_op = optimizer.apply_gradients(grads)
复制代码
/anaconda3/envs/py35/lib/python3.5/site-packages/tensorflow/python/ops/gradients_impl.py:97: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
"Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
WARNING:tensorflow:From /anaconda3/envs/py35/lib/python3.5/site-packages/tensorflow/python/ops/clip_ops.py:110: calling reduce_sum (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
复制代码
correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
复制代码
def generatebatch(X, Y, n_examples, batch_size):
    """Yield consecutive, aligned mini-batches of ``X`` and ``Y``.

    Parameters
    ----------
    X, Y : sliceable sequences
        Samples and their labels; both are sliced with the same bounds so the
        pairs stay aligned.
    n_examples : int
        Number of leading examples to iterate over.
    batch_size : int
        Number of examples per batch.

    Yields
    ------
    tuple
        ``(X[start:end], Y[start:end])`` for each full batch. A trailing
        partial batch (``n_examples % batch_size`` examples) is dropped, so
        every yielded batch has exactly ``batch_size`` items.
    """
    n_batches = n_examples // batch_size
    for start in range(0, n_batches * batch_size, batch_size):
        end = start + batch_size
        yield X[start:end], Y[start:end]
复制代码
sess = tf.Session()
复制代码
init = tf.global_variables_initializer()
sess.run(init)
复制代码
saver = tf.train.Saver()
复制代码
# Train for 18 epochs, saving a checkpoint and printing metrics after each.
for step in range(18):
    # Reshuffle the training set every epoch; the same permutation is applied
    # to samples and labels so they stay aligned.
    index_ = np.random.permutation(int(len(dataset_train)))
    dataset_train = dataset_train[index_]
    label_train = label_train[index_]
    for batch_x, batch_y in generatebatch(dataset_train, label_train, len(label_train), batch_size):
        # Both dropout layers keep 50% of their units while training.
        sess.run(train_op, feed_dict={x: batch_x, y: batch_y, keep_prob: 0.5, output_keep_prob: 0.5})
    # Metrics are measured on the last mini-batch of the epoch with dropout
    # switched off. Accuracy and loss come from a single run so the forward
    # pass is not computed twice.
    acc, loss = sess.run(
        [accuracy, cross_entropy],
        feed_dict={x: batch_x, y: batch_y, keep_prob: 1, output_keep_prob: 1})
    saver.save(sess, './lesson0', global_step=step)
    print("Iter" + str(step) + "MiniBatch Loss =" + "{:.6f}".format(loss) + ", Training Accuracy = " + "{:.5f}".format(acc))
print("Optimization Finished!")
复制代码
Iter0MiniBatch Loss =214.256958, Training Accuracy = 0.66667
Iter1MiniBatch Loss =173.106171, Training Accuracy = 0.76333
Iter2MiniBatch Loss =163.925598, Training Accuracy = 0.80333
Iter3MiniBatch Loss =158.836716, Training Accuracy = 0.77667
Iter4MiniBatch Loss =155.008820, Training Accuracy = 0.79667
Iter5MiniBatch Loss =131.040298, Training Accuracy = 0.83667
Iter6MiniBatch Loss =133.507889, Training Accuracy = 0.80667
Iter7MiniBatch Loss =114.443909, Training Accuracy = 0.86333
Iter8MiniBatch Loss =103.080223, Training Accuracy = 0.86333
Iter9MiniBatch Loss =99.932602, Training Accuracy = 0.90000
Iter10MiniBatch Loss =93.207428, Training Accuracy = 0.86000
Iter11MiniBatch Loss =67.471329, Training Accuracy = 0.93000
Iter12MiniBatch Loss =62.449608, Training Accuracy = 0.92333
Iter13MiniBatch Loss =50.676277, Training Accuracy = 0.93000
Iter14MiniBatch Loss =55.832417, Training Accuracy = 0.92333
Iter15MiniBatch Loss =44.194443, Training Accuracy = 0.96333
Iter16MiniBatch Loss =30.585236, Training Accuracy = 0.95667
Iter17MiniBatch Loss =48.206429, Training Accuracy = 0.94333
Optimization Finished!
复制代码