TensorFlow RNN + word2vec: Movie Review Sentiment Classification

import pandas as pd
import re
import numpy as np
import os
from gensim.models import word2vec
data_t = pd.read_csv('labeledTrainData.tsv', sep='\t')  # 25,000 labeled IMDB reviews



data_t.shape
(25000, 3)
if not os.path.exists('mymodel'):
    if not os.path.exists('imdb_text'):
        # build a plain-text corpus from the unlabeled reviews:
        # keep only alphabetic tokens, lowercased, one review per line
        data_un = pd.read_csv('unlabeledTrainData.tsv', header=0, delimiter="\t", quoting=3)
        pat = re.compile(r'[A-Za-z]+')
        with open('imdb_text', 'a', encoding='utf-8') as f:
            for rev in data_un.review:
                str_list = pat.findall(rev)
                str_list = [x.lower() for x in str_list]
                string = ' '.join(str_list)
                f.write(string + '\n')
            del data_un
    sentences = word2vec.Text8Corpus("imdb_text")  # load the corpus
    model = word2vec.Word2Vec(sentences, size=50)  # train 50-dimensional word vectors (gensim defaults: CBOW, window=5)
    model.save('mymodel')
else:
    model = word2vec.Word2Vec.load('mymodel')

word_vectors = model.wv  # keep only the embedding lookup table
del model
word_vectors

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x1a24968da0>
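
Before wiring the vectors into the model, a quick sanity check on the trained embeddings can help (a minimal sketch; 'great' is just an example query word, not part of the original pipeline):

word_vectors['great'].shape                   # each word maps to a 50-dimensional vector: (50,)
word_vectors.most_similar('great', topn=5)    # nearest neighbours in embedding space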
# map each review to a list of 50-d word vectors, skipping out-of-vocabulary tokens
data_t['vec'] = data_t.review.apply(lambda x: [word_vectors[w] for w in x.split() if w in word_vectors])
# free memory: the raw text and the full embedding table are no longer needed
del data_t['review']
del word_vectors


import gc
gc.collect()
14
# drop reviews where no token matched the embedding vocabulary
data_t = data_t[data_t['vec'].apply(lambda x: len(x) > 0)]
data_t.sentiment.value_counts()
0    12499
1    12495
Name: sentiment, dtype: int64
maxlength = max([len(x) for x in data_t.vec])  # longest review, in word vectors
maxlength
1622
sum(data_t.vec.apply(len) > 300)  # reviews longer than 300 tokens will be truncated

3246
def pad(x):
    # fix every review to exactly 300 timesteps:
    # truncate long reviews, zero-pad short ones
    if len(x) > 300:
        x1 = x[:300]
    else:
        x1 = np.zeros((300, 50))
        x1[:len(x)] = x
    return x1
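
To see both branches of pad at work, here is a minimal sketch with dummy data (the variable names are illustrative only):

short = [np.ones(50)] * 10      # a 10-word review
long_ = [np.ones(50)] * 400     # a 400-word review
pad(short).shape                # (300, 50): zero rows after the 10 real word vectors
len(pad(long_))                 # 300: truncated to the first 300 word vectors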


data_t['vec'] = data_t.vec.apply(pad)
import tensorflow as tf
/anaconda3/envs/py35/lib/python3.5/importlib/_bootstrap.py:222: RuntimeWarning: compiletime version 3.6 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.5
  return f(*args, **kwds)
/anaconda3/envs/py35/lib/python3.5/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
  from ._conv import register_converters as _register_converters
learning_rate = 0.002
batch_size = 100
n_input = 50     # word-vector dimension
n_steps = 300    # timesteps per review (after padding/truncation)
n_hidden = 300   # GRU hidden units
n_classes = 2    # negative / positive



x = tf.placeholder(tf.float32, [None, n_steps, n_input])  # (batch, timesteps, word-vector dim)
y = tf.placeholder(tf.int64, [None])                      # class labels
keep_prob = tf.placeholder("float")                       # dropout keep probability



def length(shuru):
    # true length of each review: padded timesteps are all-zero vectors, so
    # sign(max(|features|)) is a 0/1 mask whose sum counts the real timesteps
    return tf.reduce_sum(tf.sign(tf.reduce_max(tf.abs(shuru), reduction_indices=2)), reduction_indices=1)
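
The same computation in plain numpy makes the trick concrete (a throwaway example, not part of the pipeline):

demo = np.zeros((1, 300, 50))
demo[0, :7] = 0.3                          # a 7-word review; the rest is padding
mask = np.sign(np.abs(demo).max(axis=2))   # 1 at real words, 0 at padding
mask.sum(axis=1)                           # array([7.])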


# single GRU layer with dropout applied to its outputs
cell = tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.GRUCell(n_hidden),
                output_keep_prob=keep_prob)
# sequence_length stops the RNN at each review's true length;
# outputs past that point are zero
output, _ = tf.nn.dynamic_rnn(
            cell,
            x,
            dtype=tf.float32,
            sequence_length=length(x)
        )
output.get_shape()
TensorShape([Dimension(None), Dimension(300), Dimension(300)])
# pick each sequence's output at its last real timestep (output[:, -1] would
# land on padding): flatten to (batch*steps, hidden), index row i*n_steps + (len_i - 1)
index = tf.range(0, batch_size) * n_steps + (tf.cast(length(x), tf.int32) - 1)
flat = tf.reshape(output, [-1, int(output.get_shape()[2])])
last = tf.gather(flat, index)
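
A small numpy sketch of the same flat-index trick (toy sizes, illustrative names):

lengths = np.array([3, 2])                      # true lengths of two sequences
idx = np.arange(2) * 4 + (lengths - 1)          # flat row indices: [2, 5]
outs = np.arange(2 * 4 * 3).reshape(2, 4, 3)    # fake (batch=2, steps=4, hidden=3) outputs
outs.reshape(-1, 3)[idx]                        # timestep 2 of seq 0, timestep 1 of seq 1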
# linear layer + softmax on the last relevant GRU output
weight = tf.Variable(tf.truncated_normal((n_hidden, n_classes), stddev=0.001))
bias = tf.Variable(tf.constant(0.1, shape=[n_classes]))
com_out = tf.matmul(last, weight) + bias
prediction = tf.nn.softmax(com_out)
# note: the loss takes the raw logits (com_out), not the softmax output
cross_entropy = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=com_out))


optimizer = tf.train.AdamOptimizer(learning_rate)
grads = optimizer.compute_gradients(cross_entropy)
for i, (g, v) in enumerate(grads):
    if g is not None:
        grads[i] = (tf.clip_by_norm(g, 5), v)  # clip gradients
train_op = optimizer.apply_gradients(grads)
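
tf.clip_by_norm rescales any gradient whose L2 norm exceeds 5 back down to norm 5 and leaves smaller gradients untouched; in numpy terms the rule is roughly (an illustrative sketch, not the library source):

def clip_by_norm_np(g, clip_norm=5.0):
    # rescale only when the norm exceeds the cap
    norm = np.linalg.norm(g)
    return g if norm <= clip_norm else g * (clip_norm / norm)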


/anaconda3/envs/py35/lib/python3.5/site-packages/tensorflow/python/ops/gradients_impl.py:97: UserWarning: Converting sparse IndexedSlices to a dense Tensor of unknown shape. This may consume a large amount of memory.
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


WARNING:tensorflow:From /anaconda3/envs/py35/lib/python3.5/site-packages/tensorflow/python/ops/clip_ops.py:110: calling reduce_sum (from tensorflow.python.ops.math_ops) with keep_dims is deprecated and will be removed in a future version.
Instructions for updating:
keep_dims is deprecated, use keepdims instead
correct_pred = tf.equal(tf.argmax(prediction,1), y)
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))


def generatebatch(X, Y, n_examples, batch_size):
    for batch_i in range(n_examples // batch_size):
        start = batch_i * batch_size
        end = start + batch_size
        batch_xs = X[start:end]
        batch_ys = Y[start:end]
        yield batch_xs, batch_ys  # yield one batch at a time


sess = tf.Session()
init = tf.global_variables_initializer() 
sess.run(init)
saver = tf.train.Saver()


for step in range(10):
    # reshuffle the training set each epoch
    index = np.random.permutation(int(len(data_t.vec.values)))
    for batch_x, batch_y in generatebatch(data_t.vec.values[index], data_t.sentiment.values[index], len(data_t.vec.values), batch_size):
        batch_x = np.concatenate(batch_x).reshape(batch_size, 300, 50)
        batch_x = batch_x.astype(np.float32)  # astype returns a copy, so the result must be reassigned
        sess.run(train_op, feed_dict={x: batch_x, y: batch_y, keep_prob: 0.5})
    # loss/accuracy below are measured on the last training batch only
    acc = sess.run(accuracy, feed_dict={x: batch_x, y: batch_y, keep_prob: 1})
    loss = sess.run(cross_entropy, feed_dict={x: batch_x, y: batch_y, keep_prob: 1})
    saver.save(sess, './lesson0', global_step=step)
    print("Iter " + str(step) + ", Minibatch Loss= " + "{}".format(loss) + ", Training Accuracy= " + "{}".format(acc))
print("Optimization Finished!")
Iter 0, Minibatch Loss= 0.3504045009613037, Training Accuracy= 0.8799999952316284
Iter 1, Minibatch Loss= 0.2799288034439087, Training Accuracy= 0.8899999856948853
Iter 2, Minibatch Loss= 0.25252586603164673, Training Accuracy= 0.8700000047683716
Iter 3, Minibatch Loss= 0.2636661231517792, Training Accuracy= 0.9300000071525574
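
With training done, a new review could be scored along these lines (a minimal sketch: the review string is hypothetical, the saved word2vec model must be reloaded since word_vectors was deleted above, and because the index tensor hard-codes batch_size the single review is tiled to a full batch):

wv = word2vec.Word2Vec.load('mymodel').wv                 # reload the embeddings
review = "this movie was surprisingly good"               # hypothetical input
vec = pad([wv[w] for w in review.split() if w in wv])
batch = np.tile(np.asarray(vec), (batch_size, 1, 1)).astype(np.float32)
probs = sess.run(prediction, feed_dict={x: batch, keep_prob: 1})
print(probs[0])                                           # [P(negative), P(positive)]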