# Load the raw data. Each file is read in one go as a single string;
# individual reviews / labels are separated by '\n' inside that string.
with open('reviews.txt', 'r') as f:
    reviews = f.read()
with open('labels.txt', 'r') as f:
    labels = f.read()
# 每一个 \n 表示一条review
reviews[:2000]
'bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life such as teachers . my years in the teaching profession lead me to believe that bromwell high s satire is much closer to reality than is teachers . the scramble to survive financially the insightful students who can see right through their pathetic teachers pomp the pettiness of the whole situation all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn t \nstory of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is turned into an insane violent mob by the crazy chantings of it s singers . unfortunately it stays absurd the whole time with no general narrative eventually making it just too off putting . even those from the era should be turned off . the cryptic dialogue would make shakespeare seem easy to a third grader . on a technical level it s better than you might think with some good cinematography by future great vilmos zsigmond . future stars sally kirkland and frederic forrest can be seen briefly . \nhomelessness or houselessness as george carlin stated has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school work or vote for the matter . most people think of the homeless as just a lost cause while worrying about things such as racism the war on iraq pressuring kids to succeed technology the elections inflation or worrying if they ll be next to end up on the streets . br br but what if y'
from string import punctuation
# Remove every punctuation character in a single pass over the text.
all_text = reviews.translate(str.maketrans('', '', punctuation))
# Each '\n' separates one review from the next.
reviews = all_text.split('\n')
# Re-join the reviews with spaces so the whole corpus is one long string.
all_text = ' '.join(reviews)
# Whitespace-split the corpus to get the flat list of all words.
words = all_text.split()
all_text[:2000]
'bromwell high is a cartoon comedy it ran at the same time as some other programs about school life such as teachers my years in the teaching profession lead me to believe that bromwell high s satire is much closer to reality than is teachers the scramble to survive financially the insightful students who can see right through their pathetic teachers pomp the pettiness of the whole situation all remind me of the schools i knew and their students when i saw the episode in which a student repeatedly tried to burn down the school i immediately recalled at high a classic line inspector i m here to sack one of your teachers student welcome to bromwell high i expect that many adults of my age think that bromwell high is far fetched what a pity that it isn t story of a man who has unnatural feelings for a pig starts out with a opening scene that is a terrific example of absurd comedy a formal orchestra audience is turned into an insane violent mob by the crazy chantings of it s singers unfortunately it stays absurd the whole time with no general narrative eventually making it just too off putting even those from the era should be turned off the cryptic dialogue would make shakespeare seem easy to a third grader on a technical level it s better than you might think with some good cinematography by future great vilmos zsigmond future stars sally kirkland and frederic forrest can be seen briefly homelessness or houselessness as george carlin stated has been an issue for years but never a plan to help those on the street that were once considered human who did everything from going to school work or vote for the matter most people think of the homeless as just a lost cause while worrying about things such as racism the war on iraq pressuring kids to succeed technology the elections inflation or worrying if they ll be next to end up on the streets br br but what if you were given a bet to live on the st'
from collections import Counter
def get_vocab_to_int(words):
    """Build a word -> integer id mapping ordered by descending frequency.

    Ids start at 1, so the most frequent word gets id 1; 0 is deliberately
    left unused so it can serve a special purpose elsewhere.

    Args:
        words: iterable of word tokens (hashable), with repetitions.

    Returns:
        dict mapping each distinct word to its integer id.
    """
    # Count how many times each word occurs.
    counts = Counter(words)
    # Order the distinct words from most to least frequent.
    vocab = sorted(counts, key=counts.get, reverse=True)
    # Label each word with its 1-based rank; the network sees words as
    # these integers rather than as strings.
    vocab_to_int = {word: i for i, word in enumerate(vocab, 1)}
    return vocab_to_int
def get_reviews_ints(vocab_to_int, reviews):
    """Convert each review from text into a list of integer word ids.

    Every review is whitespace-split and each word is looked up in
    ``vocab_to_int``; e.g. "I love this movie" might become
    [5, 36, 45, 12354].

    Args:
        vocab_to_int: dict mapping word -> integer id.
        reviews: iterable of review strings.

    Returns:
        list with one list of ids per review, in the same order as
        ``reviews``. A review with no words yields an empty list.

    Raises:
        KeyError: if a review contains a word missing from ``vocab_to_int``.
    """
    reviews_ints = []
    for each in reviews:
        reviews_ints.append([vocab_to_int[word] for word in each.split()])
    return reviews_ints
# Encode the labels as integers: 'negative' -> 0, everything else -> 1.
# NOTE(review): any line that is not exactly 'negative' (including an empty
# string produced by a trailing '\n') is encoded as 1.
labels = np.array([0 if label == 'negative' else 1 for label in labels.split('\n')])
Step 4 清理垃圾数据
reviews_ints 中存在长度为 0 的数据(很可能是文件末尾的换行符经 split('\n') 后留下的空字符串,有待确认)。这类数据没有意义,需要清除。
同时,最长的 review 有 2514 个单词,这对我们的网络而言太长了,需要截断一部分。
# Histogram of review lengths: length (number of word ids) -> number of reviews.
review_lens = Counter(map(len, reviews_ints))
# A Counter returns 0 for an absent key, so this works even with no empty reviews.
print('Zero-length reviews:{}'.format(review_lens[0]))
# The keys of the histogram are the lengths, so the largest key is the longest review.
print("Maximum review length: {}".format(max(review_lens.keys())))
Zero-length reviews:1
Maximum review length: 2514
# Collect the positions of all reviews that contain at least one word id.
non_zeros_idx = [pos for pos, tokens in enumerate(reviews_ints) if len(tokens) > 0]
len(non_zeros_idx)
25000
# Keep only the non-empty reviews, and keep the labels at the same positions
# so that reviews_ints and labels remain aligned index-for-index.
reviews_ints = [reviews_ints[pos] for pos in non_zeros_idx]
labels = np.array([labels[pos] for pos in non_zeros_idx])
def get_batches(x, y, batch_size=100):
    """Yield consecutive (x, y) mini-batches of exactly ``batch_size`` items.

    Only full batches are produced: trailing items that do not fill a
    complete batch are dropped from both ``x`` and ``y``.

    Args:
        x: sliceable sequence of inputs (supports ``len`` and slicing).
        y: sliceable sequence of targets, sliced with the same bounds as ``x``.
        batch_size: number of items per batch.

    Yields:
        Tuples ``(x_batch, y_batch)`` of matching slices.
    """
    n_batches = len(x) // batch_size
    # Truncate to a whole number of batches so every batch has the same size.
    x, y = x[:n_batches * batch_size], y[:n_batches * batch_size]
    for ii in range(0, len(x), batch_size):
        yield x[ii:ii + batch_size], y[ii:ii + batch_size]
训练
epochs = 10

# Persistence: create a saver so the trained model can be written to disk.
with graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=graph) as sess:
    tf.global_variables_initializer().run()
    # Global step counter across all epochs; drives the logging cadence below.
    iteration = 1
    for e in range(epochs):
        # Start every epoch from the network's initial recurrent state.
        state = sess.run(initial_state)

        for ii, (x, y) in enumerate(get_batches(train_x, train_y, batch_size), 1):
            feed = {inputs_: x,
                    labels_: y[:, None],  # add a trailing axis to the labels
                    keep_prob: 0.5,
                    initial_state: state}
            # Run one optimisation step and carry the final state into the
            # next batch.
            loss, state, _ = sess.run([cost, final_state, optimizer], feed_dict=feed)

            # Report the training loss every 5 iterations.
            if iteration % 5 == 0:
                print('Epoch: {}/{}'.format(e, epochs),
                      'Iteration: {}'.format(iteration),
                      'Train loss: {}'.format(loss))

            # Evaluate on the validation set every 25 iterations.
            if iteration % 25 == 0:
                val_acc = []
                val_state = sess.run(cell.zero_state(batch_size, tf.float32))
                for x, y in get_batches(val_x, val_y, batch_size):
                    feed = {inputs_: x,
                            labels_: y[:, None],
                            keep_prob: 1,  # keep_prob is 1 for evaluation, 0.5 for training
                            initial_state: val_state}
                    batch_acc, val_state = sess.run([accuracy, final_state], feed_dict=feed)
                    val_acc.append(batch_acc)
                # Mean of the per-batch validation accuracies.
                print('Val acc: {:.3f}'.format(np.mean(val_acc)))
            iteration += 1
    # NOTE(review): the original indentation was lost; saving once after all
    # epochs (while the session is still open) is assumed here -- confirm.
    saver.save(sess, "checkpoints/sentiment.ckpt")