Project 2
- 在Project 1 中,我们学习了如何处理和统计reviews
- 基于上一个Project“一个萝卜一个坑”的思想,我们将文本进行了数字化,使得文本可以作为输入进行训练
- 在Project 2中我们将构建一个两层的神经网络,并进行训练
import numpy as np
import sys
import time
import pandas as pd
# Load the raw inputs: both files are parsed as header-less CSV, so `reviews`
# and `labels` are DataFrames whose first column (index 0) holds the values.
reviews, labels = (
    pd.read_csv(path, header=None) for path in ('reviews.txt', 'labels.txt')
)
构建神经网络
- 构建一个两层的神经网络
- 注意:输入层到隐层不使用任何激活函数,只有隐层到输出层才使用激活函数(为什么这么做?后面的Project会提到)
class SentimentNetwork(object):
    """Two-layer neural network that classifies reviews as positive/negative.

    The input layer holds per-word occurrence counts for one review, the
    hidden layer is purely linear (no activation), and the single output
    node uses a sigmoid. Training is plain per-sample gradient descent.
    """

    def __init__(self, reviews, labels, hidden_nodes=10, learning_rate=0.1):
        """Build the vocabulary from the data and initialise the weights.

        Args:
            reviews: DataFrame whose first column holds the review texts.
            labels: DataFrame whose first column holds the labels.
            hidden_nodes: Number of nodes in the hidden layer.
            learning_rate: Step size used for the weight updates.
        """
        # Fixed seed so the random weight initialisation is repeatable.
        np.random.seed(1)
        self.pre_process_data(reviews, labels)
        self.init_network(len(self.review_vocab), hidden_nodes, 1, learning_rate)

    def pre_process_data(self, reviews, labels):
        """Collect every word and label in the data and build ``word2idx``.

        Words are lower-cased because ``update_input_layer`` looks tokens up
        in lower case; storing raw tokens would leave mixed-case entries as
        input nodes that can never be activated.
        """
        review_vocab = set()
        for review in reviews.values:
            review_vocab.update(review[0].lower().split(' '))
        # Sort so index assignment does not depend on set iteration order,
        # which varies between interpreter runs for strings.
        self.review_vocab = sorted(review_vocab)

        # All labels that occur in the data.
        label_vocab = {label[0] for label in labels.values}
        self.label_vocab = sorted(label_vocab, key=str)

        # Give every word its own input-node index.
        self.word2idx = {word: idx for idx, word in enumerate(self.review_vocab)}

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        """Store the layer sizes and create the weight matrices and input layer."""
        self.learning_rate = learning_rate
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Normally distributed weights, std-dev = (fan-in) ** -0.5.
        self.weights_0_1 = np.random.normal(
            0.0, self.input_nodes ** -0.5, (self.input_nodes, self.hidden_nodes)
        )
        self.weights_1_2 = np.random.normal(
            0.0, self.hidden_nodes ** -0.5, (self.hidden_nodes, self.output_nodes)
        )
        # Reused buffer for the current review's word counts.
        self.layer_0 = np.zeros((1, self.input_nodes))

    def update_input_layer(self, review):
        """Write the word counts of ``review`` into ``self.layer_0``.

        Words that are not in the vocabulary are ignored.
        """
        self.layer_0 *= 0
        for word in review.split(' '):
            idx = self.word2idx.get(word.lower())
            if idx is not None:
                # The input value is the number of occurrences of the word.
                self.layer_0[0, idx] += 1

    def sigmoid(self, x):
        """Return the logistic sigmoid of ``x``."""
        return 1 / (1 + np.exp(-x))

    def sigmoid_output_2_derivative(self, output):
        """Return the sigmoid derivative, given the sigmoid's *output*."""
        return output * (1 - output)

    def get_target_for_label(self, label):
        """Map ``'positive'`` to 1 and every other label to 0."""
        if label == 'positive':
            return 1
        else:
            return 0

    def _forward(self, review):
        """Run one forward pass; return ``(hidden_output, final_output)``."""
        self.update_input_layer(review)
        # The hidden layer is linear: its output equals its input.
        layer_1_o = np.dot(self.layer_0, self.weights_0_1)
        layer_2_o = self.sigmoid(np.dot(layer_1_o, self.weights_1_2))
        return layer_1_o, layer_2_o

    def _write_progress(self, i, total, start, correct, verb, phase):
        """Overwrite the current stdout line with progress statistics.

        ``i`` is the zero-based index of the sample just processed, ``verb``
        is the counter caption (e.g. ``'Trained'``) and ``phase`` the accuracy
        caption prefix (e.g. ``'Training'``).
        """
        elapsed_time = float(time.time() - start)
        reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
        sys.stdout.write(
            "\rProgress:" + str(100 * i / float(total))[:4]
            + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5]
            + " #Correct:" + str(correct) + " #" + verb + ":" + str(i + 1)
            + " " + phase + " Accuracy:" + str(correct * 100 / float(i + 1))[:4] + "%"
        )

    def train(self, training_reviews, training_label):
        """Train on each (review, label) pair once, printing progress.

        Both arguments are DataFrames of equal length whose first column
        holds the review text / label respectively.
        """
        assert len(training_reviews) == len(training_label)
        total = len(training_reviews)
        correct_so_far = 0
        start = time.time()

        rows = zip(training_reviews.values, training_label.values)
        for i, (review_row, label_row) in enumerate(rows):
            review = review_row[0]
            label = label_row[0]

            layer_1_o, layer_2_o = self._forward(review)

            # Back-propagate the output error; the hidden layer is linear,
            # so its delta equals its error.
            layer_2_error = layer_2_o - self.get_target_for_label(label)
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2_o)
            layer_1_delta = np.dot(layer_2_delta, self.weights_1_2.T)

            # Weight updates (layer_1_delta was computed from the old weights).
            self.weights_1_2 -= np.dot(layer_1_o.T, layer_2_delta) * self.learning_rate
            self.weights_0_1 -= np.dot(self.layer_0.T, layer_1_delta) * self.learning_rate

            if layer_2_o >= 0.5 and label == 'positive':
                correct_so_far += 1
            elif layer_2_o < 0.5 and label == 'negative':
                correct_so_far += 1

            self._write_progress(i, total, start, correct_so_far, "Trained", "Training")
            # Start a new line every 2500 samples so a history is kept.
            if i % 2500 == 0:
                print("")

    def test(self, testing_reviews, testing_labels):
        """Predict every review and print the running accuracy."""
        assert len(testing_reviews) == len(testing_labels)
        total = len(testing_reviews)
        correct = 0
        start = time.time()

        rows = zip(testing_reviews.values, testing_labels.values)
        for i, (review_row, label_row) in enumerate(rows):
            if self.run(review_row[0]) == label_row[0]:
                correct += 1
            self._write_progress(i, total, start, correct, "Tested", "Testing")

    def run(self, review):
        """Return ``'positive'`` or ``'negative'`` for a single review text."""
        _, layer_2_o = self._forward(review)
        if layer_2_o >= 0.5:
            return 'positive'
        else:
            return 'negative'
# Build a network with the default hyper-parameters (vocabulary taken from all
# reviews) and train it on every review except the last 1000.
mlp = SentimentNetwork(reviews, labels)
mlp.train(reviews[:-1000], labels[:-1000])
Progress:0.0% Speed(reviews/sec):0 #Correct:0 #Trained:1 Training Accuracy:0.0%
Progress:10.4% Speed(reviews/sec):101.5 #Correct:1246 #Trained:2501 Training Accuracy:49.8%
Progress:20.8% Speed(reviews/sec):103.1 #Correct:2496 #Trained:5001 Training Accuracy:49.9%
Progress:31.2% Speed(reviews/sec):103.4 #Correct:3746 #Trained:7501 Training Accuracy:49.9%
Progress:41.6% Speed(reviews/sec):101.3 #Correct:4996 #Trained:10001 Training Accuracy:49.9%
Progress:52.0% Speed(reviews/sec):99.30 #Correct:6246 #Trained:12501 Training Accuracy:49.9%
Progress:62.5% Speed(reviews/sec):98.82 #Correct:7496 #Trained:15001 Training Accuracy:49.9%
Progress:72.9% Speed(reviews/sec):98.57 #Correct:8746 #Trained:17501 Training Accuracy:49.9%
Progress:83.3% Speed(reviews/sec):98.86 #Correct:9996 #Trained:20001 Training Accuracy:49.9%
Progress:93.7% Speed(reviews/sec):99.27 #Correct:11246 #Trained:22501 Training Accuracy:49.9%
Progress:99.9% Speed(reviews/sec):99.44 #Correct:11995 #Trained:24000 Training Accuracy:49.9%
- 从输出的训练信息来看,始终在50%徘徊,也就是说根本没有进展,对于这种情况,我们首先要做的就是试一试调小learning_rate
# Rebuild the network from scratch with a smaller learning rate (0.01) and
# train it on the same slice: every review except the last 1000.
mlp = SentimentNetwork(reviews, labels, learning_rate=0.01)
mlp.train(reviews[:-1000], labels[:-1000])
Progress:0.0% Speed(reviews/sec):0.0 #Correct:0 #Trained:1 Training Accuracy:0.0%
Progress:10.4% Speed(reviews/sec):103.6 #Correct:1329 #Trained:2501 Training Accuracy:53.1%
Progress:20.8% Speed(reviews/sec):103.5 #Correct:2923 #Trained:5001 Training Accuracy:58.4%
Progress:31.2% Speed(reviews/sec):99.96 #Correct:4593 #Trained:7501 Training Accuracy:61.2%
Progress:41.6% Speed(reviews/sec):97.92 #Correct:6341 #Trained:10001 Training Accuracy:63.4%
Progress:52.0% Speed(reviews/sec):97.22 #Correct:8105 #Trained:12501 Training Accuracy:64.8%
Progress:62.5% Speed(reviews/sec):97.64 #Correct:9889 #Trained:15001 Training Accuracy:65.9%
Progress:72.9% Speed(reviews/sec):97.91 #Correct:11671 #Trained:17501 Training Accuracy:66.6%
Progress:83.3% Speed(reviews/sec):97.92 #Correct:13530 #Trained:20001 Training Accuracy:67.6%
Progress:93.7% Speed(reviews/sec):97.92 #Correct:15384 #Trained:22501 Training Accuracy:68.3%
Progress:99.9% Speed(reviews/sec):97.82 #Correct:16531 #Trained:24000 Training Accuracy:68.8%
# Evaluate on the last 1000 reviews, which were excluded from the training slice.
mlp.test(reviews[-1000:], labels[-1000:])
Progress:99.9% Speed(reviews/sec):861.2 #Correct:747 #Tested:1000 Testing Accuracy:74.7%
End Project 2
- 通过训练,我们的神经网络在训练集上达到了约69%的准确率,在测试集上达到了约75%的准确率,至少比随机猜测(50%)的结果要好,不错,这是一个好的开始。
- 准确率只有70%多,可提升的空间还有很大,在下一个Project,我们将分析如何提高准确率