[Watermelon_book] Chapter 5 Neural Networks

Perceptron

Theory of the Perceptron

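In brief: the perceptron outputs $\hat{y}=1$ if $w \cdot x + b \geq 0$ and $\hat{y}=0$ otherwise, and for each training sample $(x, y)$ it applies the update rule

$$w_{i} \leftarrow w_{i} + \eta\,(y-\hat{y})\,x_{i}, \qquad b \leftarrow b + \eta\,(y-\hat{y}),$$

where $\eta$ is the learning rate. This is the rule implemented in the code below.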

Coding the Perceptron

Task

Code for the perceptron

Model (Wrong)

This version is wrong because of a bug in the training process.

import numpy as np

class Perceptron():
    
    def __init__(self, num_of_input, lr=0.01):
        # weights[0] is the bias, weights[1:] are the input weights
        self.weights = np.zeros(num_of_input + 1)
        self.learning_rate = lr
        
    def predict(self, inputs):
        summation = np.dot(self.weights[1:], inputs) + self.weights[0]
        if summation >= 0:
            prediction = 1
        else:
            prediction = 0
        return prediction
    
    def train(self, data_feature, data_label, epoch = 100):
        # data_feature: list of numpy arrays
        # data_label: numpy array
        while (epoch > 0):
            print("epoch is %s" % epoch)
            epoch -= 1
            for sample, label in zip(data_feature, data_label):
                print("sample is ", sample)
                print("prediction", self.predict(sample), "label is", label)
                self.weights[1:] += self.learning_rate*(label - self.predict(sample)) * sample
                # Here is the reason this version is wrong:
                # I shouldn't have written it this way. The prediction should be computed
                # once per sample and the weights and bias updated simultaneously; calling
                # predict() again below uses the already-updated weights, so the bias update
                # is based on a different prediction.
                print("weights are", self.weights[1:])
                self.weights[0] += self.learning_rate*(label - self.predict(sample))
                print("bias is", self.weights[0])
        

# Test of AND function
training_inputs = []
training_inputs.append(np.array([1, 1]))
training_inputs.append(np.array([1, 0]))
training_inputs.append(np.array([0, 1]))
training_inputs.append(np.array([0, 0]))
labels = np.array([1, 0, 0, 0])
perceptron = Perceptron(2)
perceptron.train(training_inputs, labels, epoch = 2)
epoch is 2
sample is  [1 1]
prediction 1 label is 1
weights are [0. 0.]
bias is 0.0
sample is  [1 0]
prediction 1 label is 0
weights are [-0.01  0.  ]
bias is 0.0
sample is  [0 1]
prediction 1 label is 0
weights are [-0.01 -0.01]
bias is 0.0
sample is  [0 0]
prediction 1 label is 0
weights are [-0.01 -0.01]
bias is -0.01
epoch is 1
sample is  [1 1]
prediction 0 label is 1
weights are [0. 0.]
bias is 0.0
sample is  [1 0]
prediction 1 label is 0
weights are [-0.01  0.  ]
bias is 0.0
sample is  [0 1]
prediction 1 label is 0
weights are [-0.01 -0.01]
bias is 0.0
sample is  [0 0]
prediction 1 label is 0
weights are [-0.01 -0.01]
bias is -0.01

Model (Right)

import numpy as np

class Perceptron():
    
    def __init__(self, num_of_input, lr=0.01):
        self.weights = np.zeros(num_of_input + 1)
        self.learning_rate = lr
        
    def predict(self, inputs):
        summation = np.dot(self.weights[1:], inputs) + self.weights[0]
        if summation >= 0:
            prediction = 1
        else:
            prediction = 0
        return prediction
    
    def train(self, data_feature, data_label, epoch = 100):
        # data_feature: list of numpy arrays
        # data_label: numpy array
        while (epoch > 0):
            epoch -= 1
            for sample, label in zip(data_feature, data_label):
                # compute the prediction once, then update weights and bias together
                prediction = self.predict(sample)
                self.weights[1:] += self.learning_rate*(label - prediction) * sample
                self.weights[0] += self.learning_rate*(label - prediction)
        

Test of AND function

training_inputs = []
training_inputs.append(np.array([1, 1]))
training_inputs.append(np.array([1, 0]))
training_inputs.append(np.array([0, 1]))
training_inputs.append(np.array([0, 0]))
labels = np.array([1, 0, 0, 0])
perceptron = Perceptron(2)
perceptron.train(training_inputs, labels, epoch = 10)
perceptron.weights
array([-0.03,  0.01,  0.02])
test_s = np.array([1, 1])
perceptron.predict(test_s) 
1
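As a quick sanity check against the learned parameters printed above (bias $-0.03$, weights $0.01$ and $0.02$): for $[1, 1]$ the summation is $0.01 + 0.02 - 0.03 = 0 \geq 0$, so the output is 1, while for $[1, 0]$, $[0, 1]$ and $[0, 0]$ the summations are $-0.02$, $-0.01$ and $-0.03$, all negative, so the output is 0. That is exactly the AND truth table.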

Multilayer Perceptrons and the BP Algorithm

Theory of the BP (Backpropagation) Algorithm


Coding the Neural Network

Task (Toy version)

Build a feedforward neural network and apply it to recognizing handwritten digits.

For now, just use the sum of squares as the cost function.
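With network output activations $a^{L}$ and desired output $y$, this cost for a single training example is

$$C=\frac{1}{2}\sum_{j}\left(a_{j}^{L}-y_{j}\right)^{2},$$

so $\nabla_{a}C = a^{L}-y$, which is exactly what cost_derivative returns in the code below. The backpropagation equations used in backprop are: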

$$\delta^{L}=\nabla_{a} C \odot \sigma'\left(z^{L}\right)$$
$$\delta^{l}=\left(\left(w^{l+1}\right)^{T} \delta^{l+1}\right) \odot \sigma'\left(z^{l}\right)$$
$$\frac{\partial C}{\partial b_{j}^{l}}=\delta_{j}^{l}$$
$$\frac{\partial C}{\partial w_{jk}^{l}}=a_{k}^{l-1}\,\delta_{j}^{l}$$

import numpy as np
import random
class Network():
    
    def __init__(self, sizes):
        """
        sizes: list represents the pattern of the network
        
        Input layers don't need bias.
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y,1) for y in sizes[1:]]
        self.weights = [np.random.randn(y,x) for x, y in zip(sizes[:-1], sizes[1:])]
    
    def train(self, training_data, epochs, mini_batch_size, lr, test_data=None):
        """
        train_data: a list of tuple(features, desired_output)
        test_data: if provided, model will show the performance on test data every epoch
        """
        n = len(training_data)
        for i in range(epochs):
            random.shuffle(training_data)
            mini_batches = [training_data[k:k+mini_batch_size] for k in range(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, lr)
            if test_data:
                print("Epoch:\t", i, "\t\t", self.evaluate(test_data))
            else:
                print("Epoch", i, "completed.")
    
    def update_mini_batch(self, mini_batch, lr):
        delta_b = [np.zeros(b.shape) for b in self.biases]
        delta_w = [np.zeros(w.shape) for w in self.weights]
        for x,y in mini_batch:
            delta_b_tmp, delta_w_tmp = self.backprop(x, y)
            delta_b = [db+dbt for db, dbt in zip(delta_b, delta_b_tmp)]
            delta_w = [dw+dwt for dw, dwt in zip(delta_w, delta_w_tmp)]
            
        self.weights = [w - lr*dw/len(mini_batch) for w, dw in zip(self.weights, delta_w)]
        self.biases = [b - lr*db/len(mini_batch) for b, db in zip(self.biases, delta_b)]
            
    
    def feedback(self, a):
        # forward pass: propagate the activation a through every layer
        for w, b in zip(self.weights, self.biases):
            a = self.sigmoid(np.dot(w, a) + b)
        return a
    
    def backprop(self, x, y):
        # First, we need to get the output of the whole network (forward pass).
        activation = x
        activations = [x]
        zs = []
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation) + b
            zs.append(z)
            activation = self.sigmoid(z)
            activations.append(activation)
        # then we need the error term of each neuron (delta in the equations above)
        eta = self.cost_derivative(activations[-1], y) * self.sigmoid_derivative(zs[-1])
        delta_b = [np.zeros(b.shape) for b in self.biases]
        delta_w = [np.zeros(w.shape) for w in self.weights]
        delta_b[-1] = eta
        delta_w[-1] = np.dot(eta, activations[-2].transpose())
        # backward pass through the remaining layers
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = self.sigmoid_derivative(z)
            eta = np.dot(self.weights[-l+1].transpose(), eta) * sp
            delta_b[-l] = eta
            delta_w[-l] = np.dot(eta, activations[-l-1].transpose())
        return (delta_b, delta_w)
    
    def evaluate(self, test_data):
        pass
        
    def cost_function(self, output, desired_output):
        # quadratic cost: half the sum of squared errors
        return np.sum(np.square(output - desired_output)) / 2
    
    def cost_derivative(self, output, desired_output):
        return (output - desired_output)
        
    def sigmoid(self, z):
        return 1.0/(1.0 + np.exp(-z))
    
    def sigmoid_derivative(self, z):
        return self.sigmoid(z)*(1 - self.sigmoid(z))

I just used pass to fill in the "evaluate" method, because defining the desired output for a neural network is another topic.
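Before moving to real data, here is a quick smoke test of the class as written above; the layer sizes and input values are purely illustrative.

net = Network([2, 3, 1])                    # 2 inputs, one hidden layer of 3 neurons, 1 output
a = net.feedback(np.array([[0.5], [0.2]]))  # forward pass on a 2x1 column vector
a.shape                                     # (1, 1): a single sigmoid activation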

Meanwhile, it is worth giving a summary of gradient descent methods.

3 Primary Types of Gradient Descent

From Here.

With gradient descent, you start with a cost function (also known as a loss or error function) based on a set of parameters. The goal is to find the parameter values that minimize the cost function. The process starts by guessing some initial parameter values. Then you iteratively change the parameter values in such a way so as to reduce the cost function. Hopefully, the process ends with a minimum.

The commonly used analogy is hiking down a hill from an initial starting point, while choosing a direction to advance using small steps along the way toward a minimum point. The gradient descent process uses the derivatives of the cost function to follow the function downhill to a minimum. The figure below illustrates the step-by-step gradient descent process.

[Figure: the step-by-step gradient descent process]

  • Batch Gradient Descent

Batch Gradient Descent is the most straightforward type. It calculates the error for each example within the training set. After it evaluates all training examples, it updates the model parameters. This process is often referred to as a training epoch.

Advantages of batch gradient descent are that it’s computationally efficient and produces a stable error gradient and a stable convergence.

One disadvantage is that the stable error gradient can sometimes result in a state of convergence that isn’t the best the model can achieve. It also requires that the entire training set resides in memory and is available to the algorithm.

  • Stochastic Gradient Descent

Stochastic Gradient Descent updates the parameters according to the gradient of the error with respect to a single training example. This is unlike Batch Gradient Descent, which updates the parameters after all training examples have been evaluated.

This can make Stochastic Gradient Descent faster than Batch Gradient Descent depending on the problem. One advantage is that the frequent updates provide a detailed rate of improvement.

A disadvantage is that the frequent updates are more computationally expensive than Batch Gradient Descent. The frequency of the updates also can result in noisy gradients, and may cause the error rate to fluctuate instead of slowly decrease.

  • Mini Batch Gradient Descent

Mini Batch Gradient Descent is an often-preferred method since it uses a combination of Stochastic Gradient Descent and Batch Gradient Descent. It simply separates the training set into small batches and performs an update for each of these batches.

Common numbers of examples per batch range between 30 and 500. But like for any other machine learning technique, there is no well-defined rule because the optimal number can vary for different problems. Mini Batch Gradient Descent is commonly used for deep learning problems.

Here I chose Mini Batch Gradient Descent.
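To make the three variants concrete, here is a minimal NumPy sketch, separate from the Network class above; the least-squares problem, function names, and hyperparameters are purely illustrative. The only difference between the three functions is how many examples contribute to each parameter update.

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))                 # 100 samples, 3 features
true_w = np.array([1.0, -2.0, 0.5])
y = X @ true_w + 0.01 * rng.normal(size=100)

def gradient(w, Xb, yb):
    # gradient of 0.5 * mean((Xb @ w - yb)**2) with respect to w
    return Xb.T @ (Xb @ w - yb) / len(yb)

def batch_gd(w, lr=0.1, epochs=50):
    for _ in range(epochs):
        w = w - lr * gradient(w, X, y)                      # one update per full pass
    return w

def stochastic_gd(w, lr=0.1, epochs=50):
    for _ in range(epochs):
        for i in np.random.permutation(len(y)):
            w = w - lr * gradient(w, X[i:i+1], y[i:i+1])    # one update per example
    return w

def mini_batch_gd(w, lr=0.1, epochs=50, batch_size=10):
    for _ in range(epochs):
        idx = np.random.permutation(len(y))
        for k in range(0, len(y), batch_size):
            batch = idx[k:k+batch_size]
            w = w - lr * gradient(w, X[batch], y[batch])    # one update per mini-batch
    return w

print(batch_gd(np.zeros(3)))
print(stochastic_gd(np.zeros(3)))
print(mini_batch_gd(np.zeros(3)))

The mini-batch version has exactly the structure of train and update_mini_batch above: shuffle, slice the data into batches, and average the per-sample gradients within each batch.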

Desired Output

In short, for a multi-class classification problem, what exactly should the desired output be? For a three-class problem, for example, a natural choice seems to be three output neurons, with only one of them set to 1 to represent each class. But it also seems possible to use just two output neurons and represent the desired output through a binary encoding. I simply chose the former as the desired output. This one-hot encoding seems to be the relatively common approach; this blog post describes it in more detail.
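As a tiny illustration of the three-class example above (the change_label helper in the data preparation code later does the same thing for the ten digit classes):

np.eye(3)[1].reshape(3, 1)   # one-hot target for class 1: array([[0.], [1.], [0.]])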

Modified Code

import numpy as np
import random
class Network():
    
    def __init__(self, sizes):
        """
        sizes: list represents the pattern of the network
        
        Input layers don't need bias.
        """
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y,1) for y in sizes[1:]]
        self.weights = [np.random.randn(y,x) for x, y in zip(sizes[:-1], sizes[1:])]
    
    def train(self, training_data, epochs, mini_batch_size, lr, test_data=None):
        """
        train_data: a list of tuple(features, desired_output)
        test_data: if provided, model will show the performance on test data every epoch
        """
        n = len(training_data)
        for i in range(epochs):
            random.shuffle(training_data)
            mini_batches = [training_data[k:k+mini_batch_size] for k in range(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, lr)
            if test_data:
                print("Epoch:\t", i, "\t\t", self.evaluate(test_data))
            else:
                print("Epoch", i, "completed.")
    
    def update_mini_batch(self, mini_batch, lr):
        delta_b = [np.zeros(b.shape) for b in self.biases]
        delta_w = [np.zeros(w.shape) for w in self.weights]
        for x,y in mini_batch:
            delta_b_tmp, delta_w_tmp = self.backprop(x, y)
            delta_b = [db+dbt for db, dbt in zip(delta_b, delta_b_tmp)]
            delta_w = [dw+dwt for dw, dwt in zip(delta_w, delta_w_tmp)]
            
        self.weights = [w - lr*dw/len(mini_batch) for w, dw in zip(self.weights, delta_w)]
        self.biases = [b - lr*db/len(mini_batch) for b, db in zip(self.biases, delta_b)]
            
    
    def feedback(self, a):
        for w, b in zip(self.weights, self.biases):
            a = self.sigmoid(np.dot(w,a) + b)
        return a
    
    def backprop(self, x, y):
        # First, we need to get the output of the whole network.
        activation = x
        activations = [x]
        zs = []
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w,activation) + b
            zs.append(z)
            activation = self.sigmoid(z)
            activations.append(activation)
        # then we need the error term of each neuron (delta in the equations above)
        eta = self.cost_derivative(activations[-1], y) * self.sigmoid_derivative(zs[-1])
        delta_b = [np.zeros(b.shape) for b in self.biases]
        delta_w = [np.zeros(w.shape) for w in self.weights]
        delta_b[-1] = eta
        delta_w[-1] = np.dot(eta, activations[-2].transpose())
        # backward
        for l in range(2, self.num_layers):
            z = zs[-l]
            sp = self.sigmoid_derivative(z)
            eta = np.dot(self.weights[-l+1].transpose(), eta) * sp
            delta_b[-l] = eta
            delta_w[-l] = np.dot(eta, activations[-l-1].transpose())
        return (delta_b, delta_w)   
    
    def evaluate(self, test_data):
        results = [(np.argmax(self.feedback(x)),y) for (x, y) in test_data]
        acc = np.sum([int(x)==int(y) for (x,y) in results]) / len(test_data)
        return acc
        
    def cost_function(self, output, desired_output):
        # quadratic cost: half the sum of squared errors
        return np.sum(np.square(output - desired_output)) / 2
    
    def cost_derivative(self, output, desired_output):
        return (output - desired_output)
        
    def sigmoid(self, z):
        return 1.0/(1.0 + np.exp(-z))
    
    def sigmoid_derivative(self, z):
        return self.sigmoid(z)*(1-self.sigmoid(z))

Load the data

import pickle
import gzip

mnist_data_file = "./mnist.pkl.gz"
with gzip.open(mnist_data_file, "rb") as f:
    # the MNIST pickle was created with Python 2, so decode strings as latin1
    training_data, validation_data, test_data = pickle.load(f, encoding="latin1")
type(training_data)
tuple
len(training_data)
2
training_data[0].shape
(50000, 784)
validation_data[0].shape
(10000, 784)
test_data[0].shape
(10000, 784)

Initialize the data

def change_label(y):
    # convert a digit label (0-9) into a 10-dimensional one-hot column vector
    e = np.zeros((10,1))
    e[y] = 1.0
    return e
feature_size = len(training_data[0][0])

# training_data
training_feature = [np.reshape(i,(feature_size, 1)) for i in training_data[0]]
training_label = [change_label(i) for i in training_data[1]]
training_useful = list(zip(training_feature, training_label))

# validation_data
validation_feature = [np.reshape(i,(feature_size, 1)) for i in validation_data[0]]
validation_useful = list(zip(validation_feature, validation_data[1]))

# test_data
test_feature = [np.reshape(i,(feature_size, 1)) for i in test_data[0]]
test_useful = list(zip(test_feature, test_data[1]))
type(training_useful)
list
len(training_useful)
50000
type(training_useful[0])
tuple
len(training_useful[0])
2
type(training_useful[0][0])
numpy.ndarray
training_useful[0][0].shape
(784, 1)
type(training_useful[0][1])
numpy.ndarray
training_useful[0][1].shape
(10, 1)
training_useful[0][1]
array([[0.],
       [0.],
       [0.],
       [0.],
       [0.],
       [1.],
       [0.],
       [0.],
       [0.],
       [0.]])
type(test_useful)
list
len(test_useful)
10000
type(test_useful[0])
tuple
len(test_useful[0])
2
test_useful[0][0].shape
(784, 1)
test_useful[0][1]
7

Test Results

%%time
nn = Network([784, 30, 10])

nn.train(training_data=training_useful, epochs=3, mini_batch_size=10,lr=0.5, test_data=validation_useful)
Epoch:	 0 		 0.8324
Epoch:	 1 		 0.884
Epoch:	 2 		 0.9
CPU times: user 3min 17s, sys: 229 ms, total: 3min 17s
Wall time: 38.2 s
%%time
nn = Network([784, 30, 10])

nn.train(training_data=training_useful, epochs=20, mini_batch_size=10,lr=1, test_data=validation_useful)
Epoch:	 0 		 0.8724
Epoch:	 1 		 0.9059
Epoch:	 2 		 0.9159
Epoch:	 3 		 0.9226
Epoch:	 4 		 0.9279
Epoch:	 5 		 0.9306
Epoch:	 6 		 0.9314
Epoch:	 7 		 0.9358
Epoch:	 8 		 0.9379
Epoch:	 9 		 0.938
Epoch:	 10 		 0.94
Epoch:	 11 		 0.9413
Epoch:	 12 		 0.9422
Epoch:	 13 		 0.9418
Epoch:	 14 		 0.9443
Epoch:	 15 		 0.9433
Epoch:	 16 		 0.9466
Epoch:	 17 		 0.9453
Epoch:	 18 		 0.9456
Epoch:	 19 		 0.9463
CPU times: user 44min 30s, sys: 5.62 s, total: 44min 36s
Wall time: 10min 45s

We can see that even such a naive neural network achieves a good result.

A First Look at Neural Network Tricks

Loss Function and Cross Entropy

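In brief: instead of the quadratic cost used above, a commonly used alternative for sigmoid output neurons is the cross-entropy cost

$$C=-\frac{1}{n}\sum_{x}\sum_{j}\left[y_{j}\ln a_{j}^{L}+\left(1-y_{j}\right)\ln\left(1-a_{j}^{L}\right)\right],$$

whose output-layer error term is simply $\delta^{L}=a^{L}-y$, without the $\sigma'(z^{L})$ factor, so learning does not slow down when the output neurons saturate.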

Boltzmann Machines (to be continued)

CNN (to be continued)

RNN (to be continued)

Autoencoder (to be continued)

Embedding (to be continued)

LSTM (to be continued)

Keras (to be continued)
