# [Python人工智能] 五.theano实现神经网络正规化Regularization处理

"莫烦大神" 网易云视频地址：http://study.163.com/provider/1111519/course.html

## 二. 定义Layer类及增加数据集

1.定义Layer类

L1 = Layer(inputs, in_size=13, out_size=50, activation_function)
参数包括输入值，输入节点数，输出节点数和激励函数
L2 = Layer(L1.outputs, 50, 1, None)
参数中L1的输出作为输入值，L1的输出50个节点作为输入节点，输出节点1个，激励函数为None。

#coding:utf-8
import numpy as np
import theano.tensor as T
import theano
from theano import function
import matplotlib.pyplot as plt

# Define the neural-network Layer class
class Layer(object):
    """One fully connected layer in a theano computation graph.

    Creates shared weight/bias variables and the symbolic output expression
    for the given symbolic ``inputs``.
    """

    def __init__(self, inputs, in_size, out_size, activation_function=None):
        # Weights drawn from N(0, 1); shape (in_size, out_size)
        self.W = theano.shared(np.random.normal(0, 1, (in_size, out_size)))
        # Bias initialized slightly positive (0.1) for every output unit
        self.b = theano.shared(np.zeros((out_size,)) + 0.1)
        # Linear part: inputs . W + b
        self.Wx_plus_b = T.dot(inputs, self.W) + self.b
        self.activation_function = activation_function
        # No activation -> linear layer; otherwise apply the activation
        if activation_function is None:
            self.outputs = self.Wx_plus_b
        else:
            self.outputs = self.activation_function(self.Wx_plus_b)



2.增加数据集

#coding:utf-8
import numpy as np
import theano.tensor as T
import theano
from theano import function
import matplotlib.pyplot as plt

# Define the neural-network Layer class
class Layer(object):
    """One fully connected layer in a theano computation graph.

    Creates shared weight/bias variables and the symbolic output expression
    for the given symbolic ``inputs``.
    """

    def __init__(self, inputs, in_size, out_size, activation_function=None):
        # Weights drawn from N(0, 1); shape (in_size, out_size)
        self.W = theano.shared(np.random.normal(0, 1, (in_size, out_size)))
        # Bias initialized slightly positive (0.1) for every output unit
        self.b = theano.shared(np.zeros((out_size,)) + 0.1)
        # Linear part: inputs . W + b
        self.Wx_plus_b = T.dot(inputs, self.W) + self.b
        self.activation_function = activation_function
        # No activation -> linear layer; otherwise apply the activation
        if activation_function is None:
            self.outputs = self.Wx_plus_b
        else:
            self.outputs = self.activation_function(self.Wx_plus_b)

# Min-max normalization: rescale the data into the [0, 1] range
def minmax_normalization(data):
    """Rescale every column of *data* linearly into [0, 1].

    NOTE(review): a constant column makes xs_max == xs_min and divides by
    zero; the Boston features have no constant column, so this never fires.
    """
    xs_max = np.max(data, axis=0)
    xs_min = np.min(data, axis=0)
    return (data - xs_min) / (xs_max - xs_min)

# Load the Boston housing dataset (~506 samples, 13 features per sample).
# NOTE(review): the loading lines were lost in extraction; restored from the
# surrounding text. load_boston was removed in scikit-learn 1.2 -- pin an
# older version or load the data from a CSV copy.
from sklearn.datasets import load_boston
np.random.seed(100)
boston = load_boston()
x_data = boston.data
y_data = boston.target[:, np.newaxis]  # add a dimension: (N,) -> (N, 1) matrix

# minmax normalization: rescale the inputs into [0, 1]
x_data = minmax_normalization(x_data)
print(x_data)
print(y_data)

# cross validation, train/test data split:
# first 400 samples are the training set, the remainder the test set
x_train, y_train = x_data[:400], y_data[:400]
x_test, y_test = x_data[400:], y_data[400:]
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

[[0.00000000e+00 1.80000000e-01 6.78152493e-02 ... 2.87234043e-01
1.00000000e+00 8.96799117e-02]
[2.35922539e-04 0.00000000e+00 2.42302053e-01 ... 5.53191489e-01
1.00000000e+00 2.04470199e-01]
[2.35697744e-04 0.00000000e+00 2.42302053e-01 ... 5.53191489e-01
9.89737254e-01 6.34657837e-02]
...
[6.11892474e-04 0.00000000e+00 4.20454545e-01 ... 8.93617021e-01
1.00000000e+00 1.07891832e-01]
[1.16072990e-03 0.00000000e+00 4.20454545e-01 ... 8.93617021e-01
9.91300620e-01 1.31070640e-01]
[4.61841693e-04 0.00000000e+00 4.20454545e-01 ... 8.93617021e-01
1.00000000e+00 1.69701987e-01]]
[[24. ]
[21.6]
[34.7]
[33.4]
[36.2]
...
[16.8]
[22.4]
[20.6]
[23.9]
[22. ]
[11.9]]
(400, 13) (400, 1)
(106, 13) (106, 1)

## 三. theano实现回归神经网络正规化

1.定义变量和Layer

L1: 13个属性，神经层有50个神经元，激活函数用tanh
L1 = Layer(x, 13, 50, T.tanh)
L2: 输入为L1输出，输入个数为50，输出为1即房价
L2 = Layer(L1.outputs, 50, 1, None)

#coding:utf-8
import numpy as np
import theano.tensor as T
import theano
from theano import function
import matplotlib.pyplot as plt

# Define the neural-network Layer class
class Layer(object):
    """One fully connected layer in a theano computation graph.

    Creates shared weight/bias variables and the symbolic output expression
    for the given symbolic ``inputs``.
    """

    def __init__(self, inputs, in_size, out_size, activation_function=None):
        # Weights drawn from N(0, 1); shape (in_size, out_size)
        self.W = theano.shared(np.random.normal(0, 1, (in_size, out_size)))
        # Bias initialized slightly positive (0.1) for every output unit
        self.b = theano.shared(np.zeros((out_size,)) + 0.1)
        # Linear part: inputs . W + b
        self.Wx_plus_b = T.dot(inputs, self.W) + self.b
        self.activation_function = activation_function
        # No activation -> linear layer; otherwise apply the activation
        if activation_function is None:
            self.outputs = self.Wx_plus_b
        else:
            self.outputs = self.activation_function(self.Wx_plus_b)

# Min-max normalization: rescale the data into the [0, 1] range
def minmax_normalization(data):
    """Rescale every column of *data* linearly into [0, 1].

    NOTE(review): a constant column makes xs_max == xs_min and divides by
    zero; the Boston features have no constant column, so this never fires.
    """
    xs_max = np.max(data, axis=0)
    xs_min = np.min(data, axis=0)
    return (data - xs_min) / (xs_max - xs_min)

# Load the Boston housing dataset (~506 samples, 13 features per sample).
# NOTE(review): the loading lines were lost in extraction; restored from the
# surrounding text. load_boston was removed in scikit-learn 1.2 -- pin an
# older version or load the data from a CSV copy.
from sklearn.datasets import load_boston
np.random.seed(100)
boston = load_boston()
x_data = boston.data
y_data = boston.target[:, np.newaxis]  # add a dimension: (N,) -> (N, 1) matrix

# minmax normalization: rescale the inputs into [0, 1]
x_data = minmax_normalization(x_data)
print(x_data)
print(y_data)

# cross validation, train/test data split:
# first 400 samples are the training set, the remainder the test set
x_train, y_train = x_data[:400], y_data[:400]
x_test, y_test = x_data[400:], y_data[400:]
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

# symbolic matrices for the features and the targets
x = T.dmatrix("x")
y = T.dmatrix("y")

# L1: 13 input features, 50 hidden neurons, tanh activation
L1 = Layer(x, 13, 50, T.tanh)
# L2: consumes L1's 50 outputs, emits 1 value (the predicted house price)
L2 = Layer(L1.outputs, 50, 1, None)

2.计算误差

（1）普通方法

cost = T.mean(T.square(L2.outputs-y))

（2）L2 Regularization
cost = T.mean(T.square(L2.outputs-y)) + 0.1*((L1.W**2).sum() + (L2.W**2).sum())

（3）L1 Regularization
cost = T.mean(T.square(L2.outputs-y)) + 0.1*(abs(L1.W).sum() + abs(L2.W).sum())

#coding:utf-8
import numpy as np
import theano.tensor as T
import theano
from theano import function
import matplotlib.pyplot as plt

# Define the neural-network Layer class
class Layer(object):
    """One fully connected layer in a theano computation graph.

    Creates shared weight/bias variables and the symbolic output expression
    for the given symbolic ``inputs``.
    """

    def __init__(self, inputs, in_size, out_size, activation_function=None):
        # Weights drawn from N(0, 1); shape (in_size, out_size)
        self.W = theano.shared(np.random.normal(0, 1, (in_size, out_size)))
        # Bias initialized slightly positive (0.1) for every output unit
        self.b = theano.shared(np.zeros((out_size,)) + 0.1)
        # Linear part: inputs . W + b
        self.Wx_plus_b = T.dot(inputs, self.W) + self.b
        self.activation_function = activation_function
        # No activation -> linear layer; otherwise apply the activation
        if activation_function is None:
            self.outputs = self.Wx_plus_b
        else:
            self.outputs = self.activation_function(self.Wx_plus_b)

# Min-max normalization: rescale the data into the [0, 1] range
def minmax_normalization(data):
    """Rescale every column of *data* linearly into [0, 1].

    NOTE(review): a constant column makes xs_max == xs_min and divides by
    zero; the Boston features have no constant column, so this never fires.
    """
    xs_max = np.max(data, axis=0)
    xs_min = np.min(data, axis=0)
    return (data - xs_min) / (xs_max - xs_min)

# Load the Boston housing dataset (~506 samples, 13 features per sample).
# NOTE(review): the loading lines were lost in extraction; restored from the
# surrounding text. load_boston was removed in scikit-learn 1.2 -- pin an
# older version or load the data from a CSV copy.
from sklearn.datasets import load_boston
np.random.seed(100)
boston = load_boston()
x_data = boston.data
y_data = boston.target[:, np.newaxis]  # add a dimension: (N,) -> (N, 1) matrix

# minmax normalization: rescale the inputs into [0, 1]
x_data = minmax_normalization(x_data)
print(x_data)
print(y_data)

# cross validation, train/test data split:
# first 400 samples are the training set, the remainder the test set
x_train, y_train = x_data[:400], y_data[:400]
x_test, y_test = x_data[400:], y_data[400:]
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

# symbolic matrices for the features and the targets
x = T.dmatrix("x")
y = T.dmatrix("y")

# L1: 13 input features, 50 hidden neurons, tanh activation
L1 = Layer(x, 13, 50, T.tanh)
# L2: consumes L1's 50 outputs, emits 1 value (the predicted house price)
L2 = Layer(L1.outputs, 50, 1, None)

# Plain mean-squared error -- this variant is prone to overfitting
cost = T.mean(T.square(L2.outputs - y))

# L2 regularization: 0.1 times the sum of squared weights of both layers.
# Penalty mechanism: pushes weights toward zero before overfitting sets in.
cost = T.mean(T.square(L2.outputs - y)) + 0.1 * ((L1.W ** 2).sum() + (L2.W ** 2).sum())

# L1 regularization: penalize the absolute weights instead.
# NOTE(review): each assignment overwrites the previous one, so only this
# last (L1-regularized) cost is actually used below.
cost = T.mean(T.square(L2.outputs - y)) + 0.1 * (abs(L1.W).sum() + abs(L2.W).sum())


3.梯度下降更新

L1.W, L1.W-learning_rate*gW1：
(原始的权重-学习效率*下降幅度)并且更新为L1.W，通过该方法将L1.W、L1.b、L2.W、L2.b更新。

#coding:utf-8
import numpy as np
import theano.tensor as T
import theano
from theano import function
import matplotlib.pyplot as plt

# Define the neural-network Layer class
class Layer(object):
    """One fully connected layer in a theano computation graph.

    Creates shared weight/bias variables and the symbolic output expression
    for the given symbolic ``inputs``.
    """

    def __init__(self, inputs, in_size, out_size, activation_function=None):
        # Weights drawn from N(0, 1); shape (in_size, out_size)
        self.W = theano.shared(np.random.normal(0, 1, (in_size, out_size)))
        # Bias initialized slightly positive (0.1) for every output unit
        self.b = theano.shared(np.zeros((out_size,)) + 0.1)
        # Linear part: inputs . W + b
        self.Wx_plus_b = T.dot(inputs, self.W) + self.b
        self.activation_function = activation_function
        # No activation -> linear layer; otherwise apply the activation
        if activation_function is None:
            self.outputs = self.Wx_plus_b
        else:
            self.outputs = self.activation_function(self.Wx_plus_b)

# Min-max normalization: rescale the data into the [0, 1] range
def minmax_normalization(data):
    """Rescale every column of *data* linearly into [0, 1].

    NOTE(review): a constant column makes xs_max == xs_min and divides by
    zero; the Boston features have no constant column, so this never fires.
    """
    xs_max = np.max(data, axis=0)
    xs_min = np.min(data, axis=0)
    return (data - xs_min) / (xs_max - xs_min)

# Load the Boston housing dataset (~506 samples, 13 features per sample).
# NOTE(review): the loading lines were lost in extraction; restored from the
# surrounding text. load_boston was removed in scikit-learn 1.2 -- pin an
# older version or load the data from a CSV copy.
from sklearn.datasets import load_boston
np.random.seed(100)
boston = load_boston()
x_data = boston.data
y_data = boston.target[:, np.newaxis]  # add a dimension: (N,) -> (N, 1) matrix

# minmax normalization: rescale the inputs into [0, 1]
x_data = minmax_normalization(x_data)
print(x_data)
#print(y_data)

# cross validation, train/test data split:
# first 400 samples are the training set, the remainder the test set
x_train, y_train = x_data[:400], y_data[:400]
x_test, y_test = x_data[400:], y_data[400:]
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

# symbolic matrices for the features and the targets
x = T.dmatrix("x")
y = T.dmatrix("y")

# L1: 13 input features, 50 hidden neurons, tanh activation
L1 = Layer(x, 13, 50, T.tanh)
# L2: consumes L1's 50 outputs, emits 1 value (the predicted house price)
L2 = Layer(L1.outputs, 50, 1, None)

# Plain mean-squared error -- this variant is prone to overfitting
cost = T.mean(T.square(L2.outputs - y))

# L2 regularization: 0.1 times the sum of squared weights of both layers.
# Penalty mechanism: pushes weights toward zero before overfitting sets in.
cost = T.mean(T.square(L2.outputs - y)) + 0.1 * ((L1.W ** 2).sum() + (L2.W ** 2).sum())

# L1 regularization: penalize the absolute weights instead.
# NOTE(review): each assignment overwrites the previous one, so only this
# last (L1-regularized) cost is actually used below.
cost = T.mean(T.square(L2.outputs - y)) + 0.1 * (abs(L1.W).sum() + abs(L2.W).sum())

# Gradients of the cost w.r.t. every trainable parameter
gW1, gb1, gW2, gb2 = T.grad(cost, [L1.W, L1.b, L2.W, L2.b])

learning_rate = 0.01

# One gradient-descent step: param <- param - learning_rate * gradient,
# applied to L1.W, L1.b, L2.W and L2.b on every call
train = theano.function(
    inputs=[x, y],
    updates=[(L1.W, L1.W - learning_rate * gW1),
             (L1.b, L1.b - learning_rate * gb1),
             (L2.W, L2.W - learning_rate * gW2),
             (L2.b, L2.b - learning_rate * gb2)])

# Compiled function that evaluates the cost without updating parameters
compute_cost = theano.function(inputs=[x, y], outputs=cost)
print(compute_cost)

4.预测结果

#coding:utf-8
import numpy as np
import theano.tensor as T
import theano
from theano import function
import matplotlib.pyplot as plt

# Define the neural-network Layer class
class Layer(object):
    """One fully connected layer in a theano computation graph.

    Creates shared weight/bias variables and the symbolic output expression
    for the given symbolic ``inputs``.
    """

    def __init__(self, inputs, in_size, out_size, activation_function=None):
        # Weights drawn from N(0, 1); shape (in_size, out_size)
        self.W = theano.shared(np.random.normal(0, 1, (in_size, out_size)))
        # Bias initialized slightly positive (0.1) for every output unit
        self.b = theano.shared(np.zeros((out_size,)) + 0.1)
        # Linear part: inputs . W + b
        self.Wx_plus_b = T.dot(inputs, self.W) + self.b
        self.activation_function = activation_function
        # No activation -> linear layer; otherwise apply the activation
        if activation_function is None:
            self.outputs = self.Wx_plus_b
        else:
            self.outputs = self.activation_function(self.Wx_plus_b)

# Min-max normalization: rescale the data into the [0, 1] range
def minmax_normalization(data):
    """Rescale every column of *data* linearly into [0, 1].

    NOTE(review): a constant column makes xs_max == xs_min and divides by
    zero; the Boston features have no constant column, so this never fires.
    """
    xs_max = np.max(data, axis=0)
    xs_min = np.min(data, axis=0)
    return (data - xs_min) / (xs_max - xs_min)

# Load the Boston housing dataset (~506 samples, 13 features per sample).
# NOTE(review): the loading lines were lost in extraction; restored from the
# surrounding text. load_boston was removed in scikit-learn 1.2 -- pin an
# older version or load the data from a CSV copy.
from sklearn.datasets import load_boston
np.random.seed(100)
boston = load_boston()
x_data = boston.data
y_data = boston.target[:, np.newaxis]  # add a dimension: (N,) -> (N, 1) matrix

# minmax normalization: rescale the inputs into [0, 1]
x_data = minmax_normalization(x_data)
print(x_data)
#print(y_data)

# cross validation, train/test data split:
# first 400 samples are the training set, the remainder the test set
x_train, y_train = x_data[:400], y_data[:400]
x_test, y_test = x_data[400:], y_data[400:]
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

# symbolic matrices for the features and the targets
x = T.dmatrix("x")
y = T.dmatrix("y")

# L1: 13 input features, 50 hidden neurons, tanh activation
L1 = Layer(x, 13, 50, T.tanh)
# L2: consumes L1's 50 outputs, emits 1 value (the predicted house price)
L2 = Layer(L1.outputs, 50, 1, None)

# Plain mean-squared error -- this variant is prone to overfitting
cost = T.mean(T.square(L2.outputs - y))

# L2 regularization: 0.1 times the sum of squared weights of both layers.
# Penalty mechanism: pushes weights toward zero before overfitting sets in.
cost = T.mean(T.square(L2.outputs - y)) + 0.1 * ((L1.W ** 2).sum() + (L2.W ** 2).sum())

# L1 regularization: penalize the absolute weights instead.
# NOTE(review): each assignment overwrites the previous one, so only this
# last (L1-regularized) cost is actually used below.
cost = T.mean(T.square(L2.outputs - y)) + 0.1 * (abs(L1.W).sum() + abs(L2.W).sum())

# Gradients of the cost w.r.t. every trainable parameter
gW1, gb1, gW2, gb2 = T.grad(cost, [L1.W, L1.b, L2.W, L2.b])

learning_rate = 0.01

# One gradient-descent step: param <- param - learning_rate * gradient
train = theano.function(
    inputs=[x, y],
    updates=[(L1.W, L1.W - learning_rate * gW1),
             (L1.b, L1.b - learning_rate * gb1),
             (L2.W, L2.W - learning_rate * gW2),
             (L2.b, L2.b - learning_rate * gb2)])

# Compiled function that evaluates the cost without updating parameters
compute_cost = theano.function(inputs=[x, y], outputs=cost)
print(compute_cost)

# Cost history
train_err_list = []
test_err_list = []
learning_time = []  # iteration index for each recorded cost

# Train for 1000 iterations, recording the errors every 10 steps
for i in range(1000):
    train(x_train, y_train)
    if i % 10 == 0:
        cost1 = compute_cost(x_train, y_train)  # training error
        train_err_list.append(cost1)
        cost2 = compute_cost(x_test, y_test)    # test (prediction) error
        test_err_list.append(cost2)
        learning_time.append(i)
        print(cost1)
        print(cost2)
        print(i)

76.95290841879309
64.23189302430346
0

50.777745719854
32.325523689775714
10

37.604371357212884
20.74023271455164
20
...

5.绘制图形对比

#coding:utf-8
import numpy as np
import theano.tensor as T
import theano
from theano import function
import matplotlib.pyplot as plt

# Define the neural-network Layer class
class Layer(object):
    """One fully connected layer in a theano computation graph.

    Creates shared weight/bias variables and the symbolic output expression
    for the given symbolic ``inputs``.
    """

    def __init__(self, inputs, in_size, out_size, activation_function=None):
        # Weights drawn from N(0, 1); shape (in_size, out_size)
        self.W = theano.shared(np.random.normal(0, 1, (in_size, out_size)))
        # Bias initialized slightly positive (0.1) for every output unit
        self.b = theano.shared(np.zeros((out_size,)) + 0.1)
        # Linear part: inputs . W + b
        self.Wx_plus_b = T.dot(inputs, self.W) + self.b
        self.activation_function = activation_function
        # No activation -> linear layer; otherwise apply the activation
        if activation_function is None:
            self.outputs = self.Wx_plus_b
        else:
            self.outputs = self.activation_function(self.Wx_plus_b)

# Min-max normalization: rescale the data into the [0, 1] range
def minmax_normalization(data):
    """Rescale every column of *data* linearly into [0, 1].

    NOTE(review): a constant column makes xs_max == xs_min and divides by
    zero; the Boston features have no constant column, so this never fires.
    """
    xs_max = np.max(data, axis=0)
    xs_min = np.min(data, axis=0)
    return (data - xs_min) / (xs_max - xs_min)

# Load the Boston housing dataset (~506 samples, 13 features per sample).
# NOTE(review): the loading lines were lost in extraction; restored from the
# surrounding text. load_boston was removed in scikit-learn 1.2 -- pin an
# older version or load the data from a CSV copy.
from sklearn.datasets import load_boston
np.random.seed(100)
boston = load_boston()
x_data = boston.data
y_data = boston.target[:, np.newaxis]  # add a dimension: (N,) -> (N, 1) matrix

# minmax normalization: rescale the inputs into [0, 1]
x_data = minmax_normalization(x_data)
print(x_data)
#print(y_data)

# cross validation, train/test data split:
# first 400 samples are the training set, the remainder the test set
x_train, y_train = x_data[:400], y_data[:400]
x_test, y_test = x_data[400:], y_data[400:]
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

# symbolic matrices for the features and the targets
x = T.dmatrix("x")
y = T.dmatrix("y")

# L1: 13 input features, 50 hidden neurons, tanh activation
L1 = Layer(x, 13, 50, T.tanh)
# L2: consumes L1's 50 outputs, emits 1 value (the predicted house price)
L2 = Layer(L1.outputs, 50, 1, None)

# Plain mean-squared error -- this variant overfits the training data
cost = T.mean(T.square(L2.outputs - y))

# L2 regularization (uncomment to compare): penalize squared weights
#cost = T.mean(T.square(L2.outputs-y)) + 0.1*((L1.W**2).sum() + (L2.W**2).sum())

# L1 regularization (uncomment to compare): penalize absolute weights
#cost = T.mean(T.square(L2.outputs-y)) + 0.1*(abs(L1.W).sum() + abs(L2.W).sum())

# Compare regularized vs. unregularized training.
# Gradients of the cost w.r.t. every trainable parameter
gW1, gb1, gW2, gb2 = T.grad(cost, [L1.W, L1.b, L2.W, L2.b])

learning_rate = 0.01

# One gradient-descent step: param <- param - learning_rate * gradient
train = theano.function(
    inputs=[x, y],
    updates=[(L1.W, L1.W - learning_rate * gW1),
             (L1.b, L1.b - learning_rate * gb1),
             (L2.W, L2.W - learning_rate * gW2),
             (L2.b, L2.b - learning_rate * gb2)])

# Compiled function that evaluates the cost without updating parameters
compute_cost = theano.function(inputs=[x, y], outputs=cost)
print(compute_cost)

# Cost history
train_err_list = []
test_err_list = []
learning_time = []  # iteration index for each recorded cost

# Train for 1000 iterations, recording the errors every 10 steps
for i in range(1000):
    train(x_train, y_train)
    if i % 10 == 0:
        cost1 = compute_cost(x_train, y_train)  # training error
        train_err_list.append(cost1)
        cost2 = compute_cost(x_test, y_test)    # test (prediction) error
        test_err_list.append(cost2)
        learning_time.append(i)
        print(cost1)
        print(cost2)
        print(i)

# Plot the cost history: red solid = training error, blue dashed = test error
plt.plot(learning_time, train_err_list, 'r-')
plt.plot(learning_time, test_err_list, 'b--')
plt.show()
（1）Overfitting问题对应曲线，红色线为训练误差，蓝色虚线为测试结果，会发现预测的误差在不断变大。
cost = T.mean(T.square(L2.outputs-y))

（2）L2 Regularization，通过正规化处理后的结果，发现预测结果和训练结果的误差变化基本一致，其效果更好。
cost = T.mean(T.square(L2.outputs-y)) + 0.1*((L1.W**2).sum() + (L2.W**2).sum())

（3）L1 regularization输出结果如下图所示：
cost = T.mean(T.square(L2.outputs-y)) + 0.1*(abs(L1.W).sum() + abs(L2.W).sum())

(By:Eastmount 2018-06-01 下午5点  http://blog.csdn.net/eastmount/ )

• 广告
• 抄袭
• 版权
• 政治
• 色情
• 无意义
• 其他

120