import numpy as np
import pandas as pd
def affine_forward(x, w, b):
    '''
    Compute the forward pass of an affine (fully connected) layer.

    Every sample in ``x`` is flattened into a row vector before the matrix
    product, so both 2-D inputs of shape (N, D) and multi-dimensional inputs
    of shape (N, d_1, ..., d_k) with d_1 * ... * d_k == D are accepted.
    affine_backward() reshapes the cached input the same way.

    :param x: numpy array of input data, first axis indexes the N samples
    :param w: numpy array of weights, of shape (D, M)
    :param b: numpy array of biases, of shape (M,)
    :return: tuple
    out: output, of shape (N, M)
    cache: (x, w, b), with x kept in its original (unflattened) shape
    '''
    # Flattening is a no-op for input that is already (N, D).
    flat_x = np.reshape(x, (x.shape[0], -1))
    out = flat_x.dot(w) + b
    cache = (x, w, b)
    return out, cache
def relu_forward(x):
    '''
    Apply the rectified linear unit element-wise.

    :param x: numpy array of any shape
    :return: tuple
    out: array where every element of x is clamped from below at 0
    cache: the input x itself, kept for the backward pass
    '''
    activated = np.maximum(0, x)
    return activated, x
def affine_relu_forward(x, w, b):
    '''
    Convenience layer: an affine transform followed by a ReLU.

    :param x: input passed to affine_forward
    :param w: weights passed to affine_forward
    :param b: biases passed to affine_forward
    :return: tuple
    out: output of the ReLU
    cache: (fc_cache, relu_cache), the caches of the two sub-layers
    '''
    pre_activation, fc_cache = affine_forward(x, w, b)  # linear part
    out, relu_cache = relu_forward(pre_activation)  # non-linearity
    return out, (fc_cache, relu_cache)
def batch_forward(x, gamma, beta, bn_param):
    '''
    Forward pass for batch normalization.

    In 'train' mode the per-feature mean and variance of the mini-batch are
    used to normalize x, and the running statistics are updated with an
    exponential moving average.  In 'test' mode the stored running
    statistics are used instead.

    :param x: input data, of shape (N, D)
    :param gamma: scale parameter, broadcast against the (N, D) input
    :param beta: shift parameter, broadcast against the (N, D) input
    :param bn_param: dictionary with keys:
    mode: 'train' or 'test' (required)
    eps: constant added to the variance, default 1e-5
    momentum: decay for the running statistics, default 0.9
    running_mean / running_var: running statistics, default zeros of
    shape (D,); both are written back into bn_param on every call
    :return: tuple
    out: normalized, scaled and shifted data, of shape (N, D)
    cache: (x, sample_mean, sample_var, x_hat, eps, gamma, beta) in
    'train' mode, None in 'test' mode
    :raises ValueError: if mode is neither 'train' nor 'test'
    '''
    mode = bn_param['mode']
    eps = bn_param.get('eps', 1e-5)
    momentum = bn_param.get('momentum', 0.9)
    _, D = x.shape
    running_mean = bn_param.get('running_mean', np.zeros(D, dtype=x.dtype))
    running_var = bn_param.get('running_var', np.zeros(D, dtype=x.dtype))
    out, cache = None, None
    if mode == 'train':
        sample_mean = np.mean(x, axis=0)  # per-column mean, shape (D,)
        sample_var = np.var(x, axis=0)  # per-column variance, shape (D,)
        x_hat = (x - sample_mean) / np.sqrt(sample_var + eps)
        out = gamma * x_hat + beta
        cache = (x, sample_mean, sample_var, x_hat, eps, gamma, beta)
        running_mean = momentum * running_mean + (1 - momentum) * sample_mean
        running_var = momentum * running_var + (1 - momentum) * sample_var
    elif mode == 'test':
        out = (x - running_mean) * gamma / np.sqrt(running_var + eps) + beta
    else:
        raise ValueError('Invalid forward batchnorm mode "%s"' % mode)
    # Store the (possibly updated) running statistics back into bn_param.
    bn_param['running_mean'] = running_mean
    bn_param['running_var'] = running_var
    return out, cache
def affine_bn_relu_forward(x, w, b, gamma, beta, bn_param):
    '''
    Convenience layer: affine transform, batch normalization, then ReLU.

    :param x: array of shape (N, D1), input to the affine layer
    :param w: array of shape (D1, D2), weights of the affine transform
    :param b: array of shape (D2,), biases of the affine transform
    :param gamma: array of shape (D2,), scale for batch normalization
    :param beta: array of shape (D2,), shift for batch normalization
    :param bn_param: dictionary of batch normalization parameters, passed
    through to batch_forward
    :return: tuple
    out: output of the ReLU, of shape (N, D2)
    cache: (fc_cache, bn_cache, relu_cache), to give to the backward pass
    '''
    a, fc_cache = affine_forward(x, w, b)
    a_bn, bn_cache = batch_forward(a, gamma, beta, bn_param)
    out, relu_cache = relu_forward(a_bn)
    # The backward pass unpacks the batch-norm *cache* (intermediate values),
    # not the bn_param configuration dictionary.
    cache = (fc_cache, bn_cache, relu_cache)
    return out, cache
def batchnorm_backward(dout, cache):
    '''
    Backward pass for batch normalization.

    :param dout: upstream derivative, of shape (N, D)
    :param cache: tuple (x, mean, var, x_hat, eps, gamma, beta) as produced
    by batch_forward in 'train' mode
    :return: tuple
    dx: gradient with respect to the input x, of shape (N, D)
    dgamma: gradient with respect to the scale gamma, of shape (D,)
    dbeta: gradient with respect to the shift beta, of shape (D,)
    '''
    x, mean, var, x_hat, eps, gamma, beta = cache
    num_samples = x.shape[0]
    centered = x - mean

    # Gradients of the learnable scale and shift.
    dgamma = np.sum(dout * x_hat, axis=0)
    dbeta = np.sum(dout * 1.0, axis=0)

    # Gradient flowing into the normalized activations.
    grad_x_hat = dout * gamma
    grad_direct = grad_x_hat / np.sqrt(var + eps)
    grad_times_centered = np.sum(grad_x_hat * centered, axis=0)

    grad_var = -0.5 * ((var + eps) ** (-1.5) * grad_times_centered)
    # The variance itself depends on the mean, hence the second term.
    grad_mean = -1.0 * np.sum(grad_direct, axis=0) + \
        grad_var * np.mean(-2.0 * centered, axis=0)

    via_var = grad_var * 2.0 / num_samples * centered
    via_mean = grad_mean * 1.0 / num_samples  # shape (D,), broadcasts over rows
    dx = grad_direct + via_var + via_mean
    return dx, dgamma, dbeta
def dropout_forward(x, dropout_param):
    '''
    Forward pass for (inverted) dropout.

    :param x: input data, numpy array of any shape
    :param dropout_param: dictionary with keys:
    p: probability of dropping a unit (units are kept with 1 - p)
    mode: 'train' applies a random mask, 'test' passes x through
    seed: optional, seeds numpy's global random generator when present
    :return: tuple
    out: array of the same shape and dtype as x
    cache: (dropout_param, mask); mask is None in 'test' mode
    '''
    p = dropout_param['p']
    mode = dropout_param['mode']
    if 'seed' in dropout_param:
        np.random.seed(dropout_param['seed'])

    mask, out = None, None
    if mode == 'train':
        keep_prob = 1 - p
        # Kept units are scaled by 1 / keep_prob at training time.
        mask = (np.random.rand(*x.shape) < keep_prob) / keep_prob
        out = mask * x
    elif mode == 'test':
        out = x

    out = out.astype(x.dtype, copy=False)
    return out, (dropout_param, mask)
def affine_bn_relu_backward(dout, cache):
    '''
    Backward pass for the affine - batchnorm - relu convenience layer.

    The sub-layers are traversed in reverse order of the forward pass:
    ReLU first, then batch normalization, then the affine transform.

    :param dout: upstream derivative
    :param cache: tuple (fc_cache, bn_cache, relu_cache)
    :return: tuple (dx, dw, db, dgamma, dbeta)
    '''
    fc_cache, bn_cache, relu_cache = cache
    grad_bn_out = relu_backward(dout, relu_cache)
    grad_affine_out, dgamma, dbeta = batchnorm_backward(grad_bn_out, bn_cache)
    dx, dw, db = affine_backward(grad_affine_out, fc_cache)
    return dx, dw, db, dgamma, dbeta
def dropout_backward(dout, cache):
    '''
    Backward pass for dropout.

    :param dout: upstream derivative
    :param cache: tuple (dropout_param, mask) from dropout_forward
    :return: mask * dout in 'train' mode, dout unchanged in 'test' mode,
    None for any other mode
    '''
    dropout_param, mask = cache
    mode = dropout_param['mode']
    if mode == 'train':
        return mask * dout
    if mode == 'test':
        return dout
    return None
def softmax_loss(z, y):
    '''
    Softmax cross-entropy loss and its gradient.

    :param z: scores, of shape (N, C) where z[i, j] is the score of the
    j-th class for the i-th input
    :param y: integer labels, of shape (N,), used to index the C columns
    :return: tuple
    loss: scalar, mean negative log-probability of the correct classes
    dz: gradient of the loss with respect to z, of shape (N, C)
    '''
    num_samples = z.shape[0]
    rows = np.arange(num_samples)

    # Subtract the row-wise maximum before exponentiating.
    shifted = z - np.max(z, axis=1, keepdims=True)
    probs = np.exp(shifted)
    probs /= np.sum(probs, axis=1, keepdims=True)

    loss = -np.sum(np.log(probs[rows, y])) / num_samples

    dz = probs.copy()
    dz[rows, y] -= 1
    dz /= num_samples
    return loss, dz
def affine_backward(dout, cache):
    '''
    Compute the backward pass for an affine layer.

    The forward pass is out = x.dot(w) + b with shapes
    (N, M) = (N, D) . (D, M) + (M,).

    :param dout: upstream derivative, of shape (N, M)
    :param cache: tuple of:
    z: input data, of shape (N, d_1, ..., d_k)
    w: weights, of shape (D, M)
    b: biases, of shape (M,)
    :return: tuple
    dz: gradient with respect to z, of shape (N, d_1, ..., d_k)
    dw: gradient with respect to w, of shape (D, M)
    db: gradient with respect to b, of shape (M,)
    '''
    z, w, b = cache
    flat_z = np.reshape(z, [z.shape[0], -1])
    # (N, M) . (M, D) = (N, D), then restored to the shape of the input.
    dz = np.reshape(dout.dot(w.T), z.shape)
    dw = flat_z.T.dot(dout)
    db = np.sum(dout, axis=0)
    return dz, dw, db
def relu_backward(dout, cache):
    '''
    Backward pass for a ReLU layer.

    :param dout: upstream derivative
    :param cache: the input x that was given to relu_forward
    :return: dout where x is strictly positive, multiplied by zero elsewhere
    '''
    x = cache
    positive = x > 0
    return positive * dout
def affine_relu_backward(dout, cache):
    '''
    Backward pass for the affine - relu convenience layer.

    :param dout: upstream derivative
    :param cache: tuple (fc_cache, relu_cache) from affine_relu_forward
    :return: tuple (dx, dw, db) from the affine layer
    '''
    fc_cache, relu_cache = cache  # fc_cache is (x, w, b)
    grad_pre_activation = relu_backward(dout, relu_cache)
    return affine_backward(grad_pre_activation, fc_cache)
# class TwoLayerNet (object): # 我们的2层全连接神经网络
# '''
# 首先,需要初始化我们的神经网络。
# 毕竟,数据从输入层第一次流入到神经网络里,我们的参数(W,B)不能为空
# 也不能都太大或太小,因为参数(W,B)的初始化相当重要的
# 对整个神经网络的训练影响巨大,但 如何proper的 初始化参数仍然没有定论
# 目前仍有很多paper在专门讨论论这个 话题
# '''
# def __init__(self
# , input_dim = 3*3*32 #每张样本图片的数据维度大小
# , hidden_dim = 100 #隐藏层的神经元个数
# , num_classes = 10 #样本图片的分类类别个数
# , weight_scale = 1e-3): #初始化参数的权重尺度(标准差
# '''
# 我们把需要学习的参数(W,B)都存在self.params字典中,
# 其中每个元素都是numpy.arrays
#
# '''
# self.param = {}
# # 3*32*32,100
# self.param['W1'] = weight_scale * np.random.randn(input_dim,
# hidden_dim)
# self.param['b1'] = weight_scale * np.random.randn((hidden_dim,))
# self.param['W2'] = weight_scale * np.random.randn(hidden_dim,
# num_classes)
# self.param['b2'] = weight_scale * np.random.randn((num_classes,))
#接下来,我们定义最后一个loss函数就可以完成神经网络的构造
# def loss(self, X, y ):
# '''
# 首先, 输入的数据X 是一个多维的array,shape为(n,3,32,32)
# y是与输入数据相对应的正确标签,shape为(N,)
# 我们的loss函数目标输出一个损失值loss和一个grads的字典,
# 其中存有loss关于隐藏层和输出层的参数(W,B)的梯度值
# :param X:
# :param y:
# :return:
# '''
# loss, grads = 0, {}
# #数据X在隐藏层和输出层的前向传播:
# h1_out, h1_cache = affine_relu_forward(X, self.param['W1'], self.param['b1'])
# scores, out_cache = affine_forward(h1_out
# , self.param['W2']
# , self.param['b2'])
# #输出层后,结合正确标签y得出损失值和其在输出层的梯度:
# loss, dout = softmax_loss(scores, y)
#
# #损失值loss的梯度在输出层和隐藏层的反向传播:
# dout, dw2, db2 = affine_backward(dout, out_cache)
# grads['W2'] = dw2, grads['b2'] = db2
# _, dw1, db1 = affine_relu_backward(dout, h1_cache)
# grads['W1'] = dw1, grads['b1'] = db1
#
# '''
# 可以看到图片样本的数据梯度dout只起到了带路的作用,
# 最终会舍弃掉,我们只要loss关于参数的梯度,
# 然后保存在grads字典中
# '''
# # loss += 0.5* self.reg * (np.sum(self.param['W1']**2) +
# # np.sum(self.param['W2']**2))
# # dW2 += self.reg * self.param['W2']
# # dW1 += self.reg * self.param['W1']
# return loss, grads
class FullyConnectNet(object):
    '''
    Fully connected network with an arbitrary number of hidden layers.

    Uses ReLU activations and a softmax loss, with optional dropout and
    batch normalization.  For a network with L layers the architecture is:

        {affine - [batch norm] - relu - [dropout]} * (L - 1) - affine - softmax

    where the bracketed layers are optional and the {...} group is repeated
    once per hidden layer.  All learnable parameters are stored as numpy
    arrays in the ``self.params`` dictionary.
    '''

    def __init__(self,
                 hidden_dims,
                 input_dim=3 * 32 * 32,
                 num_classes=10,
                 dropout=0,
                 use_batchnorm=False,
                 reg=0.0,
                 weight_scale=1e-2,
                 dtype=np.float64,
                 seed=None):
        '''
        Initialize the network parameters.

        :param hidden_dims: list with one entry per hidden layer giving the
        number of units of that layer (may be empty)
        :param input_dim: size of one flattened input sample
        :param num_classes: number of output classes
        :param dropout: dropout strength stored as 'p' in dropout_param;
        dropout is enabled when this is greater than 0
        :param use_batchnorm: whether to use batch normalization
        :param reg: L2 regularization strength
        :param weight_scale: standard deviation of the random weight init
        :param dtype: numpy dtype that all parameters are cast to
        :param seed: optional seed forwarded to the dropout layers
        '''
        # NOTE: the attribute name is misspelled ("ues"); it is kept as-is so
        # that existing code reading it keeps working.
        self.ues_batchnorm = use_batchnorm
        self.use_dropout = dropout > 0
        self.reg = reg
        self.num_layers = 1 + len(hidden_dims)
        self.dtype = dtype
        self.params = {}

        # Hidden layers: the width of each layer is the input width of the
        # next one, tracked in in_dim.
        in_dim = input_dim
        for i, h_dim in enumerate(hidden_dims):
            layer = i + 1
            self.params['W%d' % layer] = weight_scale * np.random.randn(in_dim, h_dim)
            self.params['b%d' % layer] = np.zeros((h_dim,))
            if use_batchnorm:
                self.params['gamma%d' % layer] = np.ones((h_dim,))
                self.params['beta%d' % layer] = np.zeros((h_dim,))
            in_dim = h_dim

        # Output layer; in_dim is input_dim when there are no hidden layers.
        self.params['W%d' % self.num_layers] = weight_scale * np.random.randn(in_dim, num_classes)
        self.params['b%d' % self.num_layers] = np.zeros(num_classes)

        # Every dropout layer receives the same parameter dictionary, so all
        # of them see the same p and the current mode (train or test).
        self.dropout_param = {}
        if self.use_dropout:
            self.dropout_param = {'mode': 'train', 'p': dropout}
            if seed is not None:
                self.dropout_param['seed'] = seed

        # One parameter dictionary per batch-norm layer (one per hidden
        # layer); bn_params[i] belongs to the (i + 1)-th hidden layer and
        # holds that layer's running statistics.
        self.bn_params = []
        if self.ues_batchnorm:
            self.bn_params = [{'mode': 'train'} for _ in range(self.num_layers - 1)]

        # Cast all learnable parameters to the requested precision.
        for k, v in self.params.items():
            self.params[k] = v.astype(dtype)

    def loss(self, X, y=None):
        '''
        Compute scores, or the loss and gradients, for a mini-batch.

        :param X: numpy array of input data; presumably the first axis
        indexes the samples (confirm against the caller)
        :param y: array of labels for X; when None the network runs in
        'test' mode, otherwise in 'train' mode
        :return: in 'test' mode the scores of the output layer; in 'train'
        mode a tuple (loss, grads) where grads maps every parameter name
        in self.params to the gradient of the loss for that parameter
        '''
        X = X.astype(self.dtype)
        mode = 'test' if y is None else 'train'

        # Dropout and batch normalization behave differently per mode.
        if self.use_dropout:
            self.dropout_param['mode'] = mode
        if self.ues_batchnorm:
            for bn_param in self.bn_params:
                bn_param['mode'] = mode

        # ---- forward pass ----
        fc_mix_cache = {}  # cache of every hidden layer, keyed by index
        dp_cache = {}  # cache of every dropout layer, keyed by index
        out = X
        for i in range(self.num_layers - 1):
            layer = i + 1
            w, b = self.params['W%d' % layer], self.params['b%d' % layer]
            if self.ues_batchnorm:
                gamma = self.params['gamma%d' % layer]
                beta = self.params['beta%d' % layer]
                out, fc_mix_cache[i] = affine_bn_relu_forward(
                    out, w, b, gamma, beta, self.bn_params[i])
            else:
                out, fc_mix_cache[i] = affine_relu_forward(out, w, b)
            if self.use_dropout:
                out, dp_cache[i] = dropout_forward(out, self.dropout_param)

        # Output layer: plain affine transform.
        w = self.params['W%d' % self.num_layers]
        b = self.params['b%d' % self.num_layers]
        scores, out_cache = affine_forward(out, w, b)

        if mode == 'test':
            return scores

        # ---- backward pass (train mode only) ----
        grads = {}
        loss, dout = softmax_loss(scores, y)
        loss += 0.5 * self.reg * np.sum(self.params['W%d' % self.num_layers] ** 2)
        dout, dw, db = affine_backward(dout, out_cache)
        grads['W%d' % self.num_layers] = dw + self.reg * self.params['W%d' % self.num_layers]
        grads['b%d' % self.num_layers] = db

        # Walk the hidden layers from last to first, accumulating the L2
        # penalty into the loss and filling the gradient dictionary.
        for ri in reversed(range(self.num_layers - 1)):
            layer = ri + 1
            loss += 0.5 * self.reg * np.sum(self.params['W%d' % layer] ** 2)
            if self.use_dropout:
                dout = dropout_backward(dout, dp_cache[ri])
            if self.ues_batchnorm:
                dout, dw, db, dgamma, dbeta = affine_bn_relu_backward(dout, fc_mix_cache[ri])
                grads['gamma%d' % layer] = dgamma
                grads['beta%d' % layer] = dbeta
            else:
                dout, dw, db = affine_relu_backward(dout, fc_mix_cache[ri])
            grads['W%d' % layer] = dw + self.reg * self.params['W%d' % layer]
            grads['b%d' % layer] = db
        return loss, grads
import optim
class Solver(object):
    '''
    Train a model on the training split and monitor it on the validation split.

    The solver repeatedly samples mini-batches, asks the model for the loss
    and gradients, and applies an update rule from the ``optim`` module to
    every parameter in ``model.params``.  Accuracy on (a sample of) the
    training and validation data is checked periodically.

    After train() returns, ``model.params`` holds the parameters that
    achieved the best validation accuracy, ``loss_history`` holds the loss
    of every iteration, and ``train_acc_history`` / ``val_acc_history`` hold
    the accuracies recorded at each check.
    '''

    def __init__(self, model, data, **kwargs):
        '''
        :param model: object with a ``params`` dict and a
        ``loss(X, y=None)`` method
        :param data: dict with keys 'X_train', 'y_train', 'X_val', 'y_val'
        :param kwargs: optional settings:
        update_rule: name of a function in optim, default 'sgd_momentum'
        optim_config: dict of hyperparameters given to the update rule
        lr_decay: factor applied to the learning rate after each epoch
        batch_size: mini-batch size, default 100
        num_epochs: number of epochs, default 10
        print_every: print the loss every this many iterations
        verbose: whether to print progress, default True
        :raises ValueError: on unrecognized keyword arguments or when
        update_rule does not name an attribute of optim
        '''
        self.model = model
        self.X_train = data['X_train']
        self.y_train = data['y_train']
        self.X_val, self.y_val = data['X_val'], data['y_val']

        # Pop the known options one by one; whatever is left is an error.
        self.update_rule = kwargs.pop('update_rule', 'sgd_momentum')
        self.optim_config = kwargs.pop('optim_config', {})
        self.lr_decay = kwargs.pop('lr_decay', 1.0)
        self.batch_size = kwargs.pop('batch_size', 100)
        self.num_epochs = kwargs.pop('num_epochs', 10)
        self.print_every = kwargs.pop('print_every', 10)
        self.verbose = kwargs.pop('verbose', True)

        if kwargs:
            extra = ','.join('"%s"' % k for k in kwargs.keys())
            raise ValueError("unrecognized arguments %s" % extra)

        # Replace the rule name by the function itself, with signature
        # update_rule(w, dw, config) -> (next_w, config).
        if not hasattr(optim, self.update_rule):
            raise ValueError('Invalid update_rule "%s" ' % self.update_rule)
        self.update_rule = getattr(optim, self.update_rule)

        self._reset()

    def _reset(self):
        '''
        Reset the book-keeping variables used during optimization.
        '''
        self.epoch = 0
        self.best_val_acc = 0
        self.best_params = {}
        self.loss_history = []
        self.train_acc_history = []
        self.val_acc_history = []

        # Give every parameter its own copy of the base hyperparameters,
        # e.g. {'W1': {'learning_rate': 1e-3}, 'b1': {'learning_rate': 1e-3}},
        # so per-parameter state added by the update rule is not shared.
        self.optim_configs = {
            p: dict(self.optim_config) for p in self.model.params
        }
        # Legacy alias for the originally misspelled attribute name.
        self.optim_confing = self.optim_configs

    def _step(self):
        '''
        Run one forward/backward pass on a random mini-batch and update
        every model parameter once.
        '''
        num_train = self.X_train.shape[0]
        # np.random.choice samples batch_size indices (with replacement).
        batch_mask = np.random.choice(num_train, self.batch_size)
        X_batch = self.X_train[batch_mask]
        y_batch = self.y_train[batch_mask]

        loss, grads = self.model.loss(X_batch, y_batch)
        self.loss_history.append(loss)

        for p, w in self.model.params.items():
            dw = grads[p]
            config = self.optim_configs[p]
            next_w, next_config = self.update_rule(w, dw, config)
            self.model.params[p] = next_w
            # The returned config may carry updated state of the rule.
            self.optim_configs[p] = next_config

    def check_accuracy(self, X, y, num_samples=None, batch_size=100):
        '''
        Compute the accuracy of the model on the given data.

        :param X: array of inputs, first axis indexes the samples
        :param y: array of labels for X
        :param num_samples: if given and smaller than the number of samples,
        evaluate on a random sample of this size instead
        :param batch_size: number of samples scored per model call
        :return: fraction of samples whose predicted class equals the label
        '''
        N = X.shape[0]
        if num_samples is not None and N > num_samples:
            mask = np.random.choice(N, num_samples)
            N = num_samples
            X = X[mask]
            y = y[mask]

        # Ceiling division; at least one call so an empty dataset still
        # produces an (empty) prediction array for np.hstack.
        num_batches = max(1, -(-N // batch_size))
        y_pred = []
        for i in range(num_batches):
            start = i * batch_size
            end = (i + 1) * batch_size
            scores = self.model.loss(X[start:end])
            y_pred.append(np.argmax(scores, axis=1))
        # Flatten the per-batch predictions into a single 1-D array.
        y_pred = np.hstack(y_pred)
        acc = np.mean(y_pred == y)
        return acc

    def train(self):
        '''
        Run the optimization for num_epochs epochs.

        Accuracy is checked on the first iteration, the last iteration and
        at the end of every epoch; the parameters with the best validation
        accuracy are copied and installed into the model at the end.
        '''
        num_train = self.X_train.shape[0]
        iterations_per_epoch = max(num_train // self.batch_size, 1)
        num_iterations = self.num_epochs * iterations_per_epoch

        for t in range(num_iterations):
            # One parameter update; also appends to self.loss_history.
            self._step()

            if self.verbose and t % self.print_every == 0:
                print('(Iteration %d / %d) loss :%f' % (t + 1, num_iterations, self.loss_history[-1]))

            # At the end of every epoch, increment the epoch counter and
            # decay the learning rate of every parameter.
            epoch_end = (t + 1) % iterations_per_epoch == 0
            if epoch_end:
                self.epoch += 1
                for k in self.optim_configs:
                    self.optim_configs[k]['learning_rate'] *= self.lr_decay

            first_it = (t == 0)
            last_it = (t == num_iterations - 1)
            if first_it or last_it or epoch_end:
                train_acc = self.check_accuracy(self.X_train, self.y_train, num_samples=1000)
                val_acc = self.check_accuracy(self.X_val, self.y_val, num_samples=1000)
                self.train_acc_history.append(train_acc)
                self.val_acc_history.append(val_acc)
                if self.verbose:
                    print('Epoch %d / %d train acc: %f; val_acc: %f' % (
                        self.epoch, self.num_epochs, train_acc, val_acc
                    ))
                if val_acc > self.best_val_acc:
                    self.best_val_acc = val_acc
                    self.best_params = {k: v.copy() for k, v in self.model.params.items()}

        # Only swap in the best parameters if a snapshot was ever taken;
        # otherwise the model would be left with an empty parameter dict.
        if self.best_params:
            self.model.params = self.best_params
import tensorflow as tf
import numpy as np
import cifar_read_data_sets
# Load the raw CIFAR-10 data and flatten every image into one row vector.
cifar10_dir = '../cifar/cifar-10-batches-py'
X_train, y_train, X_test, y_test = cifar_read_data_sets.load_CIFAR10(cifar10_dir)
X_train = np.reshape(X_train, [X_train.shape[0], -1])
X_test = np.reshape(X_test, [X_test.shape[0], -1])

# NOTE: the test split is used as the validation split here.
data = {
    'X_train': X_train,
    'y_train': y_train,
    'X_val': X_test,
    'y_val': y_test,
}

# One hidden layer of 100 units with L2 regularization.
model = FullyConnectNet(hidden_dims=[100, ], reg=0.1)
solver = Solver(
    model,
    data,
    update_rule='sgd_momentum',
    optim_config={'learning_rate': 1e-3},
    lr_decay=0.95,  # learning-rate decay factor applied after every epoch
    num_epochs=10,
    batch_size=100,
    print_every=100,
)
solver.train()
# ---- optim.py ----
import numpy as np
def sgd_momentum(w, dw, config=None):
    '''
    Perform one step of stochastic gradient descent with momentum.

    config format:
    - learning_rate: scalar learning rate, default 1e-2
    - momentum: scalar between 0 and 1, default 0.9; a value of 0 reduces
    the rule to plain SGD
    - velocity: numpy array of the same shape as w and dw holding a moving
    average of the gradients; zeros when not present

    :param w: numpy array of current parameter values
    :param dw: numpy array of the gradient of the loss with respect to w
    :param config: dictionary of hyperparameters and state (see above);
    it is updated in place and also returned
    :return: tuple (next_w, config)
    '''
    if config is None:
        config = {}
    config.setdefault('learning_rate', 1e-2)
    config.setdefault('momentum', 0.9)
    v = config.get('velocity', np.zeros_like(w))

    v = config['momentum'] * v - config['learning_rate'] * dw
    # The velocity is added to the parameters, not to the gradient.
    next_w = w + v
    config['velocity'] = v
    return next_w, config
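# NOTE: All of the code above is excerpted from NetEase Cloud (网易云) course material and is reproduced for learning purposes only; please let the author know if it infringes any copyright.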