1. Prerequisite knowledge
1. Bias and variance
Intuitively: the red bullseye center is the true value, and the blue dots are the predicted values.
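In this picture, bias measures how far the cloud of blue dots sits from the red center on average, while variance measures how scattered the dots are. For squared error, that intuition is captured by the standard decomposition (where $\sigma_\varepsilon^2$ is the irreducible noise):

$$\mathbb{E}\left[(y - \hat{f}(x))^2\right] = \underbrace{\left(\mathbb{E}[\hat{f}(x)] - f(x)\right)^2}_{\text{bias}^2} + \underbrace{\mathbb{E}\left[\left(\hat{f}(x) - \mathbb{E}[\hat{f}(x)]\right)^2\right]}_{\text{variance}} + \sigma_\varepsilon^2$$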
2. Training set, validation set, and test set
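In brief: the training set fits the model parameters, the validation set selects hyperparameters such as the regularization strength λ, and the test set is reserved for a final, unbiased estimate of generalization error.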
2. The problem
Problem: use changes in a reservoir's water level to predict the amount of water flowing out of the dam.
3. The solution
Step 1: fit a trained linear regression line
1. Load the data:
import numpy as np
import scipy.io as sio
import scipy.optimize as so
import matplotlib.pyplot as plt

# assumed: 'ex5data1.mat' is the exercise's MATLAB data file; adjust the path if yours differs
path = 'ex5data1.mat'
data = sio.loadmat(path)
print(data.keys())
2. Build the training, validation, and test sets
# training set
X_train, y_train = data['X'], data['y']
print(X_train.shape, y_train.shape)
# validation set
X_val, y_val = data['Xval'], data['yval']
print(X_val.shape, y_val.shape)
# test set
X_test, y_test = data['Xtest'], data['ytest']
print(X_test.shape, y_test.shape)
3. Add a bias term to each of the three sets
# insert a column of ones into X as the bias term
X_train = np.insert(X_train, 0, values=1, axis=1)
X_val = np.insert(X_val, 0, values=1, axis=1)
X_test = np.insert(X_test, 0, values=1, axis=1)
4. Draw a scatter plot to visualize the data
# visualize the data
def plot_data(X_train, y_train):
    fig, ax = plt.subplots()
    ax.scatter(X_train[:, 1], y_train)
    ax.set(xlabel='change in water level (x)',
           ylabel='water flowing out of the dam (y)')
    # plt.show()
plot_data(X_train, y_train)
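A deliberate design choice: plt.show() stays commented out inside plot_data, so the function leaves the axes open; step 8 can then overlay the fitted regression line on the same figure before showing it.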
5. Define the regularized cost function
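The function below implements the standard regularized linear regression cost, with m the number of examples; note that the bias term $\theta_0$ is not regularized:

$$J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}\left(\theta^{T}x^{(i)} - y^{(i)}\right)^{2} + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_{j}^{2}$$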
def reg_cost(theta, X, y, lamda):
    # squared-error term
    cost = np.sum(np.power((X @ theta - y.flatten()), 2))
    # regularization term; theta[0] (the bias) is not penalized
    reg = theta[1:] @ theta[1:] * lamda
    return (cost + reg) / (2 * len(X))

# sanity check
theta = np.ones(X_train.shape[1])
lamda = 1
print(reg_cost(theta, X_train, y_train, lamda))
6. Define the regularized gradient function
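The gradient the next function implements is, component-wise:

$$\frac{\partial J}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\left(\theta^{T}x^{(i)} - y^{(i)}\right)x_j^{(i)} + \frac{\lambda}{m}\theta_j \quad (j \geq 1),$$

with the $\lambda$ term dropped for $j = 0$.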
# gradient of the regularized cost
def reg_gradient(theta, X, y, lamda):
    grad = (X @ theta - y.flatten()) @ X
    reg = lamda * theta
    reg[0] = 0  # do not regularize the bias term
    return (grad + reg) / len(X)

print(reg_gradient(theta, X_train, y_train, lamda))
7. Train and optimize
# train: minimize the regularized cost
def train_model(X, y, lamda):
    theta = np.ones(X.shape[1])
    res = so.minimize(fun=reg_cost,
                      x0=theta,
                      args=(X, y, lamda),
                      jac=reg_gradient,
                      method='TNC')
    return res.x

theta_final = train_model(X_train, y_train, lamda=0)
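Two notes on this setup: passing the analytic gradient through jac saves minimize from approximating derivatives numerically, and 'TNC' (truncated Newton) suits this smooth, low-dimensional problem; lamda=0 means this first fit is plain, unregularized linear regression.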
8. Visualize the optimized fit
plot_data(X_train, y_train)
plt.plot(X_train[:, 1], X_train @ theta_final, c='r')
plt.show()
theta is initialized to all ones; minimize from scipy.optimize then drives the cost down along the supplied gradient, yielding the optimized theta_final.
Result: (scatter plot of the training data with the fitted line drawn in red)
9. Key step: grow the training set one example at a time starting from 1, fit the model each time, compute the costs, and plot how the training, validation, and test costs evolve
# task: train on 1, 2, ..., m examples and compare how the training and
# validation costs change as the training set grows
def plot_learning_curve(X_train, y_train, X_val, y_val, X_test, y_test, lamda):
    x = range(1, len(X_train) + 1)
    training_costs = []
    cv_costs = []
    test_costs = []
    for i in x:
        # fit using only the first i training examples
        res = train_model(X_train[:i, :], y_train[:i, :], lamda)
        # evaluate on the i-example training subset and the full val/test sets
        training_cost_i = reg_cost(res, X_train[:i, :], y_train[:i, :], lamda)
        cv_cost = reg_cost(res, X_val, y_val, lamda)
        test_cost = reg_cost(res, X_test, y_test, lamda)
        training_costs.append(training_cost_i)
        cv_costs.append(cv_cost)
        test_costs.append(test_cost)
    plt.plot(x, training_costs, label='training cost', c='r')
    plt.plot(x, cv_costs, label='cv cost', c='g')
    plt.plot(x, test_costs, label='test cost', c='b')
    plt.legend()
    plt.xlabel('number of training examples')
    plt.ylabel('cost')
    plt.show()
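To produce the curves, call the function on the full splits; lamda=0 here is an assumption, chosen to match the unregularized fit from step 7:

plot_learning_curve(X_train, y_train, X_val, y_val, X_test, y_test, lamda=0)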
Result: (learning curves of training, validation, and test cost versus the number of training examples)
As the number of training examples increases, the validation and test costs fall while the training cost rises, and the two eventually level off near the same value. Training and validation error converging like this is the signature of high bias (underfitting) in the sense of section 1: a straight line is too simple for this data, so adding more examples alone will not reduce the error much.