题目
实现正则化线性回归,并使用它来研究具有不同偏差-方差属性的模型。
在练习的前半部分,您将使用水库水位的变化实现正则化线性回归来预测大坝的出水量。在下半部分中,您将对调试学习算法进行诊断,并检查偏差和方差的影响。
本次的数据是以.mat格式储存的,x表示水位的变化,y表示大坝的出水量。数据集共分为三部分:训练集(X, y)、交叉验证集(Xval, yval)和测试集(Xtest, ytest)。
原始数据
# Load the MATLAB-format dataset; each array keeps its saved 2-D shape.
data=sio.loadmat('ex5data1.mat')
X_train,y_train=data['X'],data['y']# training set, shape (12, 1)
X_val,y_val=data['Xval'],data['yval']# cross-validation set, shape (21, 1)
X_test,y_test=data['Xtest'],data['ytest']# test set, shape (21, 1)
# Prepend a bias column of ones to each design matrix.
X_train=np.insert(X_train,0,1,axis=1)
X_val=np.insert(X_val,0,1,axis=1)
X_test=np.insert(X_test,0,1,axis=1)
def plot_data():
    """Scatter-plot the training data: water level vs. dam outflow."""
    fig, ax = plt.subplots()
    ax.scatter(X_train[:, 1], y_train)  # column 0 is the bias; column 1 is the raw feature
    ax.set(xlabel='water level',
           ylabel='water flowing out')
    plt.show()


plot_data()
显示不同lmd取值下的学习曲线
lmd=0
过拟合
lmd=1
lmd=100
欠拟合
不同lmd取值下的损失
通过比较选取lmd=3为最佳
完整代码
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.io as sio
from scipy.optimize import minimize
# Load the MATLAB-format dataset; each array keeps its saved 2-D shape.
data=sio.loadmat('ex5data1.mat')
X_train,y_train=data['X'],data['y']# training set, shape (12, 1)
X_val,y_val=data['Xval'],data['yval']# cross-validation set, shape (21, 1)
X_test,y_test=data['Xtest'],data['ytest']# test set, shape (21, 1)
# Prepend a bias column of ones to each design matrix.
X_train=np.insert(X_train,0,1,axis=1)
X_val=np.insert(X_val,0,1,axis=1)
X_test=np.insert(X_test,0,1,axis=1)
def plot_data():
    """Scatter-plot the training data: water level vs. dam outflow."""
    fig, ax = plt.subplots()
    ax.scatter(X_train[:, 1], y_train)  # column 0 is the bias; column 1 is the raw feature
    ax.set(xlabel='water level',
           ylabel='water flowing out')
    plt.show()


# plot_data()
def reg_cost(theta, X, y, lamda):
    """Regularized squared-error cost for linear regression.

    theta : 1-D parameter vector (bias first).
    X     : (m, n) design matrix whose first column is all ones.
    y     : (m, 1) target column vector; flattened so the residual stays 1-D.
    lamda : L2 regularization strength; the bias theta[0] is never penalized.
    """
    m = len(X)
    residual = X @ theta - y.flatten()          # y arrives as (m, 1) from loadmat
    sq_error = np.sum(residual ** 2)
    penalty = lamda * np.sum(theta[1:] ** 2)    # skip the intercept term
    return (sq_error + penalty) / (2 * m)
def reg_gradient(theta, X, y, lamda):
    """Gradient of reg_cost with respect to theta.

    Returns a 1-D array of the same length as theta; the intercept
    component carries no regularization term.
    """
    m = len(X)
    error = X @ theta - y.flatten()   # flatten y so the product stays 1-D
    penalty = lamda * theta
    penalty[0] = 0                    # never regularize the intercept
    return (error @ X + penalty) / m
def train_model(X, y, lmd):
    """Fit regularized linear regression and return the optimized theta.

    Uses scipy's TNC solver with the analytic gradient; theta is
    initialized to all ones.
    """
    initial_theta = np.ones(X.shape[1])
    result = minimize(fun=reg_cost, x0=initial_theta, args=(X, y, lmd),
                      method='TNC', jac=reg_gradient)
    return result.x  # .x holds the fitted parameter vector
#theta_final=train_model(X_train,y_train,lmd=0)
#print(theta_final)[13.08790362 0.36777923]
#
# fig,ax=plt.subplots()
# plt.scatter(X_train[:,1],y_train,c='r',marker='x')
# plt.xlabel('Change in water level(x)')
# plt.ylabel('Water flowing out of the dam(y)')
# plt.plot(X_train[:,1],X_train@theta_final,c='b')
# plt.show()
def plot_learining_curve(X_train, y_train, X_val, y_val, lmd):
    """Plot learning curves: training and CV cost vs. training-set size.

    For each subset size i = 1..m, a model is fit on the first i training
    examples; its cost is then evaluated on that subset and on the FULL
    cross-validation set.

    Bug fix: the original used `range(1, len(X_train + 1))` — the `+ 1`
    was applied element-wise to the array, leaving the length unchanged,
    so the final subset size m was silently skipped.  The correct bound
    is `len(X_train) + 1`.
    """
    train_cost = []
    cv_cost = []
    sizes = range(1, len(X_train) + 1)  # subset sizes 1..m inclusive
    for i in sizes:
        theta = train_model(X_train[:i, :], y_train[:i, :], lmd)
        train_cost.append(reg_cost(theta, X_train[:i, :], y_train[:i, :], lmd))
        cv_cost.append(reg_cost(theta, X_val, y_val, lmd))  # always the whole CV set
    plt.plot(sizes, train_cost, label='training cost')
    plt.plot(sizes, cv_cost, label='cv cost')
    plt.legend(loc=1)
    plt.xlabel('number of training examples')
    plt.ylabel('costs')
    plt.show()


# Without regularization the plain linear model underfits (high bias):
#plot_learining_curve(X_train,y_train,X_val,y_val,lmd=0)
def poly_feature(X, power):
    """Append powers 2..power of column 1 (the raw feature) as new columns.

    Column 0 (the bias) and column 1 (the first-degree feature) are left
    untouched; for power < 2 the input is returned unchanged.
    """
    result = X
    for degree in range(2, power + 1):
        new_col = np.power(result[:, 1], degree)
        result = np.insert(result, result.shape[1], new_col, axis=1)
    return result
def get_standard(X):
    """Return the per-column (axis=0) means and standard deviations of X.

    NOTE: the original comment claimed row-wise computation; axis=0
    actually reduces over rows, yielding one statistic per column.
    """
    column_means = np.mean(X, axis=0)
    column_stds = np.std(X, axis=0)   # population std (ddof=0), numpy default
    return column_means, column_stds
def feature_normalize(X, means, stds):
    """Return a z-score-normalized copy of X, leaving the bias column intact.

    Columns 1..n are scaled as (x - mean) / std using the supplied
    statistics (index 0 of means/stds corresponds to the bias column and
    is skipped).

    Fix: the original normalized X in place, silently mutating the
    caller's array (and truncating values if X had an integer dtype).
    Working on a float copy removes that side effect while keeping the
    return value identical for float input.
    """
    X = np.array(X, dtype=float)  # defensive copy; also promotes int input
    X[:, 1:] = (X[:, 1:] - means[1:]) / stds[1:]
    return X
# Polynomial regression pipeline: map the single water-level feature to
# powers 1..6 so the linear model can fit a non-linear curve, then
# z-score every non-bias column using statistics from the TRAINING set
# only (the CV and test sets must reuse the same means/stds).
power = 6
X_train_poly = poly_feature(X_train, power)
X_val_poly = poly_feature(X_val, power)
X_test_poly = poly_feature(X_test, power)
train_means, train_stds = get_standard(X_train_poly)
X_train_norm = feature_normalize(X_train_poly, train_means, train_stds)
X_val_norm = feature_normalize(X_val_poly, train_means, train_stds)
X_test_norm = feature_normalize(X_test_poly, train_means, train_stds)
# Fit without regularization first to inspect the unconstrained curve.
theta_fit = train_model(X_train_norm, y_train, lmd=0)
# print(theta_fit)
def plot_poly_fit():
    """Overlay the fitted polynomial curve on the training-data scatter."""
    plot_data()
    grid = np.linspace(-60, 60, 100)               # evaluation grid of water levels
    design = grid.reshape(100, 1)
    design = np.insert(design, 0, 1, axis=1)       # bias column of ones
    design = poly_feature(design, power)
    # Apply the SAME training-set statistics used to fit theta_fit.
    design = feature_normalize(design, train_means, train_stds)
    plt.plot(grid, design @ theta_fit, 'r--')      # dashed red fitted curve
    plt.show()
# plot_poly_fit()
# Learning curves of the 6th-degree model under different regularization:
#   lmd = 0   -> high variance (overfitting)
plot_learining_curve(X_train_norm, y_train, X_val_norm, y_val, lmd=0)
#   lmd = 1   -> reasonable trade-off
plot_learining_curve(X_train_norm, y_train, X_val_norm, y_val, lmd=1)
#   lmd = 100 -> high bias (underfitting)
plot_learining_curve(X_train_norm, y_train, X_val_norm, y_val, lmd=100)
# Model selection: pick the lambda that minimizes cross-validation error.
lamdas = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]
training_cost = []
cv_cost = []
for lmd in lamdas:
    theta = train_model(X_train_norm, y_train, lmd)
    # Costs are reported with lamda=0 so they stay comparable across lambdas.
    training_cost.append(reg_cost(theta, X_train_norm, y_train, lamda=0))
    cv_cost.append(reg_cost(theta, X_val_norm, y_val, lamda=0))
plt.plot(lamdas, training_cost, label='training cost')  # train loss per lambda
plt.plot(lamdas, cv_cost, label='cv cost')
plt.legend()
plt.show()
bestLamda = lamdas[np.argmin(cv_cost)]
print(bestLamda)  # 3
theta_best = train_model(X_train_norm, y_train, bestLamda)
print(reg_cost(theta_best, X_test_norm, y_test, lamda=0))  # ~4.3976 test error
plot_learining_curve(X_train_norm, y_train, X_val_norm, y_val, lmd=3)