正则化线性回归
这一部分,我们需要先对一个水库的流出水量以及水库水位进行正则化线性回归。然后将会探讨方差-偏差的问题
数据可视化
import numpy as np
import pandas as pd
import scipy.io as sio
import scipy.optimize as opt
import matplotlib.pyplot as plt
import seaborn as sns
# Load the MATLAB-format dataset and flatten every array to 1-D.
raw = sio.loadmat('ex5data1.mat')
X, y, Xval, yval, Xtest, ytest = (
    raw[k].ravel() for k in ('X', 'y', 'Xval', 'yval', 'Xtest', 'ytest')
)
X.shape, y.shape, Xval.shape, yval.shape, Xtest.shape, ytest.shape
((12,), (12,), (21,), (21,), (21,), (21,))
# Visualize the training set: water level (x) against outflow (y).
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(X, y)
ax.set(xlabel='water level', ylabel='flow')
plt.show()
正则化线性回归代价函数
theta初始值为[1,1],输出应该为303.993
# Prepend a bias column of ones to each design matrix (m,) -> (m, 2).
X, Xval, Xtest = [
    np.insert(arr[:, None], 0, np.ones(len(arr)), axis=1)
    for arr in (X, Xval, Xtest)
]
def cost(theta, X, y):
    """Unregularized linear-regression cost: (1 / 2m) * ||X @ theta - y||^2.

    theta : (n,) parameter vector
    X     : (m, n) design matrix
    y     : (m,) target vector
    """
    n_samples = X.shape[0]
    residual = X @ theta - y
    return residual @ residual / (2 * n_samples)
def costReg(theta, X, y, learning_rate):
    """Regularized linear-regression cost.

    theta         : (n,) parameter vector
    X             : (m, n) design matrix (m samples, n features)
    y             : (m,) target vector
    learning_rate : regularization strength (lambda)

    The bias term theta[0] is excluded from the penalty, per the
    standard regularization formula.
    """
    n_samples = X.shape[0]
    residual = X @ theta - y
    base = residual @ residual / (2 * n_samples)
    penalty = learning_rate * (theta[1:] @ theta[1:]) / (2 * n_samples)
    return base + penalty
# Sanity check: with theta all ones and lambda = 1 the cost
# should come out near 303.993.
theta = np.ones(X.shape[1])  # initialize every parameter to 1
learning_rate = 1
costReg(theta, X, y, learning_rate)
303.9931922202643
正则化线性回归梯度
设定初始值为[1,1],输出应该为[-15.30, 598.250]
def gradientReg(theta, X, y, learning_rate):
    """Gradient of the regularized linear-regression cost.

    theta         : (n,) parameter vector
    X             : (m, n) design matrix, first column assumed to be the bias
    y             : (m,) target vector
    learning_rate : regularization strength (lambda)

    Returns the (n,) gradient vector. The bias term theta[0] is not
    regularized, so only theta[1:] contribute to the penalty term.
    """
    m = X.shape[0]
    # Out-of-place division: the original in-place `grad /= m` raises a
    # casting TypeError when X, theta and y are all integer-dtype, because
    # int / int yields floats that cannot be written back into an int array.
    grad = X.T @ (X @ theta - y) / m  # (n, m) @ (m,) -> (n,)
    # (learning_rate / m) * theta allocates a fresh float array, so zeroing
    # its first entry cannot mutate the caller's theta.
    reg = (learning_rate / m) * theta
    reg[0] = 0.0  # bias term is excluded from regularization
    return grad + reg
gradientReg(theta