Regularized Linear Regression and Bias vs. Variance
Code Walkthrough
In this exercise, we implement regularized linear regression and use it to study models with different bias-variance properties.
First, import the required libraries.
import numpy as np
import matplotlib.pyplot as plt
import scipy.io #Used to load the OCTAVE *.mat files
import scipy.optimize #fmin_cg to train the linear regression
import warnings
warnings.filterwarnings('ignore')  # suppress warnings
%matplotlib inline
Implementing regularized linear regression
Load the data
datafile = 'data/ex5data1.mat'
mat = scipy.io.loadmat( datafile )
# training set
X, y = mat['X'], mat['y']
# cross-validation set
Xval, yval = mat['Xval'], mat['yval']
# test set
Xtest, ytest = mat['Xtest'], mat['ytest']
# prepend a column of ones (the intercept term) to each dataset
X = np.insert(X ,0,1,axis=1)
Xval = np.insert(Xval ,0,1,axis=1)
Xtest = np.insert(Xtest,0,1,axis=1)
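As a quick sanity check (an optional addition, not in the original exercise code), we can print the shapes after inserting the bias column; the training set in ex5data1.mat has 12 examples:
# Quick shape check: each design matrix now has a leading column of ones
print(X.shape, y.shape)          # training set: 12 examples
print(Xval.shape, yval.shape)    # cross-validation set
print(Xtest.shape, ytest.shape)  # test set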
Visualize the data
def plotData():
    plt.figure(figsize=(8,5))
    plt.ylabel('Water flowing out of the dam (y)')
    plt.xlabel('Change in water level (x)')
    plt.plot(X[:,1],y,'rx')  # X[:,1] is the raw feature (column 1); column 0 is the bias
    plt.grid(True)
plotData()
Next we write the cost function.
First, the hypothesis function:
# Compute the hypothesis values for all training examples
def h(theta,X):  # linear hypothesis function
    return np.dot(X,theta)
Then the cost function.
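For reference, the regularized cost that the function below implements is (the bias term \theta_0 is excluded from the regularization sum):

J(\theta) = \frac{1}{2m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)^2 + \frac{\lambda}{2m}\sum_{j=1}^{n}\theta_j^2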
def computeCost(mytheta,myX,myy,mylambda=0.):  # regularized cost function
    m = myX.shape[0]  # size of the training set
    myh = h(mytheta,myX).reshape((m,1))
    mycost = float((1./(2*m)) * np.dot((myh-myy).T,(myh-myy)))
    # regularization term (theta[0] is excluded)
    regterm = (float(mylambda)/(2*m)) * float(mytheta[1:].T.dot(mytheta[1:]))
    return mycost + regterm
Test:
# expect an output of 303.993192
# initialize parameters theta
mytheta = np.array([[1.],[1.]])
# compute the cost
print(computeCost(mytheta,X,y,mylambda=1.))
Output: 303.9931922202643
Next, we write the function that computes the gradient used by the optimizer.
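For reference, the regularized gradient the function below computes is:

\frac{\partial J}{\partial \theta_0} = \frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_0^{(i)}

\frac{\partial J}{\partial \theta_j} = \frac{1}{m}\sum_{i=1}^{m}\left(h_\theta(x^{(i)}) - y^{(i)}\right)x_j^{(i)} + \frac{\lambda}{m}\theta_j \qquad (j \ge 1)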
# Compute the gradient (partial derivatives); the bias unit is not regularized
def computeGradient(mytheta,myX,myy,mylambda=0.):
    mytheta = mytheta.reshape((mytheta.shape[0],1))
    m = myX.shape[0]
    # grad has the same shape as mytheta (2x1 here)
    myh = h(mytheta,myX).reshape((m,1))
    grad = (1./float(m))*myX.T.dot(myh-myy)
    regterm = (float(mylambda)/m)*mytheta
    regterm[0] = 0  # do not regularize the bias term
    regterm = regterm.reshape((grad.shape[0],1))
    return grad + regterm
# Flatten the (n,1) gradient into a 1-D array, as fmin_cg's fprime expects
def computeGradientFlattened(mytheta,myX,myy,mylambda=0.):
    return computeGradient(mytheta,myX,myy,mylambda).flatten()
"""
flattened的作用
[[-15.30301567] --> [-15.30301567 598.16741084]
[598.25074417]]
"""
Test:
# expect a gradient of [-15.303016; 598.250744] (with lambda=1)
# initialize parameters
mytheta = np.array([[1.],[1.]])
# compute the gradient
print(computeGradient(mytheta,X,y,1.))
Output:
[[-15.30301567]
 [598.25074417]]
Now we write the function that optimizes theta.
# optimize the parameters theta
def optimizeTheta(myTheta_initial, myX,myy,mylambda=0.,print_output=True):
    fit_theta = scipy.optimize.fmin_cg(computeCost,x0=myTheta_initial,
                                       fprime=computeGradientFlattened,
                                       args=(myX,myy,mylambda),
                                       disp=print_output,
                                       epsilon=1.49e-12,
                                       maxiter=1000)
    fit_theta = fit_theta.reshape((myTheta_initial.shape[0],1))
    return fit_theta
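As an aside, the same fit can be expressed through the newer scipy.optimize.minimize interface with the same cost/gradient pair. The sketch below is only an illustration of that design choice; optimizeThetaMinimize is a hypothetical helper and is not used elsewhere in this walkthrough.
# Hypothetical alternative sketch using scipy.optimize.minimize with the CG method
def optimizeThetaMinimize(theta_init, myX, myy, mylambda=0.):
    res = scipy.optimize.minimize(computeCost, x0=theta_init.flatten(),
                                  jac=computeGradientFlattened,
                                  args=(myX, myy, mylambda),
                                  method='CG', options={'maxiter': 1000})
    return res.x.reshape((theta_init.shape[0], 1))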
Test:
mytheta = np.array([[1.],[1.]])
fit_theta = optimizeTheta(mytheta,X,y,0.)
print(fit_theta)
Output:
Optimization terminated successfully.
Current function value: 22.373906
Iterations: 18
Function evaluations: 28
Gradient evaluations: 28
[[13.08790734]
[ 0.36777925]]
The optimization converged successfully.
Using the optimized theta, plot the fitted linear model:
plotData()
plt.plot(X[:,1],h(fit_theta,X).flatten())
We can see that this model underfits the data.
Analyzing bias vs. variance
First, plot the learning curve for this model:
def plotLearningCurve():
    initial_theta = np.array([[1.],[1.]])
    mym, error_train, error_val = [], [], []
    for x in range(1,13,1):
        train_subset = X[:x,:]
        y_subset = y[:x]
        mym.append(y_subset.shape[0])
        # optimize theta on the first x training examples
        fit_theta = optimizeTheta(initial_theta,train_subset,y_subset,mylambda=0.,print_output=False)
        # training-set error
        error_train.append(computeCost(fit_theta,train_subset,y_subset,mylambda=0.))
        # cross-validation error
        error_val.append(computeCost(fit_theta,Xval,yval,mylambda=0.))
    plt.figure(figsize=(8,5))
    plt.plot(mym,error_train,label='Train')
    plt.plot(mym,error_val,label='Cross Validation')
    plt.legend()
    plt.title('Learning curve for linear regression')
    plt.xlabel('Number of training examples')
    plt.ylabel('Error')
    plt.grid(True)
# "You can observe that both the train error and cross validation error are high
#  when the number of training examples is increased. This reflects a high bias
#  problem in the model – the linear regression model is too simple and is unable
#  to fit our dataset well."
plotLearningCurve()
The plot shows that as the number of training examples grows, both Jtrain (training error) and Jcv (cross-validation error) remain high. This indicates a high-bias problem: the linear model is too simple to fit the data well.
So we turn to a polynomial regression model.
First, we need to transform the training set.
# Add higher-order polynomial features to X
def genPolyFeatures(myX,p):
    newX = myX.copy()
    # append p polynomial columns (powers 2 .. p+1 of the original feature)
    for i in range(p):
        dim = i+2
        newX = np.insert(newX,newX.shape[1],np.power(newX[:,1],dim),axis=1)
    return newX
# Normalize the features (zero mean, unit standard deviation)
def featureNormalize(myX):
    Xnorm = myX.copy()
    stored_feature_means = np.mean(Xnorm,axis=0)  # column-by-column
    # skip the first (bias) column
    Xnorm[:,1:] = Xnorm[:,1:] - stored_feature_means[1:]
    stored_feature_stds = np.std(Xnorm,axis=0,ddof=1)  # ddof=1 gives the unbiased sample standard deviation
    Xnorm[:,1:] = Xnorm[:,1:] / stored_feature_stds[1:]
    return Xnorm, stored_feature_means, stored_feature_stds
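A minimal sanity check of how the two helpers compose (the toy array below is hypothetical, not part of the exercise data):
# Hypothetical toy example: three samples, bias column already prepended
toyX = np.insert(np.array([[1.],[2.],[3.]]), 0, 1, axis=1)    # shape (3, 2)
toy_poly = genPolyFeatures(toyX, 2)                           # adds x^2 and x^3 -> shape (3, 4)
toy_norm, toy_means, toy_stds = featureNormalize(toy_poly)    # bias column left untouched
print(toy_poly.shape, toy_norm.shape)                         # (3, 4) (3, 4)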
Now train the new model:
global_d = 5
# add higher-order features to X
newX = genPolyFeatures(X,global_d)
# normalize X
newX_norm, stored_means, stored_stds = featureNormalize(newX)
# initialize parameters
mytheta = np.ones((newX_norm.shape[1],1))
# optimize theta
fit_theta = optimizeTheta(mytheta,newX_norm,y,0.)
Output:
Optimization terminated successfully.
Current function value: 0.198053
Iterations: 158
Function evaluations: 300
Gradient evaluations: 300
Let's see how the new model fits:
def plotFit(fit_theta,means,stds):
    n_points_to_plot = 50
    xvals = np.linspace(-55,55,n_points_to_plot)  # 50 evenly spaced points from -55 to 55
    # column vector of ones, shape (50, 1)
    xmat = np.ones((n_points_to_plot,1))
    # append xvals as a new column
    xmat = np.insert(xmat,xmat.shape[1],xvals.T,axis=1)
    # add the higher-order features
    xmat = genPolyFeatures(xmat,len(fit_theta)-2)
    # normalize using the stored training-set means and stds
    xmat[:,1:] = xmat[:,1:] - means[1:]
    xmat[:,1:] = xmat[:,1:] / stds[1:]
    plotData()
    plt.plot(xvals,h(fit_theta,xmat),'b--')
plotFit(fit_theta,stored_means,stored_stds)
We can see that this model overfits: the curve dips down for large x and does not generalize well.
Let's look at the learning curve for the polynomial model.
# learning curve for the polynomial fit
def plotPolyLearningCurve(mylambda=0.):
    # initialize parameters
    initial_theta = np.ones((global_d+2,1))
    mym, error_train, error_val = [], [], []
    # add higher-order features to Xval and normalize, giving myXval
    myXval, dummy1, dummy2 = featureNormalize(genPolyFeatures(Xval,global_d))
    for x in range(1,13,1):
        train_subset = X[:x,:]
        y_subset = y[:x]
        mym.append(y_subset.shape[0])
        # transform the training subset
        train_subset = genPolyFeatures(train_subset,global_d)
        train_subset, dummy1, dummy2 = featureNormalize(train_subset)
        # optimize theta
        fit_theta = optimizeTheta(initial_theta,train_subset,y_subset,mylambda=mylambda,print_output=False)
        # compute Jtrain and Jcv
        error_train.append(computeCost(fit_theta,train_subset,y_subset,mylambda=mylambda))
        error_val.append(computeCost(fit_theta,myXval,yval,mylambda=mylambda))
    plt.figure(figsize=(8,5))
    plt.plot(mym,error_train,label='Train')
    plt.plot(mym,error_val,label='Cross Validation')
    plt.legend()
    plt.title('Polynomial Regression Learning Curve (lambda = {})'.format(mylambda))
    plt.xlabel('Number of training examples')
    plt.ylabel('Error')
    plt.ylim([0,150])
    plt.grid(True)
plotPolyLearningCurve()
The learning curve shows a low Jtrain but a high Jcv. The gap between Jtrain and Jcv indicates a high-variance problem: as the training set grows, Jtrain stays roughly constant while Jcv decreases, which shows the model fits the training data very closely.
Finally, let's look at how different regularization parameters λ affect the model's bias-variance tradeoff.
First, the polynomial model with λ=1:
# Try lambda = 1
# initialize parameters
mytheta = np.zeros((newX_norm.shape[1],1))
# optimize theta
fit_theta = optimizeTheta(mytheta,newX_norm,y,1)
# plot the fitted curve
plotFit(fit_theta,stored_means,stored_stds)
# plot the learning curve
plotPolyLearningCurve(1.)
Output:
Current function value: 8.042488
Iterations: 5
Function evaluations: 75
Gradient evaluations: 64
The model fits well.
Both Jcv and Jtrain converge to relatively low values, showing that the λ=1 regularized polynomial model has neither a high-bias nor a high-variance problem, so the λ=1 polynomial model is a good choice.
Next, let's see how the polynomial model fits with λ=100:
#Try Lambda = 100
mytheta = np.random.rand(newX_norm.shape[1],1)
# optimize theta with regularization parameter lambda = 100
fit_theta = optimizeTheta(mytheta,newX_norm,y,100.)
plotFit(fit_theta,stored_means,stored_stds)
Output:
Current function value: 132.037071
Iterations: 0
Function evaluations: 41
Gradient evaluations: 29
Because the regularization penalty is too strong, the model now has a high-bias problem and the fit is quite poor.
Finally, let's plot Jtrain and Jval as functions of λ.
#lambdas = [0., 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1., 3., 10.]
lambdas = np.linspace(0,10,100)
errors_train, errors_val = [], []
for mylambda in lambdas:
    # transform and normalize the training set
    newXtrain = genPolyFeatures(X,global_d)
    newXtrain_norm, dummy1, dummy2 = featureNormalize(newXtrain)
    # transform and normalize the validation set
    newXval = genPolyFeatures(Xval,global_d)
    newXval_norm, dummy1, dummy2 = featureNormalize(newXval)
    init_theta = np.ones((newX_norm.shape[1],1))
    fit_theta = optimizeTheta(init_theta,newXtrain_norm,y,mylambda,False)
    errors_train.append(computeCost(fit_theta,newXtrain_norm,y,mylambda=mylambda))
    errors_val.append(computeCost(fit_theta,newXval_norm,yval,mylambda=mylambda))
plt.figure(figsize=(8,5))
plt.plot(lambdas,errors_train,label='Train')
plt.plot(lambdas,errors_val,label='Cross Validation')
plt.legend()
plt.xlabel('lambda')
plt.ylabel('Error')
plt.grid(True)
When λ is small, the model tends to overfit: Jtrain is small but Jcv is large.
When λ is large, the model underfits: both Jcv and Jtrain are large.
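A natural follow-up (a sketch, not part of the original code): pick the λ that minimizes the cross-validation error, refit, and report the error on the held-out test set without regularization. Note that the test set is normalized with its own statistics here, matching how Xval is handled above.
# Choose the lambda with the lowest cross-validation error
best_lambda = lambdas[np.argmin(errors_val)]
print('best lambda:', best_lambda)
# Refit on the full (transformed, normalized) training set with best_lambda,
# then report the unregularized error on the test set
newXtrain_norm, dummy1, dummy2 = featureNormalize(genPolyFeatures(X,global_d))
newXtest_norm, dummy1, dummy2 = featureNormalize(genPolyFeatures(Xtest,global_d))
fit_theta = optimizeTheta(np.ones((newXtrain_norm.shape[1],1)),newXtrain_norm,y,best_lambda,False)
print('test error:', computeCost(fit_theta,newXtest_norm,ytest,mylambda=0.))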
Dataset
ex5data1.mat
To save some effort, you can find the dataset on Kaggle.