Programming Exercise 5: Regularized Linear Regression and Bias vs. Variance
Python版本3.6
运行环境:Anaconda Jupyter Notebook
链接:实验数据和实验指导书
提取码:i7co
本章课程笔记部分见:应用机器学习的建议(Advice for Applying Machine Learning)
在本练习中,我们将实现正则化线性回归,并用它来研究具有不同偏差和方差特性的模型
%matplotlib inline
#IPython的内置magic函数,可以省掉plt.show(),在其他IDE中是不会支持的
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid",color_codes=True)
import scipy.io as sio
import scipy.optimize as opt
from sklearn.metrics import classification_report#这个包是评价报告
加载数据集和可视化
# Read the MATLAB data file into a dict that maps variable names to arrays.
data = sio.loadmat('ex5data1.mat')
# Bare expression: as the last line of a notebook cell it displays the dict.
data
{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Fri Nov 4 22:27:26 2011',
'__version__': '1.0',
'__globals__': [],
'X': array([[-15.93675813],
[-29.15297922],
[ 36.18954863],
[ 37.49218733],
[-48.05882945],
[ -8.94145794],
[ 15.30779289],
[-34.70626581],
[ 1.38915437],
[-44.38375985],
[ 7.01350208],
[ 22.76274892]]),
'y': array([[ 2.13431051],
[ 1.17325668],
[34.35910918],
[36.83795516],
[ 2.80896507],
[ 2.12107248],
[14.71026831],
[ 2.61418439],
[ 3.74017167],
[ 3.73169131],
[ 7.62765885],
[22.7524283 ]]),
'Xtest': array([[-33.31800399],
[-37.91216403],
[-51.20693795],
[ -6.13259585],
[ 21.26118327],
[-40.31952949],
[-14.54153167],
[ 32.55976024],
[ 13.39343255],
[ 44.20988595],
[ -1.14267768],
[-12.76686065],
[ 34.05450539],
[ 39.22350028],
[ 1.97449674],
[ 29.6217551 ],
[-23.66962971],
[ -9.01180139],
[-55.94057091],
[-35.70859752],
[ 9.51020533]]),
'ytest': array([[ 3.31688953],
[ 5.39768952],
[ 0.13042984],
[ 6.1925982 ],
[17.08848712],
[ 0.79950805],
[ 2.82479183],
[28.62123334],
[17.04639081],
[55.38437334],
[ 4.07936733],
[ 8.27039793],
[31.32355102],
[39.15906103],
[ 8.08727989],
[24.11134389],
[ 2.4773548 ],
[ 6.56606472],
[ 6.0380888 ],
[ 4.69273956],
[10.83004606]]),
'Xval': array([[-16.74653578],
[-14.57747075],
[ 34.51575866],
[-47.01007574],
[ 36.97511905],
[-40.68611002],
[ -4.47201098],
[ 26.53363489],
[-42.7976831 ],
[ 25.37409938],
[-31.10955398],
[ 27.31176864],
[ -3.26386201],
[ -1.81827649],
[-40.7196624 ],
[-50.01324365],
[-17.41177155],
[ 3.5881937 ],
[ 7.08548026],
[ 46.28236902],
[ 14.61228909]]),
'yval': array([[ 4.17020201e+00],
[ 4.06726280e+00],
[ 3.18730676e+01],
[ 1.06236562e+01],
[ 3.18360213e+01],
[ 4.95936972e+00],
[ 4.45159880e+00],
[ 2.22763185e+01],
[-4.38738274e-05],
[ 2.05038016e+01],
[ 3.85834476e+00],
[ 1.93650529e+01],
[ 4.88376281e+00],
[ 1.10971588e+01],
[ 7.46170827e+00],
[ 1.47693464e+00],
[ 2.71916388e+00],
[ 1.09269007e+01],
[ 8.34871235e+00],
[ 5.27819280e+01],
[ 1.33573396e+01]])}
def load_data(path='ex5data1.mat'):
    """Load the exercise-5 data set and return its six arrays flattened to 1-D.

    The .mat file stores every variable as a 2-D column (for example
    ``d['X']`` has shape ``(12, 1)``).  Each array is ravelled to 1-D so it
    can be used directly as a ``pandas.DataFrame`` column.

    Args:
        path: Location of the MATLAB file to read.  Defaults to
            ``'ex5data1.mat'`` in the current working directory.

    Returns:
        A tuple ``(X, y, Xval, yval, Xtest, ytest)`` of 1-D ndarrays.

    Raises:
        KeyError: If one of the six variables is missing from the file.
    """
    d = sio.loadmat(path)
    keys = ('X', 'y', 'Xval', 'yval', 'Xtest', 'ytest')
    # A tuple (rather than a lazy ``map`` object) can be unpacked, indexed
    # and iterated more than once.
    return tuple(np.ravel(d[key]) for key in keys)
# Unpack the six flattened arrays of the data set.
X, y, Xval, yval, Xtest, ytest = load_data()

# Scatter plot of the training set only (fit_reg=False: no regression line).
df = pd.DataFrame({'water_level': X, 'flow': y})
# x/y are passed by keyword because recent seaborn releases no longer accept
# them positionally; `height` is the name seaborn >= 0.9 uses for the
# parameter formerly called `size`.
sns.lmplot(x='water_level', y='flow', data=df, fit_reg=False, height=5)
<seaborn.axisgrid.FacetGrid at 0xd59b2e9400>
# Prepend a column of ones (the intercept term) to each feature vector:
# every array is first reshaped into a single column, then the ones are
# inserted at column index 0.
X, Xval, Xtest = (
    np.insert(
        features.reshape(features.shape[0], 1),
        0,
        np.ones(features.shape[0]),
        axis=1,
    )
    for features in (X, Xval, Xtest)
)
代价函数
def cost(theta, X