Regularized Linear Regression and Bias v.s. Variance
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.io as io
import scipy.optimize as opt
data = io.loadmat('D:/python/practise/sample/machine-learning-ex5/ex5/ex5data1.mat')
1 Regularized Linear Regression
1.1 visualizing the dataset
X, y, Xval, yval, Xtest, ytest = data['X'], data['y'], data['Xval'], data['yval'], data['Xtest'], data['ytest']
# 把y改成数组
y = y.reshape(-1)
yval = yval.reshape(-1)
ytest = ytest.reshape(-1)
fig, ax = plt.subplots(1, 3, figsize=(16, 5))
ax[0].scatter(x = X, y = y, marker = 'x', color = 'r')
ax[1].scatter(x = Xval, y = yval, marker = 'x', color = 'b')
ax[2].scatter(x = Xtest, y = ytest, marker = 'x', color = 'g')
ax[0].set_title('train')
ax[1].set_title('validation')
ax[2].set_title('test')
1.2 regularized linear regression cost function
J ( θ ) = 1 2 m ( ∑ i = 1 m ( h θ ( x ( i ) ) − y ( i ) ) 2 ) + λ 2 m ( ∑ j = 1 n θ j 2 ) J(\theta)=\frac{1}{2m}\left(\sum^m_{i=1}(h_{\theta}(x^{(i)})-y^{(i)})^2\right)+\frac{\lambda}{2m}\left(\sum^n_{j=1}\theta^2_j\right) J(θ)=2m1(i=1∑m(hθ(x(i))−y(i))2)+2mλ(j=1∑nθj2)
T i p s : h θ ( x ) = θ T X Tips:h_{\theta}(x)=\theta^TX Tips:hθ(x)=θTX
# 增加x0项
X_1 = np.insert(X, 0, 1, axis = 1)
Xval_1 = np.insert(Xval, 0, 1, axis = 1)
Xtest_1 = np.insert(Xtest, 0, 1, axis = 1)
def J_func(theta, x, y):
cost = np.square(x.dot(theta.T) - y).mean() / 2
return cost
def J_func_reg(theta, x, y, c=1):
theta_ = theta[1: ]
reg = c * theta_.dot(theta_.T) / (2 * len(x))
return J_func(theta, x, y) + reg
按文档theta取1,计算代价函数值
theta = np.ones(2) # 按照文档theta取1
J_func_reg(theta, X_1, y, c=1)
303.9931922202643
1.3 regularized linear regression gradient
∂
J
(
θ
)
∂
θ
0
=
1
m
∑
i
=
1
m
(
h
θ
(
x
(
i
)
)
−
y
(
i
)
)
x
j
(
i
)
−
−
−
−
−
−
−
−
f
o
r
:
j
=
0
\frac{\partial J(\theta)}{\partial \theta_0}=\frac{1}{m}\sum^m_{i=1}(h_{\theta}(x^{(i)})-y^{(i)})x^{(i)}_j--------for : j=0
∂θ0∂J(θ)=m1i=1∑m(hθ(x(i))−y(i))xj(i)−−−−−−−−for:j=0
∂
J
(
θ
)
∂
θ
j
=
(
1
m
∑
i
=
1
m
(
h
θ
(
x
(
i
)
)
−
y
(
i
)
)
x
j
(
i
)
)
+
λ
m
θ
j
−
−
−
−
f
o
r
:
j
≥
1
\frac{\partial J(\theta)}{\partial \theta_j}=\left(\frac{1}{m}\sum^m_{i=1}(h_{\theta}(x^{(i)})-y^{(i)})x^{(i)}_j \right)+\frac{\lambda}{m}\theta_j----for:j\geq1
∂θj∂J(θ)=(m1i=1∑m(hθ(x(i))−y(i))xj(i))+mλθj−−−−for:j≥1
def gradient_reg(theta, x, y, c=1):
theta_ = theta.copy()
theta_[0] = 0
gradient = ((x.dot(theta.T) - y).T.dot(x)) / len(x)
reg = (c / len(x)) * theta_
return gradient + reg
gradient_reg(theta, X_1, y, c=1)
array([-15.30301567, 598.25074417])
1.4 fitting linear regression
def optimize_tnc(x, y, c):
theta = np.zeros(x.shape[1])
result = opt.fmin_tnc(J_func_reg, x0 = theta, fprime = gradient_reg, args = (x, y, c))
return result
res = optimize_tnc(X_1, y, c=0)
res
(array([13.08790351, 0.36777923]), 9, 0)
plot the best fit line
plt.figure(figsize = (7, 7))
plt.plot(X_1[:, 1].reshape(-1), X_1.dot(res[0]))
plt.scatter(X_1[:, 1], y, marker = 'x', color = 'r')
plt.grid(True)
2 Bias-Variance
2.1 learning curves
Implement code to generate the learning curves.
For a training set size of i, you should use the first i examples and y.
After learning the
θ
\theta
θ parameters, you should compute the error on the training and cross validation sets. Recall that the training and CV crror for a dataset is defined as:
J
t
r
a
i
n
(
θ
)
=
1
2
m
[
∑
i
=
1
m
(
h
θ
(
x
(
i
)
)
−
y
(
i
)
)
2
]
J_{train}(\theta)=\frac{1}{2m}\left[\sum^{m}_{i=1}(h_{\theta}(x^{(i)})-y^{(i)})^2\right]
Jtrain(θ)=2m1[i=1∑m(hθ(x(i))−y(i))2]
J
c
v
(
θ
)
=
1
2
m
c
v
[
∑
i
=
1
m
c
v
(
h
θ
(
x
c
v
(
i
)
)
−
y
c
v
(
i
)
)
2
]
J_{cv}(\theta)=\frac{1}{2m_{cv}}\left[\sum^{m_{cv}}_{i=1}(h_{\theta}(x_{cv}^{(i)})-y_{cv}^{(i)})^2\right]
Jcv(θ)=2mcv1[i=1∑mcv(hθ(xcv(i))−ycv(i))2]
When you are computing the training set error, make sure you compute it on the training subset(i.e.,X(0:n, 😃 and y(0:n))(instead of the entire training set).
def learing_curve_points(Xtrain, ytrain, Xval, yval, c):
jtrain = []
jcv = []
for i in range(len(Xtrain)):
res = optimize_tnc(Xtrain[:(i+1), :], ytrain[:(i+1)], c)
jtrain.append(J_func(res[0], Xtrain[:(i+1), :], ytrain[:(i+1)]))
jcv.append(J_func(res[0], Xval, yval))
return np.array(jtrain), np.array(jcv)
jtrain_learn, jcv_learn = learing_curve_points(X_1, y, Xval_1, yval, c = 0)
plt.figure(figsize = (9, 6))
plt.plot(np.arange(1, 13), jtrain_learn, label='J_train')
plt.plot(np.arange(1, 13), jcv_learn, label='J_cv')
plt.xlim([0,12])
plt.ylim([0,225])
plt.xticks(np.arange(13))
plt.yticks(np.arange(225, step=25))
plt.grid(True)
plt.legend(loc='best')
plt.ylabel('error')
plt.xlabel('number of training example')
plt.title('Learning curve for linear regression', fontsize=18)
3 Polynomial regression
# 这次用np实现一元多项式生成
def generate_polynomial(x, power):
x_ = x.copy()
for i in range(2, power+1):
x_ = np.c_[x_, x_[:, 1]**i]
return x_
3.1 learning polynomial regression
将多项式化后的数据集进行标准化(见课时30:feature scaling)
– feature scaling in order to get gradient descent to run quite a lot faster(don’t apply to x0)
def normalize_scaling(x):
x_ = x.copy()
x_ = x_[:, 1:]
normal = (x_ - x_.mean(axis=0)) / x_.std(axis=0, ddof=1)
# ddof:Means Delta Degrees of Freedom. The divisor used in calculations is N - ddof, where N represents the number of elements. By default ddof is zero.
# ddof = 0时,是总体标准差的估计 ;ddof = 1时,是样本标准差
return np.insert(normal, 0, 1, axis=1), x_.mean(axis=0), x_.std(axis=0, ddof=1)
def like_normalize(x, mean, std):
x_ = x.copy()
x_ = x_[:, 1:]
thing = (x_ - mean) / std
return np.insert(thing, 0, 1, axis = 1)
#将训练数据做多项式化处理和标准化处理
X_1_poly = generate_polynomial(X_1, 6)
X_1_poly_norm, train_mean, train_std = normalize_scaling(X_1_poly)
AndrewNg文档中用8次幂,计算后无法和文档图形一致(查了几个人的作业发现都有这个问题,提到matlab和python优化算法的区别),因此采用6次幂,能够吻合
注意:画曲线时,轴延长线上的点计算
h
θ
(
x
)
h_\theta(x)
hθ(x)时,也要进行标准化
(
X
∗
=
X
−
μ
σ
)
(X^*=\frac{X-\mu}{\sigma})
(X∗=σX−μ),并且
μ
\mu
μ和
σ
\sigma
σ要全部用训练集的对应次幂
def poly_lambda(c):
res2 = optimize_tnc(X_1_poly_norm, y, c)
print('theta=', res2[0])
xs = np.linspace(-75, 55, 60).reshape((-1, 1))
xs_1 = np.insert(xs, 0, 1, axis = 1)
# 采用6次幂
xs_1_poly = generate_polynomial(xs_1, 6)
#用训练集的均值和标准差
thing = (xs_1_poly[:, 1:] - train_mean) / train_std
xs_1_poly_norm = np.insert(thing, 0, 1, axis = 1)
plt.figure(figsize = (7, 5))
plt.plot(xs.reshape(-1), xs_1_poly_norm.dot(res2[0]), 'b--')
plt.scatter(X_1[:, 1], y, marker = 'x', color = 'r')
plt.grid(True)
plt.title(f'Polynomial Regression(lambda={c})')
poly_lambda(c=0)
theta= [ 11.21758931 11.37072 13.43377357 10.74317307 -4.38682157
-11.9225753 -5.12273063]
Xval_1_poly_likenorm = like_normalize(generate_polynomial(Xval_1, 6), train_mean, train_std)
Xval_1_poly_likenorm.shape
(21, 7)
jtrain_learn, jcv_learn = learing_curve_points(X_1_poly_norm, y, Xval_1_poly_likenorm, yval, c = 0)
def plot_learnCurve_lambda(c):
jtrain_learn, jcv_learn = learing_curve_points(X_1_poly_norm, y, Xval_1_poly_likenorm, yval, c)
plt.figure(figsize = (9, 6))
plt.plot(np.arange(1, 13), jtrain_learn, label='J_train')
plt.plot(np.arange(1, 13), jcv_learn, label='J_cv')
plt.xlim([0,12])
plt.xticks(np.arange(13))
plt.grid(True)
plt.legend(loc='best')
plt.ylabel('error')
plt.xlabel('number of training example')
plt.title(f'Polynomial Regression Learning Curve(lambda={c})', fontsize=15)
plot_learnCurve_lambda(c=0)
3.2 adjusting the regularization parameter
try: λ = 1 , λ = 100 \lambda=1,\lambda=100 λ=1,λ=100
poly_lambda(1)
theta= [11.21758834 8.60998012 5.33437646 3.84079294 2.24113677 2.10180568
0.88936458]
poly_lambda(100)
theta= [11.21758934 0.98876222 0.30025153 0.77917852 0.11521524 0.59036753
-0.01185144]
plot_learnCurve_lambda(c=1)
plot_learnCurve_lambda(c=100)
3.3 selecting λ \lambda λ using a cross validation set
lambda_vec = [0, 0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10]
def single_lambda_fit(c):
res = optimize_tnc(X_1_poly_norm, y, c)
J_train = J_func(res[0], X_1_poly_norm, y)
J_cv = J_func(res[0], Xval_1_poly_likenorm, yval)
return J_train, J_cv
def more_lambda_fit(lambda_vec):
zipped = []
for c in lambda_vec:
element = single_lambda_fit(c)
zipped.append(element)
J_train, J_cv = zip(* zipped)
J_train = np.array(J_train)
J_cv = np.array(J_cv)
return J_train, J_cv
def lambda_curve(lambda_vec):
J_train, J_cv = more_lambda_fit(lambda_vec)
plt.figure(figsize = (9,6))
plt.plot(lambda_vec, J_train, color = 'b', label = 'J_train')
plt.plot(lambda_vec, J_cv, color = 'g', label = 'J_cv')
plt.xticks(np.arange(11))
plt.xlabel('lambda')
plt.ylabel('error')
plt.grid(True)
plt.title('Lambda Curve', fontsize = 14)
plt.legend()
lambda_curve(lambda_vec)
def gen_2times_lambdaVec(start):
vec = [0, start]
for i in range(1,11):
vec.append(vec[-1]*2)
return vec
# 按照课程中的要求,使λ成倍增长至10
times_lambdaVec = gen_2times_lambdaVec(0.01)
times_lambdaVec
[0, 0.01, 0.02, 0.04, 0.08, 0.16, 0.32, 0.64, 1.28, 2.56, 5.12, 10.24]
lambda_curve(times_lambdaVec)