# -*- coding: utf-8 -*-
"""
Training Linear Models

Created on Mon Jan 7 19:32:56 2019
@author: Administrator
"""
'''Linear regression using the Normal Equation'''
# =============================================================================
# NumPy provides several functions for generating random numbers; randn()
# and rand() are two of the most common.
# numpy.random.randn(d0, d1, ..., dn) draws one or more samples from the
# standard normal distribution.
# numpy.random.rand(d0, d1, ..., dn) draws samples uniformly from [0, 1).
# =============================================================================
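# A quick illustration of the difference (a minimal sketch; the seed is an
# assumption added here for reproducibility, not part of the original script):
import numpy as np
np.random.seed(0)
print(np.random.randn(3))  # standard normal draws -- values can be negative
print(np.random.rand(3))   # uniform draws -- values always lie in [0, 1)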
# =============================================================================
# The Normal Equation
# To find the value of theta that minimizes the cost function, there is a
# closed-form solution -- in other words, an equation that gives the result
# directly: the Normal Equation.
# Equation 4-4 (Normal Equation): theta_hat = (X^T X)^{-1} X^T y
#   i.e. theta = np.linalg.inv(X.T.dot(X)).dot(X.T).dot(y)
#   - theta_hat is the value of theta that minimizes the cost function
#   - y is the vector of target values, containing y^(1) to y^(m)
# Let's generate some roughly linear data (Figure 4-1) to test this equation.
# Randomly generated linear dataset
# =============================================================================
import numpy as np
import matplotlib.pyplot as plt
import os

X = 2 * np.random.rand(100, 1)   # 100 values of x1, in the interval [0, 2)
y = 4 + 3 * X + np.random.randn(100, 1)
PROJECT_ROOT_DIR = r'E:\wuxian python\handson-ml-master\handson-ml-master\datasets\Training_Linear_Model'

def save_fig(fig_id, tight_layout=True):
    path = os.path.join(PROJECT_ROOT_DIR, fig_id + '.png')
    print('Saving figure', fig_id)
    if tight_layout:
        plt.tight_layout()  # reduce padding around the figure
    plt.savefig(path, format='png', dpi=300)
plt.plot(X, y, 'b.')  # plt.plot(x, y, format_string, **kwargs): 'b' means blue,
                      # 'b.' plots blue dots; drop the '.' to get a line plot
plt.xlabel('$x_1$', fontsize=18)
plt.ylabel('$y$', rotation=0, fontsize=18)
plt.axis([0, 2, 0, 15])
save_fig('generated_data_plot')
plt.show()
# Now let's compute theta_hat using the Normal Equation. We will use the inv()
# function from NumPy's linear algebra module (np.linalg) to compute the
# matrix inverse, and the dot() method for matrix multiplication.
X_b = np.c_[np.ones((100, 1)), X]  # add x0 = 1 to each instance
theta_best = np.linalg.inv(X_b.T.dot(X_b)).dot(X_b.T).dot(y)
theta_best
# We would hope to recover theta0 = 4 and theta1 = 3; because of the noise,
# the parameters can never exactly match those of the original function.
# Now we can use theta_hat to make predictions:
X_new = np.array([[0], [2]])
X_new_b = np.c_[np.ones((2, 1)), X_new]
y_predict = X_new_b.dot(theta_best)
y_predict
plt.plot(X_new, y_predict, 'r-')
plt.plot(X, y, 'b.')
plt.axis([0, 2, 0, 15])
plt.show()

plt.plot(X_new, y_predict, 'r-', linewidth=2, label='Predictions')
plt.plot(X, y, 'b.')
plt.xlabel('$x_1$', fontsize=18)
plt.ylabel('$y$', rotation=0, fontsize=18)
plt.legend(loc='upper left', fontsize=14)
plt.axis([0, 2, 0, 15])
save_fig('linear_model_predictions')
plt.show()
# The following Scikit-Learn code achieves the same result:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
lin_reg.fit(X, y)
# resulting model parameters (intercept and coefficients)
lin_reg.intercept_, lin_reg.coef_
lin_reg.predict(X_new)

# Alternatives: an SVD-based least-squares solve, and the pseudoinverse
theta_best_svd, residuals, rank, s = np.linalg.lstsq(X_b, y, rcond=1e-6)
theta_best_svd
np.linalg.pinv(X_b).dot(y)
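# Sanity check (a minimal sketch, not in the original script): all of these
# solvers should agree with the Normal Equation result up to floating-point
# error.
print(np.allclose(theta_best, theta_best_svd))
print(np.allclose(theta_best, np.linalg.pinv(X_b).dot(y)))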
'''Linear regression using batch gradient descent'''
# Gradient descent: batch gradient descent amounts to taking the partial
# derivatives of the cost function with respect to theta.
# Batch gradient descent uses the whole training set at every training step,
# so it becomes very slow on large datasets.

########## batch gradient descent ##########
eta = 0.1           # learning rate
n_iterations = 1000
m = 100
theta = np.random.randn(2, 1)  # random initialization
for iteration in range(n_iterations):
    gradients = 2/m * X_b.T.dot(X_b.dot(theta) - y)
    theta = theta - eta * gradients
########## batch gradient descent ##########
theta
X_new_b.dot(theta)
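# After 1000 iterations the result should match the Normal Equation solution
# (a small check added here for illustration; the tolerance is an assumption):
print(np.allclose(theta, theta_best, atol=1e-4))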
theta_path_bgd = []

def plot_gradient_descent(theta, eta, theta_path=None):
    m = len(X_b)
    plt.plot(X, y, 'b.')
    n_iterations = 1000
    for iteration in range(n_iterations):
        if iteration < 10:
            y_predict = X_new_b.dot(theta)
            style = 'b-' if iteration > 0 else 'r--'
            plt.plot(X_new, y_predict, style)
        gradients = 2/m * X_b.T.dot(X_b.dot(theta) - y)
        theta = theta - eta * gradients
        if theta_path is not None:
            theta_path.append(theta)
    plt.xlabel('$x_1$', fontsize=18)
    plt.axis([0, 2, 0, 15])
    plt.title(r'$\eta = {}$'.format(eta), fontsize=16)
np.random.seed(42)
theta = np.random.randn(2, 1)

plt.figure(figsize=(10, 4))
plt.subplot(131); plot_gradient_descent(theta, eta=0.02)
plt.ylabel('$y$', rotation=0, fontsize=18)
plt.subplot(132); plot_gradient_descent(theta, eta=0.1, theta_path=theta_path_bgd)
plt.subplot(133); plot_gradient_descent(theta, eta=0.5)
save_fig('gradient_descent_plot')
plt.show()
'''Stochastic Gradient Descent'''
'''
When the cost function is very irregular (see Figure 4-6), stochastic
gradient descent can jump out of local minima, so it has a better chance of
finding the global minimum than batch gradient descent does.
While this randomness is good for escaping local optima, it also means the
algorithm can never settle at the minimum. One solution to this dilemma is to
gradually reduce the learning rate: the steps start out large (which helps
make quick progress and escape local minima), then get smaller and smaller,
letting the algorithm settle at the global minimum.
'''
theta_path_sgd = []
m = len(X_b)
np.random.seed(42)

n_epochs = 50
t0, t1 = 5, 50  # learning-schedule hyperparameters

def learning_schedule(t):
    return t0 / (t + t1)

theta = np.random.randn(2, 1)  # random initialization
for epoch in range(n_epochs):
    for i in range(m):
        if epoch == 0 and i < 20:
            y_predict = X_new_b.dot(theta)
            style = 'b-' if i > 0 else 'r--'
            plt.plot(X_new, y_predict, style)
        random_index = np.random.randint(m)
        xi = X_b[random_index:random_index+1]
        yi = y[random_index:random_index+1]
        gradients = 2 * xi.T.dot(xi.dot(theta) - yi)
        eta = learning_schedule(epoch * m + i)
        theta = theta - eta * gradients
        theta_path_sgd.append(theta)
theta
plt.plot(X, y, 'b.')
plt.xlabel('$x_1$', fontsize=18)
plt.ylabel('$y$', rotation=0, fontsize=18)
plt.axis([0, 2, 0, 15])
save_fig('sgd_plot')
plt.show()
theta
# =============================================================================
# To perform linear regression using SGD with Scikit-Learn, use the
# SGDRegressor class, which by default optimizes the squared-error cost
# function. The following code runs 50 epochs with learning rate eta0 = 0.1,
# using the default learning schedule (different from the one above), and it
# does not use any regularization (penalty=None):
# =============================================================================
from sklearn.linear_model import SGDRegressor

sgd_reg = SGDRegressor(max_iter=50, penalty=None, eta0=0.1, random_state=42)
sgd_reg.fit(X, y.ravel())
# Once again, the result is very close to the Normal Equation solution:
sgd_reg.intercept_, sgd_reg.coef_
'''Mini-batch gradient descent'''
'''
At each step, batch gradient descent uses the whole training set and
stochastic gradient descent uses just one instance; mini-batch gradient
descent instead computes the gradients on a small random set of instances.
'''
theta_path_mgd = []

n_iterations = 50
minibatch_size = 20

np.random.seed(42)
theta = np.random.randn(2, 1)  # random initialization

t0, t1 = 200, 1000
def learning_schedule(t):
    return t0 / (t + t1)

t = 0
for epoch in range(n_iterations):
    shuffled_indices = np.random.permutation(m)
    X_b_shuffled = X_b[shuffled_indices]
    y_shuffled = y[shuffled_indices]
    for i in range(0, m, minibatch_size):
        t += 1
        xi = X_b_shuffled[i:i+minibatch_size]
        yi = y_shuffled[i:i+minibatch_size]
        gradients = 2/minibatch_size * xi.T.dot(xi.dot(theta) - yi)
        eta = learning_schedule(t)
        theta = theta - eta * gradients
        theta_path_mgd.append(theta)
theta
# =============================================================================
# The paths taken by the three gradient descent algorithms in parameter space
# during training: all of them end up near the minimum, but batch GD's path
# actually stops at the minimum, while SGD and mini-batch GD keep walking
# around it.
# =============================================================================
theta_path_bgd = np.array(theta_path_bgd)
theta_path_sgd = np.array(theta_path_sgd)
theta_path_mgd = np.array(theta_path_mgd)

plt.figure(figsize=(7, 4))
plt.plot(theta_path_sgd[:, 0], theta_path_sgd[:, 1], 'r-s', linewidth=1, label='Stochastic')
plt.plot(theta_path_mgd[:, 0], theta_path_mgd[:, 1], 'g-+', linewidth=2, label='Mini-batch')
plt.plot(theta_path_bgd[:, 0], theta_path_bgd[:, 1], 'b-o', linewidth=3, label='Batch')
plt.legend(loc='upper left', fontsize=16)
plt.xlabel(r'$\theta_0$', fontsize=20)
plt.ylabel(r'$\theta_1$', fontsize=20, rotation=0)
plt.axis([2.5, 4.5, 2.3, 3.9])
save_fig('gradient_descent_paths_plot')
plt.show()
'''Polynomial regression'''
# =============================================================================
# What if your data is actually more complex than a simple straight line?
# Surprisingly, you can still use a linear model to fit nonlinear data. A
# simple way to do this is to add powers of each feature as new features, then
# train a linear model on this extended set of features. This technique is
# called Polynomial Regression.
# =============================================================================
import numpy as np
import numpy.random as rnd

np.random.seed(42)
m = 100
X = 6 * np.random.rand(m, 1) - 3
y = 0.5 * X**2 + X + 2 + np.random.randn(m, 1)

plt.plot(X, y, 'b.')
plt.xlabel('$x_1$', fontsize=18)
plt.ylabel('$y$', rotation=0, fontsize=18)
plt.axis([-3, 3, 0, 10])
save_fig('quadratic_data_plot')
plt.show()
from sklearn.preprocessing import PolynomialFeatures
poly_features = PolynomialFeatures(degree=2, include_bias=False)
# =============================================================================
# PolynomialFeatures(degree=d) transforms an array containing n features into
# an array containing (n + d)! / (d! n!) features, where n! is the factorial
# of n, equal to 1 * 2 * 3 * ... * n. Beware of the combinatorial explosion of
# the number of features! (See the count check below.)
# =============================================================================
X_poly = poly_features.fit_transform(X)  # X_poly now contains the original
                                         # feature x plus its square
X[0]
X_poly[0]
X_poly[0][1]
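# Feature-count check (a minimal sketch added for illustration, assuming SciPy
# is available): the number of output features for n input features and degree
# d is C(n + d, d) = (n + d)! / (d! n!), counting the bias term; with
# include_bias=False there is one fewer.
from scipy.special import comb
n, d = 2, 3
pf = PolynomialFeatures(degree=d, include_bias=True)
print(pf.fit_transform(np.random.rand(1, n)).shape[1])  # -> 10
print(int(comb(n + d, d)))                               # -> 10, matching the formula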
lin_reg = LinearRegression()
lin_reg.fit(X_poly, y)
lin_reg.intercept_, lin_reg.coef_  # intercept and coefficients

X_new = np.linspace(-3, 3, 100).reshape(100, 1)
X_new_poly = poly_features.transform(X_new)
y_new = lin_reg.predict(X_new_poly)

plt.plot(X, y, 'b.')
plt.plot(X_new, y_new, 'r-', linewidth=2, label='Predictions')
plt.xlabel('$x_1$', fontsize=18)
plt.ylabel('$y$', rotation=0, fontsize=18)
plt.legend(loc='upper left', fontsize=14)
plt.axis([-3, 3, 0, 10])
save_fig('quadratic_predictions_plot')
plt.show()
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

for style, width, degree in (('g--', 1, 300), ('b--', 2, 3), ('r-+', 2, 1)):
    polybig_features = PolynomialFeatures(degree=degree, include_bias=False)
    std_scaler = StandardScaler()
    lin_reg = LinearRegression()
    polynomial_regression = Pipeline([
            ('poly_features', polybig_features),
            ('std_scaler', std_scaler),
            ('lin_reg', lin_reg),
        ])
    polynomial_regression.fit(X, y)
    y_newbig = polynomial_regression.predict(X_new)
    plt.plot(X_new, y_newbig, style, label=str(degree), linewidth=width)

plt.plot(X, y, 'b.', linewidth=3)
plt.legend(loc='upper left')
plt.xlabel('$x_1$', fontsize=18)
plt.ylabel('$y$', rotation=0, fontsize=18)
plt.axis([-3, 3, 0, 10])
save_fig('high_degree_polynomials_plot')
plt.show()
# =============================================================================
# In Chapter 2 you used cross-validation to estimate a model's generalization
# ability. If a model performs well on the training data but generalizes
# poorly according to the cross-validation metrics, it is overfitting. If it
# performs poorly on both, it is underfitting. This is one way to tell whether
# a model is too complex or too simple.
# Another way is to look at the learning curves: plots of the model's
# performance on the training set and the validation set as a function of the
# training set size. To generate the plots, train the model several times on
# different-sized subsets of the training set. The following code defines a
# function that plots the learning curves of a model given a training set:
# =============================================================================
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def plot_learning_curves(model, X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)
    train_errors, val_errors = [], []
    for m in range(1, len(X_train)):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))
    # print(train_errors, val_errors)
    plt.plot(np.sqrt(train_errors), 'r-+', linewidth=2, label='train')
    plt.plot(np.sqrt(val_errors), 'b-', linewidth=3, label='val')
    plt.legend(loc='upper right', fontsize=14)
    plt.xlabel('Training set size', fontsize=14)
    plt.ylabel('RMSE', fontsize=14)
    # return train_errors, val_errors
lin_reg = LinearRegression()
plot_learning_curves(lin_reg, X, y)
plt.axis([0, 80, 0, 3])
save_fig('underfitting_learning_curves_plot')
plt.show()
# =============================================================================
# Learning curves
# This plot deserves a close look. First, consider the performance on the
# training data: when there are just one or two instances in the training set,
# the model fits them perfectly, which is why the curve starts at zero. But as
# new instances are added, it becomes impossible for the model to fit the
# training data perfectly, both because the data is noisy and because it is
# not linear at all. So the training error goes up until it reaches a plateau,
# after which adding new instances makes the average error neither better nor
# worse. Now consider the performance on the validation data: when the model
# is trained on very few instances, it cannot generalize properly, which is
# why the validation error is initially quite large. As the model sees more
# training examples, it learns, and the validation error slowly goes down.
# However, a straight line cannot model this data well, so the error ends up
# at a plateau, very close to the training curve.
# =============================================================================
# Now let's look at the learning curves of a 10th-degree polynomial model on
# the same data (Figure 4-16):
from sklearn.pipeline import Pipeline

polynomial_regression = Pipeline([
        ('poly_features', PolynomialFeatures(degree=10, include_bias=False)),
        ('lin_reg', LinearRegression()),
    ])
plot_learning_curves(polynomial_regression, X, y)
plt.axis([0, 80, 0, 3])
save_fig('learning_curves_plot')
plt.show()
# The error on the training data is much lower than with the linear model.
# There is a gap between the curves: the model performs significantly better
# on the training data than on the validation data, which is the hallmark of
# an overfitting model. With a much larger training set, however, the two
# curves would end up very close.
# =============================================================================
# Bias: this part of the generalization error is due to wrong assumptions,
# such as assuming the data is linear when it is actually quadratic. A
# high-bias model is most likely to underfit the training data.
# Variance: this part is due to the model's excessive sensitivity to small
# variations in the training data. A model with many degrees of freedom (such
# as a high-degree polynomial) is likely to have high variance and thus to
# overfit.
# Irreducible error: this part is due to the noisiness of the data itself. The
# only way to reduce it is to clean up the data (e.g., fix the data sources,
# repair broken sensors, or detect and remove outliers).
# =============================================================================
"" "Regularized models
线性模型的正则化
"" "
#训练时的损失函数应该在优化过程中易于求导,而在测试过程
#中,评价函数更应该接近最后的客观表现。
'' 'Ridge regression 岭回归' ''
from sklearn.linear_model import Ridge

np.random.seed(42)
m = 20
X = 3 * np.random.rand(m, 1)
y = 1 + 0.5 * X + np.random.randn(m, 1) / 1.5
X_new = np.linspace(0, 3, 100).reshape(100, 1)
def plot_model(model_class, polynomial, alphas, **model_kargs):
    for alpha, style in zip(alphas, ('b-', 'g--', 'r:')):
        model = model_class(alpha, **model_kargs) if alpha > 0 else LinearRegression()
        if polynomial:
            model = Pipeline([
                    ('poly_features', PolynomialFeatures(degree=10, include_bias=False)),
                    ('std_scaler', StandardScaler()),
                    ('regul_reg', model),
                ])
        model.fit(X, y)
        y_new_regul = model.predict(X_new)
        lw = 2 if alpha > 0 else 1
        plt.plot(X_new, y_new_regul, style, linewidth=lw, label=r'$\alpha = {}$'.format(alpha))
    plt.plot(X, y, 'b.', linewidth=3)
    plt.legend(loc='upper left', fontsize=15)
    plt.xlabel('$x_1$', fontsize=18)
    plt.axis([0, 3, 0, 4])
plt.figure(figsize=(8, 4))
plt.subplot(121)
plot_model(Ridge, polynomial=False, alphas=(0, 10, 100), random_state=42)
plt.ylabel('$y$', rotation=0, fontsize=18)
plt.subplot(122)
plot_model(Ridge, polynomial=True, alphas=(0, 10**-5, 1), random_state=42)
save_fig('ridge_regression_plot')
plt.show()
# =============================================================================
# Ridge models trained on the same linear data with different values of alpha.
# On the left, plain ridge models are used, leading to linear predictions. On
# the right, the data is first expanded with PolynomialFeatures(degree=10),
# then scaled with StandardScaler, and finally the ridge models are applied to
# the resulting features: this is polynomial regression with ridge
# regularization. Note how increasing alpha leads to flatter (i.e., less
# extreme, more reasonable) predictions, reducing the model's variance but
# increasing its bias.
# =============================================================================
# Here is how to perform ridge regression with Scikit-Learn using a
# closed-form solution (a variant of Equation 4-9 using the Cholesky matrix
# factorization):
from sklearn.linear_model import Ridge
ridge_reg = Ridge(alpha=1, solver='cholesky', random_state=42)
ridge_reg.fit(X, y)
ridge_reg.predict([[1.5]])
# And using stochastic gradient descent:
sgd_reg = SGDRegressor(max_iter=5, penalty='l2', random_state=42)
sgd_reg.fit(X, y.ravel())
sgd_reg.predict([[1.5]])
# The penalty hyperparameter sets the type of regularization term to use.
# Specifying 'l2' means adding to the cost function a regularization term
# equal to half the square of the l2 norm of the weight vector: this is
# simply ridge regression.
ridge_reg = Ridge(alpha=1, solver='sag', random_state=42)
ridge_reg.fit(X, y)
ridge_reg.predict([[1.5]])
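# A sanity check (a minimal sketch, not in the original script): the ridge
# solution has the closed form theta = (X_b^T X_b + alpha * A)^{-1} X_b^T y,
# where A is the identity matrix except for a 0 in the top-left cell, so the
# bias term is not regularized. It should closely match the fitted model.
Xb = np.c_[np.ones((len(X), 1)), X]
A = np.eye(Xb.shape[1])
A[0, 0] = 0  # don't regularize the bias term
theta_ridge = np.linalg.inv(Xb.T.dot(Xb) + 1 * A).dot(Xb.T).dot(y)
print(theta_ridge.ravel())
print(ridge_reg.intercept_, ridge_reg.coef_)  # should be very close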
# Lasso regression
# =============================================================================
# Least Absolute Shrinkage and Selection Operator Regression (Lasso
# regression) is another regularized version of linear regression: just like
# ridge regression, it adds a regularization term to the cost function, but it
# uses the l1 norm of the weight vector instead of half the square of the l2
# norm (see Equation 4-10).
# =============================================================================
from sklearn.linear_model import Lasso

plt.figure(figsize=(8, 4))
plt.subplot(121)
plot_model(Lasso, polynomial=False, alphas=(0, 0.1, 1), random_state=42)
plt.subplot(122)
plot_model(Lasso, polynomial=True, alphas=(0, 10**-7, 1), tol=1, random_state=42)
save_fig('lasso_regression_plot')
plt.show()
# =============================================================================
# An important characteristic of Lasso regression is that it tends to
# completely eliminate the weights of the least important features (i.e., set
# them to zero). For example, the dashed line in the right plot (alpha = 1e-7)
# looks quadratic, almost linear: all the high-degree polynomial features'
# weights are zero. In other words, Lasso regression automatically performs
# feature selection and outputs a sparse model (i.e., with few nonzero
# weights).
# =============================================================================
from sklearn.linear_model import Lasso
lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(X, y)
lasso_reg.predict([[1.5]])
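# To see the sparsity in action (a minimal sketch added for illustration):
# fit Lasso on 10th-degree polynomial features of the same data and inspect
# the weights -- most of them should be driven exactly to zero.
lasso_poly = Pipeline([
        ('poly_features', PolynomialFeatures(degree=10, include_bias=False)),
        ('std_scaler', StandardScaler()),
        ('lasso', Lasso(alpha=0.1, max_iter=10000)),
    ])
lasso_poly.fit(X, y.ravel())
print(lasso_poly.named_steps['lasso'].coef_)  # mostly zeros: a sparse model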
from sklearn.linear_model import ElasticNet
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42)
elastic_net.fit(X, y)
elastic_net.predict([[1.5]])
####################
# =============================================================================
# A very different way to regularize iterative learning algorithms such as
# gradient descent is to stop training as soon as the validation error reaches
# a minimum. This is called early stopping. Figure 4-20 shows a complex model
# (a high-degree polynomial regression model) being trained with batch
# gradient descent. As training proceeds, the algorithm keeps learning and its
# prediction error (RMSE) on the training set naturally goes down. After a
# while, though, the validation error stops decreasing and starts to go back
# up, indicating that the model has started to overfit the training data. With
# early stopping you simply stop training as soon as the validation error
# reaches its minimum. This simple and efficient regularization technique was
# called a "beautiful free lunch" by Geoffrey Hinton.
# =============================================================================
np.random.seed(42)
m = 100
X = 6 * np.random.rand(m, 1) - 3
y = 2 + X + 0.5 * X**2 + np.random.randn(m, 1)

X_train, X_val, y_train, y_val = train_test_split(X[:50], y[:50].ravel(), test_size=0.5, random_state=10)

poly_scaler = Pipeline([
        ('poly_features', PolynomialFeatures(degree=90, include_bias=False)),
        ('std_scaler', StandardScaler()),
    ])
X_train_poly_scaled = poly_scaler.fit_transform(X_train)
X_val_poly_scaled = poly_scaler.transform(X_val)
sgd_reg = SGDRegressor(max_iter=1,
                       penalty=None,
                       eta0=0.0005,
                       warm_start=True,
                       learning_rate='constant',
                       random_state=42)

n_epochs = 500
train_errors, val_errors = [], []
for epoch in range(n_epochs):
    sgd_reg.fit(X_train_poly_scaled, y_train)  # continues where it left off (warm_start=True)
    y_train_predict = sgd_reg.predict(X_train_poly_scaled)
    y_val_predict = sgd_reg.predict(X_val_poly_scaled)
    train_errors.append(mean_squared_error(y_train, y_train_predict))
    val_errors.append(mean_squared_error(y_val, y_val_predict))
best_epoch = np.argmin(val_errors)
best_val_rmse = np.sqrt(val_errors[best_epoch])

plt.annotate('Best model',
             xy=(best_epoch, best_val_rmse),
             xytext=(best_epoch, best_val_rmse + 1),
             ha='center',
             arrowprops=dict(facecolor='black', shrink=0.05),
             fontsize=16,
             )
best_val_rmse -= 0.03  # shift the dotted line slightly so it stays visible
plt.plot([0, n_epochs], [best_val_rmse, best_val_rmse], 'k:', linewidth=2)
plt.plot(np.sqrt(val_errors), 'b-', linewidth=3, label='Validation set')
plt.plot(np.sqrt(train_errors), 'r--', linewidth=2, label='Training set')
plt.legend(loc='upper right', fontsize=14)
plt.xlabel('Epoch', fontsize=14)
plt.ylabel('RMSE', fontsize=14)
save_fig('early_stopping_plot')
plt.show()
# Here is a basic implementation of early stopping:
from sklearn.base import clone
sgd_reg = SGDRegressor(max_iter=1, warm_start=True, penalty=None,
                       learning_rate='constant', eta0=0.0005, random_state=42)

minimum_val_error = float('inf')
best_epoch = None
best_model = None
for epoch in range(1000):
    sgd_reg.fit(X_train_poly_scaled, y_train)  # continues where it left off
    y_val_predict = sgd_reg.predict(X_val_poly_scaled)
    val_error = mean_squared_error(y_val, y_val_predict)
    if val_error < minimum_val_error:
        minimum_val_error = val_error
        best_epoch = epoch
        best_model = clone(sgd_reg)
best_epoch, best_model
# =============================================================================
# Figure 4-19 shows why this happens: on the top-left plot, the background
# contours (ellipses) represent the unregularized MSE cost function (alpha = 0)
# and the white circles show the batch gradient descent path on that cost
# function. The foreground contours (diamonds) represent the l1 penalty, and
# the yellow triangles show the BGD path for this penalty only (alpha -> inf).
# Notice how the path first reaches theta_1 = 0, then rolls down a gutter
# until it reaches theta_2 = 0. On the top-right plot, the contours represent
# the same cost function plus an l1 penalty with alpha = 0.5; the global
# minimum is on the theta_2 = 0 axis. BGD first reaches theta_2 = 0, then
# rolls down the gutter until it reaches the global minimum. The two bottom
# plots show the same thing but with an l2 penalty instead: the regularized
# minimum is closer to theta = 0 than the unregularized one, but the weights
# never get fully eliminated.
# =============================================================================
# %matplotlib inline  # only needed when running in a Jupyter notebook
import matplotlib.pyplot as plt
import numpy as np

t1a, t1b, t2a, t2b = -1, 3, -1.5, 1.5
t1s = np.linspace(t1a, t1b, 500)
t2s = np.linspace(t2a, t2b, 500)
t1, t2 = np.meshgrid(t1s, t2s)
T = np.c_[t1.ravel(), t2.ravel()]
Xr = np.array([[-1, 1], [-0.3, -1], [1, 0.1]])
yr = 2 * Xr[:, :1] + 0.5 * Xr[:, 1:]

J = (1/len(Xr) * np.sum((T.dot(Xr.T) - yr.T)**2, axis=1)).reshape(t1.shape)

N1 = np.linalg.norm(T, ord=1, axis=1).reshape(t1.shape)
N2 = np.linalg.norm(T, ord=2, axis=1).reshape(t1.shape)

t_min_idx = np.unravel_index(np.argmin(J), J.shape)
t1_min, t2_min = t1[t_min_idx], t2[t_min_idx]
t_init = np.array([[0.25], [-1]])
def bgd_path(theta, X, y, l1, l2, core=1, eta=0.1, n_iterations=50):
    path = [theta]
    for iteration in range(n_iterations):
        gradients = core * 2/len(X) * X.T.dot(X.dot(theta) - y) + l1 * np.sign(theta) + 2 * l2 * theta
        theta = theta - eta * gradients
        path.append(theta)
    return np.array(path)
plt.figure(figsize=(12, 8))
for i, N, l1, l2, title in ((0, N1, 0.5, 0, 'Lasso'), (1, N2, 0, 0.1, 'Ridge')):
    JR = J + l1 * N1 + l2 * N2**2
    tr_min_idx = np.unravel_index(np.argmin(JR), JR.shape)
    t1r_min, t2r_min = t1[tr_min_idx], t2[tr_min_idx]

    levelsJ = (np.exp(np.linspace(0, 1, 20)) - 1) * (np.max(J) - np.min(J)) + np.min(J)
    levelsJR = (np.exp(np.linspace(0, 1, 20)) - 1) * (np.max(JR) - np.min(JR)) + np.min(JR)
    levelsN = np.linspace(0, np.max(N), 10)

    path_J = bgd_path(t_init, Xr, yr, l1=0, l2=0)
    path_JR = bgd_path(t_init, Xr, yr, l1, l2)
    path_N = bgd_path(t_init, Xr, yr, np.sign(l1)/3, np.sign(l2), core=0)

    plt.subplot(221 + i * 2)
    plt.grid(True)
    plt.axhline(y=0, color='k')
    plt.axvline(x=0, color='k')
    plt.contourf(t1, t2, J, levels=levelsJ, alpha=0.9)
    plt.contour(t1, t2, N, levels=levelsN)
    plt.plot(path_J[:, 0], path_J[:, 1], 'w-o')
    plt.plot(path_N[:, 0], path_N[:, 1], 'y-^')
    plt.plot(t1_min, t2_min, 'rs')
    plt.title(r'$\ell_{}$ penalty'.format(i + 1), fontsize=16)
    plt.axis([t1a, t1b, t2a, t2b])
    if i == 1:
        plt.xlabel(r'$\theta_1$', fontsize=20)
    plt.ylabel(r'$\theta_2$', fontsize=20, rotation=0)

    plt.subplot(222 + i * 2)
    plt.grid(True)
    plt.axhline(y=0, color='k')
    plt.axvline(x=0, color='k')
    plt.contourf(t1, t2, JR, levels=levelsJR, alpha=0.9)
    plt.plot(path_JR[:, 0], path_JR[:, 1], 'w-o')
    plt.plot(t1r_min, t2r_min, 'rs')
    plt.title(title, fontsize=16)
    plt.axis([t1a, t1b, t2a, t2b])
    if i == 1:
        plt.xlabel(r'$\theta_1$', fontsize=20)

save_fig('lasso_vs_ridge_plot')
plt.show()
'''Logistic Regression'''
# =============================================================================
# This cost function makes sense because -log(p_hat) grows very large as p_hat
# approaches 0, so the cost will be large if the model estimates a probability
# close to 0 for a positive instance, and likewise very large if it estimates
# a probability close to 1 for a negative instance. On the other hand,
# -log(p_hat) is close to 0 when p_hat is close to 1, so the cost will be
# close to 0 if the estimated probability is close to 1 for a positive
# instance or close to 0 for a negative instance, which is just what we want.
# =============================================================================
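# A quick numeric illustration (a minimal sketch added here; the probability
# values are arbitrary): the per-instance cost is -log(p_hat) for a positive
# instance and -log(1 - p_hat) for a negative one.
for p_hat in (0.01, 0.5, 0.99):
    print(p_hat, -np.log(p_hat), -np.log(1 - p_hat))
# -> a positive instance with p_hat = 0.01 costs ~4.6; with p_hat = 0.99, ~0.01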
# =============================================================================
# The bad news is that there is no known closed-form equation to compute the
# value of theta that minimizes this cost function (no equivalent of the
# Normal Equation). The good news is that the cost function is convex, so
# gradient descent (or any other optimization algorithm) is guaranteed to find
# the global minimum (if the learning rate is not too large and you wait long
# enough). Equation 4-18 gives the partial derivative of the cost function
# with respect to the j-th model parameter theta_j:
#   dJ/d(theta_j) = (1/m) * sum_i (sigma(theta^T x^(i)) - y^(i)) * x_j^(i)
# =============================================================================
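# A minimal sketch of batch gradient descent using Equation 4-18 (the toy data
# and names below, X_demo and theta_lr, are made up for illustration; they are
# not part of the original script):
np.random.seed(42)
X_demo = np.random.randn(100, 2)
y_demo = (X_demo[:, 0] + X_demo[:, 1] > 0).astype(float).reshape(-1, 1)
X_demo_b = np.c_[np.ones((100, 1)), X_demo]            # add the bias feature x0 = 1
theta_lr = np.zeros((3, 1))
for _ in range(1000):
    p_hat = 1 / (1 + np.exp(-X_demo_b.dot(theta_lr)))  # sigma(theta^T x)
    grad = X_demo_b.T.dot(p_hat - y_demo) / len(X_demo_b)  # Equation 4-18
    theta_lr -= 0.5 * grad                              # eta = 0.5
print(theta_lr.ravel())  # decision boundary roughly x1 + x2 = 0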
t = np.linspace(-10, 10, 100)
sig = 1 / (1 + np.exp(-t))
plt.figure(figsize=(9, 3))
plt.plot([-10, 10], [0, 0], 'k-')
plt.plot([-10, 10], [0.5, 0.5], 'k:')
plt.plot([-10, 10], [1, 1], 'k:')
plt.plot([0, 0], [-1.1, 1.1], 'k-')
plt.plot(t, sig, 'b-', linewidth=2, label=r'$\sigma(t)=\frac{1}{1+e^{-t}}$')
plt.xlabel('t')
plt.legend(loc='upper left', fontsize=20)
plt.axis([-10, 10, -0.1, 1.1])
save_fig('logistic_function_plot')
plt.show()
from sklearn import datasets
iris = datasets.load_iris()
list(iris.keys())
print(iris.DESCR)

X = iris['data'][:, 3:]  # petal width
y = (iris['target'] == 2).astype(int)  # 1 if Iris-Virginica, else 0

from sklearn.linear_model import LogisticRegression
log_reg = LogisticRegression(random_state=42)
log_reg.fit(X, y)
X_new = np.linspace(0, 3, 1000).reshape(-1, 1)
y_proba = log_reg.predict_proba(X_new)
plt.plot(X_new, y_proba[:, 1], 'g-', linewidth=2, label='Iris-Virginica')
plt.plot(X_new, y_proba[:, 0], 'b--', linewidth=2, label='Not Iris-Virginica')

X_new = np.linspace(0, 3, 1000).reshape(-1, 1)
y_proba = log_reg.predict_proba(X_new)
decision_boundary = X_new[y_proba[:, 1] >= 0.5][0]
plt.figure(figsize=(8, 3))
plt.plot(X[y == 0], y[y == 0], 'bs')
plt.plot(X[y == 1], y[y == 1], 'g^')
plt.plot([decision_boundary, decision_boundary], [-1, 2], 'k:', linewidth=2)
plt.plot(X_new, y_proba[:, 1], 'g-', linewidth=2, label='Iris-Virginica')
plt.plot(X_new, y_proba[:, 0], 'b--', linewidth=2, label='Not Iris-Virginica')
plt.text(decision_boundary + 0.02, 0.15, 'Decision boundary', fontsize=14, color='k', ha='center')
plt.arrow(decision_boundary, 0.08, -0.3, 0, head_width=0.05, head_length=0.1, fc='b', ec='b')
plt.arrow(decision_boundary, 0.92, 0.3, 0, head_width=0.05, head_length=0.1, fc='g', ec='g')
plt.xlabel('Petal width (cm)', fontsize=14)
plt.ylabel('Probability', fontsize=14)
plt.axis([0, 3, -0.02, 1.02])
save_fig('logistic_regression_plot')
plt.show()
decision_boundary
log_reg.predict([[1.7], [1.5]])
from sklearn.linear_model import LogisticRegression

X = iris['data'][:, (2, 3)]  # petal length, petal width
y = (iris['target'] == 2).astype(int)

log_reg = LogisticRegression(C=10**10, random_state=42)
log_reg.fit(X, y)

x0, x1 = np.meshgrid(
        np.linspace(2.9, 7, 500).reshape(-1, 1),
        np.linspace(0.8, 2.7, 200).reshape(-1, 1),
    )
X_new = np.c_[x0.ravel(), x1.ravel()]
y_proba = log_reg.predict_proba(X_new)

plt.figure(figsize=(10, 4))
plt.plot(X[y == 0, 0], X[y == 0, 1], 'bs')
plt.plot(X[y == 1, 0], X[y == 1, 1], 'g^')

zz = y_proba[:, 1].reshape(x0.shape)
contour = plt.contour(x0, x1, zz, cmap=plt.cm.brg)

left_right = np.array([2.9, 7])
boundary = -(log_reg.coef_[0][0] * left_right + log_reg.intercept_[0]) / log_reg.coef_[0][1]

plt.clabel(contour, inline=1, fontsize=12)
plt.plot(left_right, boundary, 'k--', linewidth=3)
plt.text(3.5, 1.5, 'Not Iris-Virginica', fontsize=14, color='b', ha='center')
plt.text(6.5, 2.3, 'Iris-Virginica', fontsize=14, color='g', ha='center')
plt.xlabel('Petal length', fontsize=14)
plt.ylabel('Petal width', fontsize=14)
plt.axis([2.9, 7, 0.8, 2.7])
save_fig('logistic_regression_contour_plot')
plt.show()
X = iris['data'][:, (2, 3)]  # petal length, petal width
y = iris['target']

softmax_reg = LogisticRegression(multi_class='multinomial', solver='lbfgs', C=10, random_state=42)
softmax_reg.fit(X, y)
##########
x0, x1 = np.meshgrid(
        np.linspace(0, 8, 500).reshape(-1, 1),
        np.linspace(0, 3.5, 200).reshape(-1, 1),
    )
X_new = np.c_[x0.ravel(), x1.ravel()]

y_proba = softmax_reg.predict_proba(X_new)
y_predict = softmax_reg.predict(X_new)

zz1 = y_proba[:, 1].reshape(x0.shape)
zz = y_predict.reshape(x0.shape)

plt.figure(figsize=(10, 4))
plt.plot(X[y == 2, 0], X[y == 2, 1], 'g^', label='Iris-Virginica')
plt.plot(X[y == 1, 0], X[y == 1, 1], 'bs', label='Iris-Versicolor')
plt.plot(X[y == 0, 0], X[y == 0, 1], 'yo', label='Iris-Setosa')

from matplotlib.colors import ListedColormap
custom_cmap = ListedColormap(['#fafab0', '#9898ff', '#a0faa0'])

plt.contourf(x0, x1, zz, cmap=custom_cmap)
contour = plt.contour(x0, x1, zz1, cmap=plt.cm.brg)
plt.clabel(contour, inline=1, fontsize=12)
plt.xlabel('Petal length', fontsize=14)
plt.ylabel('Petal width', fontsize=14)
plt.legend(loc='center left', fontsize=14)
plt.axis([0, 7, 0, 3.5])
save_fig('softmax_regression_contour_plot')
plt.show()