1. 导入波士顿房价数据集
import numpy as np
import pandas as pd

# Load the Boston housing dataset from the local data directory and
# preview the first rows.
data = pd.read_csv("data/boston.csv")
data.head()
CRIM ZN INDUS CHAS NOX RM AGE DIS RAD TAX PTRATIO B LSTAT MEDV 0 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98 24.0 1 0.02731 0.0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14 21.6 2 0.02729 0.0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03 34.7 3 0.03237 0.0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94 33.4 4 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33 36.2
2. 实现线性回归:梯度下降
class LinearRegression:
    """Linear regression fitted with batch gradient descent.

    Attributes created by fit():
        w_    : weight vector; w_[0] is the intercept, w_[1:] the
                per-feature weights.
        loss_ : loss value (half the sum of squared errors) recorded
                at the start of every iteration.
    """

    def __init__(self, alpha, times):
        """alpha: float, learning rate controlling the update step size.
        times: int, number of gradient-descent iterations."""
        self.alpha = alpha
        self.times = times

    def fit(self, X, y):
        """Learn weights from training samples X (2-D) and targets y (1-D)."""
        X = np.asarray(X)
        y = np.asarray(y)
        # One weight per feature plus the bias term at index 0.
        self.w_ = np.zeros(1 + X.shape[1])
        self.loss_ = []
        for _ in range(self.times):
            prediction = np.dot(X, self.w_[1:]) + self.w_[0]
            residual = y - prediction
            self.loss_.append(np.sum(residual ** 2) / 2)
            # residual = y - y_hat already carries the gradient's sign,
            # so the update adds alpha times the (negative) gradient.
            self.w_[0] += self.alpha * np.sum(residual)
            self.w_[1:] += self.alpha * np.dot(X.T, residual)

    def predict(self, X):
        """Return the model's predictions for samples X."""
        X = np.asarray(X)
        return np.dot(X, self.w_[1:]) + self.w_[0]
3.数据切分
# Shuffle every row reproducibly, then split: first 400 rows train,
# remaining rows test; last column (MEDV) is the target.
t = data.sample(len(data), random_state=666)
train, test = t.iloc[:400], t.iloc[400:]
X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
X_test, y_test = test.iloc[:, :-1], test.iloc[:, -1]
4. 线性回归预测(没标准化)
# Train on the raw (unstandardized) features. With features on wildly
# different scales this learning rate makes the updates diverge, which
# the displayed weights and exploding losses demonstrate.
my_reg = LinearRegression(alpha=0.0005, times=20)
my_reg.fit(X_train, y_train)
result = my_reg.predict(X_test)
mse = np.mean((result - y_test) ** 2)
for item in (my_reg.w_, my_reg.loss_, mse):
    display(item)
array([-6.59441004e+91, -1.40968186e+92, -6.83088551e+92, -7.91819869e+92,
-4.03805906e+90, -3.74461053e+91, -4.12255808e+92, -4.69487511e+93,
-2.37774656e+92, -7.33988713e+92, -2.93151961e+94, -1.23169166e+93,
-2.34499035e+94, -8.68295194e+92])
[118525.505,
363708382156678.0,
1.4463556286428806e+24,
5.755546457407455e+33,
2.290338437939112e+43,
9.11407858329823e+52,
3.6268189476416507e+62,
1.443241415877832e+72,
5.743175533643604e+81,
2.2854156516968003e+91,
9.094489051263666e+100,
3.619023569832781e+110,
1.4401393552929022e+120,
5.730831321331474e+129,
2.2805034466177901e+139,
9.074941624398248e+148,
3.6112449471793515e+158,
1.4370439621856128e+168,
5.718513641305657e+177,
2.2756017996876753e+187]
4.361920130943844e+194
5. 进行标准化处理
class StandardScaler:
    """Standardize data column-wise to zero mean and unit variance.

    fit() learns the per-column mean and standard deviation;
    transform() applies z = (x - mean) / std with those statistics.
    (The original docstring had this backwards — standardization yields
    mean 0 and std 1, not "mean 1, variance 0".)
    """

    def fit(self, X):
        """Compute each feature column's mean and standard deviation.

        Uses the population standard deviation (np.std default, ddof=0).
        """
        X = np.asarray(X)
        self.std_ = np.std(X, axis=0)
        self.mean_ = np.mean(X, axis=0)

    def transform(self, X):
        """Standardize X column-wise: subtract the fitted mean, divide by
        the fitted std, so each column has mean 0 and std 1.

        NOTE(review): a constant column (std 0) would divide by zero here.
        """
        return (X - self.mean_) / self.std_

    def fit_transform(self, X):
        """Fit the statistics on X, then return the standardized X."""
        self.fit(X)
        return self.transform(X)
6.线性回归预测:标准化处理
# Re-create the same split (same seed), then standardize features and
# target using statistics learned from the training portion only, so no
# test information leaks into the scaling.
t = data.sample(len(data), random_state=666)
X_train, y_train = t.iloc[:400, :-1], t.iloc[:400, -1]
X_test, y_test = t.iloc[400:, :-1], t.iloc[400:, -1]

s = StandardScaler()
X_train = s.fit_transform(X_train)
X_test = s.transform(X_test)

s2 = StandardScaler()
y_train = s2.fit_transform(y_train)
y_test = s2.transform(y_test)

reg = LinearRegression(alpha=0.0005, times=20)
reg.fit(X_train, y_train)
result1 = reg.predict(X_test)

display(np.mean((result1 - y_test) ** 2))
display(reg.w_)
display(reg.loss_)
0.21492617454147625
array([ 5.64659430e-16, 1.95047851e-02, 4.17596271e-02, -6.72431209e-02,
1.08055415e-01, -1.00875900e-01, 3.19801377e-01, -2.18037208e-02,
-2.17426807e-01, 8.04056926e-02, -6.75273910e-02, -1.82418768e-01,
9.39966020e-02, -4.13050262e-01])
[200.0,
109.21432152379548,
87.0452598536069,
76.53800264630138,
70.55588355152493,
66.96828493244492,
64.73083847262916,
63.272165227255414,
62.27233491795431,
61.549981193179754,
61.00090033178433,
60.56421933693936,
60.203637354956086,
59.89695732596468,
59.63019822805497,
59.39425170905927,
59.18296126355835,
58.99200309779758,
58.81822318519731,
58.65923694602755]
7. 进行可视化展示
import matplotlib as mpl
import matplotlib.pyplot as plt

# Use a CJK-capable font and keep the minus sign rendering correctly
# when that font is active.
mpl.rcParams["font.family"] = "SimHei"
mpl.rcParams["axes.unicode_minus"] = False

# Predicted vs. true (standardized) prices for each test sample.
plt.figure(figsize=(10, 10))
plt.plot(result1, "ro-", label="预测值")
plt.plot(y_test.values, "go--", label="真实值")
plt.title("线性回归—梯度下降")
plt.xlabel("样本序号")
plt.ylabel("房价")
plt.legend()
<matplotlib.legend.Legend at 0x219b25f4648>
# Training loss per iteration (x axis is 1-based iteration number).
plt.plot(range(1, reg.times + 1), reg.loss_, "ro-")
[<matplotlib.lines.Line2D at 0x219b2792188>]
# Repeat the experiment with a single feature — column 5 (RM, average
# rooms per dwelling) — again standardizing both feature and target.
lr = LinearRegression(alpha=0.0005, times=50)
t = data.sample(len(data), random_state=666)
X_train, y_train = t.iloc[:400, 5:6], t.iloc[:400, -1]
X_test, y_test = t.iloc[400:, 5:6], t.iloc[400:, -1]

s1 = StandardScaler()
X_train = s1.fit_transform(X_train)
X_test = s1.transform(X_test)

s2 = StandardScaler()
y_train = s2.fit_transform(y_train)
y_test = s2.transform(y_test)

lr.fit(X_train, y_train)
result2 = lr.predict(X_test)
display(np.mean((result2 - y_test) ** 2))
0.3576747881088171
# Scatter of the standardized RM feature against the standardized target,
# overlaid with the fitted regression line.
plt.scatter(X_train["RM"], y_train)
display(lr.w_)
# Draw the line by evaluating the fitted model directly; the original
# also hard-coded lr.w_'s values into an unused variable `y`, which is
# removed here (stale duplicated constants).
x = np.arange(-5, 5, 0.1)
plt.plot(x, lr.predict(x.reshape(-1, 1)), "r")
array([3.31734640e-16, 6.62422223e-01])
[<matplotlib.lines.Line2D at 0x219af61f0c8>]