import numpy as np
import pandas as pd
import matplotlib. pyplot as plt
import statsmodels. api as sm
import statsmodels. stats. outliers_influence
from prettytable import PrettyTable
from statsmodels. stats. outliers_influence import variance_inflation_factor
from sklearn. decomposition import PCA
10.2 PRINCIPAL COMPONENTS REGRESSION
Tables 10.1–10.2
# Tables 10.1-10.2: load the IMPORT data (1949-1959), standardize the
# variables, and fit OLS on the standardized predictors.
# NOTE(review): hard-coded absolute path -- adjust for your machine.
data = pd.read_csv("C:/Users/可乐怪/Desktop/csv/P229.csv")
n = len(data)
# Column means over the 11 observations.
meanVal = np.mean(data.iloc[0:11, :], axis=0)
CenteringMat = data.iloc[0:11, :] - meanVal
# ddof=1 gives the sample standard deviation directly, replacing the manual
# StdVal * sqrt(n) / sqrt(n-1) correction of the population std
# (equivalent since the slice has exactly n = 11 rows).
StdVal = np.std(data.iloc[0:11, :], axis=0, ddof=1)
s_data = CenteringMat / StdVal
s_data['constant'] = 1  # explicit intercept column for statsmodels OLS
model2 = sm.OLS(s_data['import'],
                s_data[['constant', 'doprod', 'stock', 'consum']]).fit()
print(model2.summary())
OLS Regression Results
==============================================================================
Dep. Variable: import R-squared: 0.992
Model: OLS Adj. R-squared: 0.988
Method: Least Squares F-statistic: 285.6
Date: Thu, 25 Nov 2021 Prob (F-statistic): 1.11e-07
Time: 14:50:53 Log-Likelihood: 11.191
No. Observations: 11 AIC: -14.38
Df Residuals: 7 BIC: -12.79
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
constant 1.388e-16 0.033 4.2e-15 1.000 -0.078 0.078
doprod -0.3393 0.464 -0.731 0.488 -1.437 0.758
stock 0.2130 0.034 6.203 0.000 0.132 0.294
consum 1.3027 0.464 2.807 0.026 0.205 2.400
==============================================================================
Omnibus: 0.241 Durbin-Watson: 2.740
Prob(Omnibus): 0.887 Jarque-Bera (JB): 0.361
Skew: 0.259 Prob(JB): 0.835
Kurtosis: 2.279 Cond. No. 27.3
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
D:\Anaconda\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=11
warnings.warn("kurtosistest only valid for n>=20 ... continuing "
# Table 10.2 (continued): regress standardized IMPORT on the first three
# principal components. The leading minus flips the component signs to
# match the book's orientation.
c_data3 = -PCA(3).fit_transform(s_data.iloc[:, 2:5])
data3 = pd.DataFrame(c_data3, columns=["c1", "c2", "c3"])
model3 = sm.OLS(s_data['import'], data3).fit()
print(model3.summary())
OLS Regression Results
=======================================================================================
Dep. Variable: import R-squared (uncentered): 0.992
Model: OLS Adj. R-squared (uncentered): 0.989
Method: Least Squares F-statistic: 326.4
Date: Thu, 25 Nov 2021 Prob (F-statistic): 1.06e-08
Time: 14:50:53 Log-Likelihood: 11.191
No. Observations: 11 AIC: -16.38
Df Residuals: 8 BIC: -15.19
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
c1 0.6900 0.023 30.653 0.000 0.638 0.742
c2 0.1913 0.032 6.005 0.000 0.118 0.265
c3 1.1597 0.614 1.890 0.095 -0.255 2.574
==============================================================================
Omnibus: 0.241 Durbin-Watson: 2.740
Prob(Omnibus): 0.887 Jarque-Bera (JB): 0.361
Skew: 0.259 Prob(JB): 0.835
Kurtosis: 2.279 Cond. No. 27.3
==============================================================================
Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
D:\Anaconda\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=11
warnings.warn("kurtosistest only valid for n>=20 ... continuing "
10.3 REMOVING DEPENDENCE AMONG THE PREDICTORS
First PC
# Section 10.3, first PC only: regress standardized IMPORT on c1, then map
# the PC coefficient back to standardized-predictor coefficients via the
# first eigenvector.
from itertools import chain  # used later when assembling the summary table

c_data1 = -PCA(1).fit_transform(s_data.iloc[:, 2:5])
data1 = pd.DataFrame(c_data1, columns=["c1"])
α1 = np.array(sm.OLS(s_data['import'], data1).fit().params)
# Eigendecomposition of the standardized predictors' covariance matrix.
eigVals, eigVectors = np.linalg.eig(s_data.iloc[:, 2:5].cov())
# θ1 = v1 * α1: standardized coefficients implied by the one-PC model.
θ1 = np.round((eigVectors.T[0] * α1).reshape(3, 1), 2)
θ1
array([[0.49],
[0.03],
[0.49]])
# Back to original units: β_j = (s_y / s_xj) * θ_j for each predictor.
coeff = [StdVal[1] / StdVal[idx] for idx in range(2, 5)]
c1_β = np.round(np.asarray(coeff).reshape(3, 1) * θ1, 2)
c1_β
array([[0.07],
[0.08],
[0.11]])
# Intercept in original units: β0 = ȳ - x̄ᵀβ.
c1_β0 = meanVal[1] - np.array(meanVal[2:5]).reshape(1, 3).dot(c1_β)
c1_β0[0, 0]
-7.3654545454545435
First and Second
# Section 10.3, first two PCs: regress on c1, c2 and map the PC coefficients
# back to the standardized-coefficient scale via the eigenvectors.
c_data2= - PCA( 2 ) . fit_transform( s_data. iloc[ : , 2 : 5 ] )
data2= pd. DataFrame( c_data2)
data2. columns= [ "c1" , 'c2' ]
# α2: coefficients of standardized IMPORT on the two (sign-flipped) PCs.
α2 = np. array( sm. OLS( s_data[ 'import' ] , data2) . fit( ) . params)
# In-place sign flip so the eigenvector orientation matches the negated
# PCA scores above.
eigVectors. T[ 2 ] = - eigVectors. T[ 2 ]
# NOTE(review): np.linalg.eig does not sort eigenvalues, and T[::2] picks
# eigenvector rows 0 and 2 -- presumably those correspond to the two largest
# eigenvalues for this dataset; verify before reusing on other data.
θ= eigVectors. T[ : : 2 ] * α2. reshape( 2 , 1 )
# θ2 = α2[0]*v1 + α2[1]*v2: implied standardized coefficients, rounded.
θ2 = np. round ( ( θ[ 0 ] + θ[ 1 ] ) . reshape( 3 , 1 ) , 2 )
θ2
array([[0.48],
[0.22],
[0.48]])
# Two-PC model coefficients back in original units.
c2_β = np.round(np.asarray(coeff).reshape(3, 1) * θ2, 2)
c2_β
array([[0.07],
[0.61],
[0.11]])
# Two-PC model intercept in original units.
c2_β0 = meanVal[1] - np.array(meanVal[2:5]).reshape(1, 3).dot(c2_β)
c2_β0[0, 0]
-9.114454545454542
All PCs
# All three PCs reproduce the full standardized OLS fit, so θ3 is simply
# model2's slope coefficients (the constant is dropped).
θ3 = np.round(np.array(model2.params[1:]).reshape(3, 1), 2)
θ3
array([[-0.34],
[ 0.21],
[ 1.3 ]])
# Three-PC (full) model coefficients back in original units.
c3_β = np.round(np.asarray(coeff).reshape(3, 1) * θ3, 2)
c3_β
array([[-0.05],
[ 0.58],
[ 0.29]])
# Three-PC (full) model intercept in original units.
c3_β0 = meanVal[1] - np.array(meanVal[2:5]).reshape(1, 3).dot(c3_β)
c3_β0[0, 0]
-10.8170909090909
# Summary table: standardized and original-unit coefficients for the
# one-, two-, and three-PC models.
# Fixes: removed a dead DataFrame assignment that was immediately
# overwritten, and corrected the constants row -- it previously used c3_β0
# for the one-PC model and hard-coded -5.64 / -10.8 instead of the
# computed intercepts c2_β0 / c3_β0.
table = pd.DataFrame({'Variable': ['doprod', 'stock', 'consum'],
                      'Stand.1': list(chain.from_iterable(θ1)), 'Original': list(chain.from_iterable(c1_β)),
                      'Stand.2': list(chain.from_iterable(θ2)), 'Origina2': list(chain.from_iterable(c2_β)),
                      'Stand.3': list(chain.from_iterable(θ3)), 'Origina3': list(chain.from_iterable(c3_β))})
table2 = table.T
# Append the intercept row (standardized models have a zero constant).
table2[4] = ['constant', 0, c1_β0[0, 0], 0, c2_β0[0, 0], 0, c3_β0[0, 0]]
table2.T
Variable Stand.1 Original Stand.2 Origina2 Stand.3 Origina3 0 doprod 0.49 0.07 0.48 0.07 -0.34 -0.05 1 stock 0.03 0.08 0.22 0.61 0.21 0.58 2 consum 0.49 0.11 0.48 0.11 1.3 0.29 4 constant 0 -10.817091 0 -5.64 0 -10.8
10.5 PRINCIPAL COMPONENTS REGRESSION: A CAUTION
Tables 10.4–10.6
# Tables 10.4-10.6: Hald cement data (P266). Standardize all columns and
# compute the four PCs of the predictors.
# NOTE(review): hard-coded absolute path -- adjust for your machine.
data = pd.read_csv("C:/Users/可乐怪/Desktop/csv/P266.csv")
n = len(data)
meanVal = np.mean(data, axis=0)
CenteringMat = data - meanVal
# ddof=1: sample standard deviation, replacing the manual
# sqrt(n)/sqrt(n-1) correction of the population std (identical result).
StdVal = np.std(data, axis=0, ddof=1)
data1 = CenteringMat / StdVal
pca = PCA(n_components=4)
c_data = pca.fit_transform(data1.iloc[:, 1:6])
data2 = pd.DataFrame(c_data, columns=["c1", "c2", "c3", 'c4'])
data2
c1 c2 c3 c4 0 1.467238 -1.903036 -0.530000 -0.038530 1 2.135829 -0.238354 -0.290186 0.029833 2 -1.129870 -0.183877 -0.010713 0.093701 3 0.659895 -1.576774 0.179204 0.033116 4 -0.358765 -0.483538 -0.740122 -0.019187 5 -0.966640 -0.169944 0.085702 0.012167 6 -0.930705 2.134817 -0.172986 -0.008295 7 2.232138 0.691671 0.459720 -0.022606 8 0.351516 1.432245 -0.031564 0.044988 9 -1.662543 -1.828097 0.851193 -0.019837 10 1.640180 1.295113 0.494178 -0.031389 11 -1.692594 0.392249 -0.019810 -0.037185 12 -1.745679 0.437525 -0.274615 -0.036776
# Table 10.5: regress standardized Y on all four PCs.
model2 = sm.OLS(data1['Y'], data2).fit()
print(model2.summary())
OLS Regression Results
=======================================================================================
Dep. Variable: Y R-squared (uncentered): 0.982
Model: OLS Adj. R-squared (uncentered): 0.975
Method: Least Squares F-statistic: 125.4
Date: Thu, 25 Nov 2021 Prob (F-statistic): 6.94e-08
Time: 14:50:53 Log-Likelihood: 8.3241
No. Observations: 13 AIC: -8.648
Df Residuals: 9 BIC: -6.388
Df Model: 4
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
c1 -0.6570 0.030 -22.198 0.000 -0.724 -0.590
c2 0.0083 0.035 0.236 0.819 -0.071 0.088
c3 0.3028 0.102 2.956 0.016 0.071 0.535
c4 -0.3880 1.098 -0.353 0.732 -2.872 2.096
==============================================================================
Omnibus: 0.165 Durbin-Watson: 2.053
Prob(Omnibus): 0.921 Jarque-Bera (JB): 0.320
Skew: 0.201 Prob(JB): 0.852
Kurtosis: 2.345 Cond. No. 37.1
==============================================================================
Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
D:\Anaconda\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=13
warnings.warn("kurtosistest only valid for n>=20 ... continuing "
The coefficient of c1 is highly significant, while the other three coefficients are not significant.
# Table 10.6: drop c1 and refit -- the remaining PCs explain almost nothing.
model3 = sm.OLS(data1['Y'], data2.iloc[:, 1:]).fit()
print(model3.summary())
OLS Regression Results
=======================================================================================
Dep. Variable: Y R-squared (uncentered): 0.017
Model: OLS Adj. R-squared (uncentered): -0.277
Method: Least Squares F-statistic: 0.05923
Date: Thu, 25 Nov 2021 Prob (F-statistic): 0.980
Time: 14:50:53 Log-Likelihood: -17.811
No. Observations: 13 AIC: 41.62
Df Residuals: 10 BIC: 43.32
Df Model: 3
Covariance Type: nonrobust
==============================================================================
coef std err t P>|t| [0.025 0.975]
------------------------------------------------------------------------------
c2 0.0083 0.250 0.033 0.974 -0.548 0.565
c3 0.3028 0.726 0.417 0.685 -1.314 1.920
c4 -0.3880 7.779 -0.050 0.961 -17.720 16.944
==============================================================================
Omnibus: 2.733 Durbin-Watson: 2.031
Prob(Omnibus): 0.255 Jarque-Bera (JB): 1.199
Skew: -0.332 Prob(JB): 0.549
Kurtosis: 1.669 Cond. No. 31.2
==============================================================================
Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
D:\Anaconda\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=13
warnings.warn("kurtosistest only valid for n>=20 ... continuing "
R² is approximately 0; the remaining three PCs explain almost none of the variation in the response.
Figure 10.1 Scatter plots of U versus each of the PCs of the Hald’s data.
# Figure 10.1: scatter plots of standardized Y against each PC (2x2 grid).
plt.subplots_adjust(hspace=0.4, wspace=0.4)
for pos, (pc, color) in enumerate([('c1', 'r'), ('c2', 'b'),
                                   ('c3', 'teal'), ('c4', 'y')], start=1):
    plt.subplot(2, 2, pos)
    plt.xlabel(pc)
    plt.ylabel('Y')
    plt.scatter(data2[pc], data1['Y'], c=color)
plt.suptitle("Figure10.1", size=20, c="teal")
plt.show()
10.6 RIDGE REGRESSION
# Section 10.6: ridge trace for the IMPORT data over 29 ridge parameters k.
from sklearn.linear_model import Ridge

k = np.linspace(0, 1, 29)
y = s_data['import']
x = s_data.iloc[:, 2:5]
θ1, θ2, θ3, SSE = [], [], [], []
for ridge_k in k:
    fitted = Ridge(ridge_k).fit(x, y)
    b1, b2, b3 = fitted.coef_
    θ1.append(b1)
    θ2.append(b2)
    θ3.append(b3)
    residuals = y - fitted.predict(x)
    SSE.append(np.sum(residuals ** 2))
# Figure 10.2: each standardized coefficient as a function of k.
plt.plot(k, θ1, 'r', alpha=0.5)
plt.plot(k, θ2, 'b-.', alpha=0.5)
plt.plot(k, θ3, 'y+')
plt.xlabel('k', size=15)
plt.ylabel('θj(k)', size=15)
plt.legend(['θ1', 'θ2', 'θ3'], loc='upper right')
plt.title('Figure 10.2 Ridge trace: IMPORT data (1949-1959).', size=20, c='teal')
plt.show()
Table 10.8 Ridge Estimates θj(k),as Functions of the Ridge Parameter k, for the IMPORT Data (1949-1959)
# Table 10.8: ridge estimates θj(k) as functions of k.
θ_data = {'k': k, 'θ1(k)': θ1, 'θ2(k)': θ2, 'θ3(k)': θ3}
table = pd.DataFrame(θ_data)
table
k θ1(k) θ2(k) θ3(k) 0 0.000000 -0.339343 0.213048 1.302682 1 0.035714 0.119735 0.216875 0.841831 2 0.071429 0.248286 0.217448 0.711613 3 0.107143 0.308444 0.217351 0.649814 4 0.142857 0.343111 0.217007 0.613520 5 0.178571 0.365512 0.216547 0.589500 6 0.214286 0.381074 0.216023 0.572327 7 0.250000 0.392434 0.215463 0.559363 8 0.285714 0.401028 0.214880 0.549171 9 0.321429 0.407705 0.214282 0.540901 10 0.357143 0.413000 0.213675 0.534020 11 0.392857 0.417264 0.213062 0.528174 12 0.428571 0.420741 0.212445 0.523121 13 0.464286 0.423602 0.211827 0.518690 14 0.500000 0.425973 0.211207 0.514754 15 0.535714 0.427947 0.210588 0.511220 16 0.571429 0.429596 0.209970 0.508016 17 0.607143 0.430975 0.209353 0.505088 18 0.642857 0.432127 0.208738 0.502391 19 0.678571 0.433088 0.208125 0.499891 20 0.714286 0.433885 0.207514 0.497560 21 0.750000 0.434541 0.206905 0.495374 22 0.785714 0.435075 0.206299 0.493316 23 0.821429 0.435503 0.205696 0.491369 24 0.857143 0.435838 0.205096 0.489520 25 0.892857 0.436091 0.204499 0.487758 26 0.928571 0.436271 0.203904 0.486073 27 0.964286 0.436386 0.203313 0.484458 28 1.000000 0.436444 0.202724 0.482905
Table 10.9
# Table 10.9: SSE(k) and the ridge variance-inflation diagonals,
# diag[(X'X + kI)^-1 X'X (X'X + kI)^-1], for each k.
# Fix: X'X was recomputed inside the loop and np.diagonal called three
# times per iteration -- hoist the invariants out of the loop.
vif1 = []
vif2 = []
vif3 = []
XtX = (x.T) @ x      # loop-invariant
identity3 = np.eye(3)  # loop-invariant
for ki in k:
    inv = np.linalg.inv(XtX + ki * identity3)
    diag = np.diagonal(inv @ XtX @ inv)  # extract the diagonal once
    vif1.append(diag[0])
    vif2.append(diag[1])
    vif3.append(diag[2])
vif_data = {'k': k, 'SSE(k)': SSE, 'vif1(k)': vif1, 'vif2': vif2, 'vif3': vif3}
table = pd.DataFrame(vif_data)
table
k SSE(k) vif1(k) vif2 vif3 0 0.000000 0.084186 17.903500 0.098077 17.914333 1 0.035714 0.096049 3.470869 0.096014 3.472922 2 0.071429 0.103740 1.438630 0.095163 1.439447 3 0.107143 0.108132 0.789462 0.094454 0.789885 4 0.142857 0.111014 0.502711 0.093786 0.502960 5 0.178571 0.113120 0.351321 0.093138 0.351478 6 0.214286 0.114792 0.261766 0.092502 0.261869 7 0.250000 0.116209 0.204426 0.091876 0.204495 8 0.285714 0.117472 0.165506 0.091258 0.165551 9 0.321429 0.118646 0.137876 0.090647 0.137905 10 0.357143 0.119768 0.117550 0.090042 0.117567 11 0.392857 0.120866 0.102157 0.089445 0.102165 12 0.428571 0.121958 0.090216 0.088853 0.090216 13 0.464286 0.123057 0.080762 0.088268 0.080757 14 0.500000 0.124173 0.073146 0.087688 0.073137 15 0.535714 0.125313 0.066917 0.087115 0.066905 16 0.571429 0.126481 0.061756 0.086547 0.061741 17 0.607143 0.127682 0.057428 0.085984 0.057410 18 0.642857 0.128919 0.053761 0.085427 0.053742 19 0.678571 0.130194 0.050625 0.084876 0.050604 20 0.714286 0.131509 0.047921 0.084330 0.047899 21 0.750000 0.132865 0.045570 0.083789 0.045547 22 0.785714 0.134263 0.043513 0.083254 0.043489 23 0.821429 0.135704 0.041701 0.082723 0.041676 24 0.857143 0.137189 0.040096 0.082198 0.040070 25 0.892857 0.138717 0.038666 0.081677 0.038639 26 0.928571 0.140289 0.037385 0.081162 0.037358 27 0.964286 0.141905 0.036232 0.080651 0.036205 28 1.000000 0.143565 0.035191 0.080146 0.035163
Table 10.10
Coefficients at k = 0 (ordinary least squares)
# Standardized coefficients at k = 0 (equivalent to ordinary least squares).
r_θ1 = np.round(Ridge(0).fit(x, y).coef_, 2)
r_θ1
array([-0.34, 0.21, 1.3 ])
# BUG FIX: by this point StdVal holds the Hald (P266) statistics -- it was
# overwritten in section 10.5 -- so the original back-transform used the
# wrong scale factors (it produced [-0.13, 0.19, 0.46] instead of the
# correct ≈[-0.05, 0.58, 0.29] obtained earlier as c3_β with the same θ).
# Recompute the IMPORT (P229) sample standard deviations here.
p229 = pd.read_csv("C:/Users/可乐怪/Desktop/csv/P229.csv")
p229_std = np.std(p229, axis=0, ddof=1)
coeff = [p229_std[1] / p229_std[i] for i in range(2, 5)]
r1_β = np.round(np.array(coeff) * r_θ1, 2)
r1_β
array([-0.13, 0.19, 0.46])
# BUG FIX: meanVal holds the Hald (P266) means here (overwritten in section
# 10.5); recompute the IMPORT (P229) means for the intercept back-transform.
p229_mean = np.mean(pd.read_csv("C:/Users/可乐怪/Desktop/csv/P229.csv"), axis=0)
r1_β0 = np.round(p229_mean[1] - r1_β @ np.array(p229_mean[2:5]), 2)
r1_β0
-2.31
Coefficients at k = 0.4
# Standardized ridge coefficients at k = 0.4.
r_θ2 = np.round(Ridge(0.4).fit(x, y).coef_, 2)
r_θ2
array([0.42, 0.21, 0.53])
# BUG FIX: StdVal holds the Hald (P266) statistics by this point (overwritten
# in section 10.5); recompute the IMPORT (P229) sample standard deviations
# before back-transforming to original units.
p229 = pd.read_csv("C:/Users/可乐怪/Desktop/csv/P229.csv")
p229_std = np.std(p229, axis=0, ddof=1)
coeff = [p229_std[1] / p229_std[i] for i in range(2, 5)]
r2_β = np.round(np.array(coeff) * r_θ2, 2)
r2_β
array([0.16, 0.19, 0.19])
# BUG FIX: meanVal holds the Hald (P266) means here (overwritten in section
# 10.5); recompute the IMPORT (P229) means for the intercept back-transform.
p229_mean = np.mean(pd.read_csv("C:/Users/可乐怪/Desktop/csv/P229.csv"), axis=0)
r2_β0 = np.round(p229_mean[1] - r2_β @ np.array(p229_mean[2:5]), 2)
r2_β0
-8.18
# Table 10.10: OLS (k = 0) vs ridge (k = 0.4) coefficients, standardized
# and in original units.
# Fix: removed a dead DataFrame assignment that was immediately overwritten.
table = pd.DataFrame({'Variable': ['doprod', 'stock', 'consum'],
                      'Stand.1': r_θ1, 'Original': r1_β,
                      'Stand.2': r_θ2, 'Origina2': r2_β})
table2 = table.T
# Append the intercept row (standardized models have a zero constant).
table2[4] = ['constant', 0, r1_β0, 0, r2_β0]
table2.T
Variable Stand.1 Original Stand.2 Origina2 0 doprod -0.34 -0.13 0.42 0.16 1 stock 0.21 0.19 0.21 0.19 2 consum 1.3 0.46 0.53 0.19 4 constant 0 -2.31 0 -8.18