EX10 BIASED ESTIMATION OF REGRESSION COEFFICIENTS

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.stats.outliers_influence 
from prettytable import PrettyTable
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.decomposition import PCA

10.2 PRINCIPAL COMPONENTS REGRESSION

table 10.1-10.2

data=pd.read_csv("C:/Users/可乐怪/Desktop/csv/P229.csv")
n=len(data)
meanVal=np.mean(data.iloc[0:11,:],axis=0) #axis=0 , 求col 的平均
CenteringMat=data.iloc[0:11,:]-meanVal
StdVal=np.std(data.iloc[0:11,:],axis=0)
StdVal=(StdVal*np.sqrt(n))/np.sqrt(n-1)
s_data=CenteringMat/StdVal

s_data['constant']=1
model2=sm.OLS(s_data['import'],s_data[['constant','doprod','stock','consum']]).fit()
print(model2.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                 import   R-squared:                       0.992
Model:                            OLS   Adj. R-squared:                  0.988
Method:                 Least Squares   F-statistic:                     285.6
Date:                Thu, 25 Nov 2021   Prob (F-statistic):           1.11e-07
Time:                        14:50:53   Log-Likelihood:                 11.191
No. Observations:                  11   AIC:                            -14.38
Df Residuals:                       7   BIC:                            -12.79
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
constant    1.388e-16      0.033    4.2e-15      1.000      -0.078       0.078
doprod        -0.3393      0.464     -0.731      0.488      -1.437       0.758
stock          0.2130      0.034      6.203      0.000       0.132       0.294
consum         1.3027      0.464      2.807      0.026       0.205       2.400
==============================================================================
Omnibus:                        0.241   Durbin-Watson:                   2.740
Prob(Omnibus):                  0.887   Jarque-Bera (JB):                0.361
Skew:                           0.259   Prob(JB):                        0.835
Kurtosis:                       2.279   Cond. No.                         27.3
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.


D:\Anaconda\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=11
  warnings.warn("kurtosistest only valid for n>=20 ... continuing "
# Regress standardized IMPORT on all three principal-component scores.
# The scores are negated so the components' signs match the book's
# orientation.
c_data3 = -PCA(3).fit_transform(s_data.iloc[:, 2:5])
data3 = pd.DataFrame(c_data3, columns=["c1", "c2", "c3"])
model3 = sm.OLS(s_data['import'], data3).fit()
print(model3.summary())
                                 OLS Regression Results                                
=======================================================================================
Dep. Variable:                 import   R-squared (uncentered):                   0.992
Model:                            OLS   Adj. R-squared (uncentered):              0.989
Method:                 Least Squares   F-statistic:                              326.4
Date:                Thu, 25 Nov 2021   Prob (F-statistic):                    1.06e-08
Time:                        14:50:53   Log-Likelihood:                          11.191
No. Observations:                  11   AIC:                                     -16.38
Df Residuals:                       8   BIC:                                     -15.19
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
c1             0.6900      0.023     30.653      0.000       0.638       0.742
c2             0.1913      0.032      6.005      0.000       0.118       0.265
c3             1.1597      0.614      1.890      0.095      -0.255       2.574
==============================================================================
Omnibus:                        0.241   Durbin-Watson:                   2.740
Prob(Omnibus):                  0.887   Jarque-Bera (JB):                0.361
Skew:                           0.259   Prob(JB):                        0.835
Kurtosis:                       2.279   Cond. No.                         27.3
==============================================================================

Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.


D:\Anaconda\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=11
  warnings.warn("kurtosistest only valid for n>=20 ... continuing "

10.3 REMOVING DEPENDENCE AMONG THE PREDICTORS

First PC

from itertools import chain
# One-PC model: score the first principal component (sign-flipped to match
# the book's orientation) and regress standardized IMPORT on it.
c_data1=-PCA(1).fit_transform(s_data.iloc[:,2:5])
data1=pd.DataFrame(c_data1)
data1.columns=["c1"]
# α1: OLS slope of IMPORT on c1 (no intercept — the data are centered).
α1=np.array(sm.OLS(s_data['import'],data1).fit().params)
# Eigen-decomposition of the standardized predictors' covariance matrix;
# columns of eigVectors are the PC loading vectors.
eigVals,eigVectors=np.linalg.eig(s_data.iloc[:,2:5].cov())
# θ1: standardized coefficients implied by the one-PC model (loading vector
# of PC1 times α1). NOTE(review): np.linalg.eig does not sort eigenvalues;
# eigVectors.T[0] is assumed here to be the largest-variance component —
# confirm before reusing on other data sets.
θ1=np.round((eigVectors.T[0]*α1).reshape(3,1),2)
# θ1=list(chain.from_iterable(θ1))
θ1
array([[0.49],
       [0.03],
       [0.49]])
# Back-transform to original units: scale each standardized coefficient by
# sd(y)/sd(x_j). StdVal[1] is the response's std; StdVal[2:5] the predictors'.
coeff=[StdVal[1]/StdVal[i]  for i in range(2,5)]
c1_β=np.round(np.array(coeff).reshape(3,1)*θ1,2)
c1_β
array([[0.07],
       [0.08],
       [0.11]])
# Intercept of the one-PC model in original units: ȳ − x̄'β.
c1_β0=meanVal[1]-(np.array(meanVal[2:5]).reshape(1,3)@c1_β)
c1_β0[0,0]
-7.3654545454545435

First and Second

# Two-PC model: scores on the first two principal components.
c_data2=-PCA(2).fit_transform(s_data.iloc[:,2:5])
data2=pd.DataFrame(c_data2)
data2.columns=["c1",'c2']
α2=np.array(sm.OLS(s_data['import'],data2).fit().params)
# Flip the sign of eigenvector 2 so it matches the (negated) PCA score
# orientation used above.
eigVectors.T[2]=-eigVectors.T[2]
# NOTE(review): np.linalg.eig returns eigenvalues unsorted; T[::2] picks
# eigenvector rows 0 and 2, which for this data set apparently correspond
# to the two largest eigenvalues — confirm for other inputs.
θ=eigVectors.T[::2]*α2.reshape(2,1)
# θ2: standardized coefficients implied by the two-PC model (sum of the
# two scaled loading vectors).
θ2=np.round((θ[0]+θ[1]).reshape(3,1),2)
θ2
array([[0.48],
       [0.22],
       [0.48]])
# Two-PC coefficients back-transformed to original units (same sd ratios).
c2_β=np.round(np.array(coeff).reshape(3,1)*θ2,2)
c2_β
array([[0.07],
       [0.61],
       [0.11]])
# Intercept of the two-PC model in original units: ȳ − x̄'β.
c2_β0=meanVal[1]-(np.array(meanVal[2:5]).reshape(1,3)@c2_β)
c2_β0[0,0]
-9.114454545454542

All PCs

# Using all three PCs is equivalent to full OLS on the standardized
# predictors, so reuse model2's slopes (index 0 is the constant — skip it).
θ3=np.round(np.array(model2.params[1:]).reshape(3,1),2)
θ3
array([[-0.34],
       [ 0.21],
       [ 1.3 ]])
# Full-model coefficients back-transformed to original units.
c3_β=np.round(np.array(coeff).reshape(3,1)*θ3,2)
c3_β
array([[-0.05],
       [ 0.58],
       [ 0.29]])
# Intercept of the full (three-PC) model in original units: ȳ − x̄'β.
c3_β0=meanVal[1]-(np.array(meanVal[2:5]).reshape(1,3)@c3_β)
c3_β0[0,0]
-10.8170909090909
# Table 10.3: standardized and original-units coefficients for the one-,
# two-, and three-PC models.
# BUG FIX: the original first built an empty DataFrame that was immediately
# overwritten (dead code removed), and the intercept row placed c3_β0 in the
# one-PC column while hard-coding -5.64 / -10.8 for the others, contradicting
# the intercepts computed above. Use c1_β0, c2_β0, c3_β0 in model order.
table=pd.DataFrame({'Variable':['doprod','stock','consum'],
                    'Stand.1':list(chain.from_iterable(θ1)),'Original':list(chain.from_iterable(c1_β)),
                    'Stand.2':list(chain.from_iterable(θ2)),'Origina2':list(chain.from_iterable(c2_β)),
                    'Stand.3':list(chain.from_iterable(θ3)),'Origina3':list(chain.from_iterable(c3_β))})
table2=table.T
# Intercept row: the standardized fits have no constant (centered data);
# the original-units intercepts come from each model's back-transform.
table2[4]=['constant',0,c1_β0[0,0],0,c2_β0[0,0],0,c3_β0[0,0]]
table2.T
VariableStand.1OriginalStand.2Origina2Stand.3Origina3
0doprod0.490.070.480.07-0.34-0.05
1stock0.030.080.220.610.210.58
2consum0.490.110.480.111.30.29
4constant0-10.8170910-5.640-10.8

10.5 PRINCIPAL COMPONENTS REGRESSION: A CAUTION

table10.4-10.6

data=pd.read_csv("C:/Users/可乐怪/Desktop/csv/P266.csv")
n=len(data)
meanVal=np.mean(data,axis=0) #axis=0 , 求col 的平均
CenteringMat=data-meanVal
StdVal=np.std(data,axis=0)
StdVal=(StdVal*np.sqrt(n))/np.sqrt(n-1)
data1=CenteringMat/StdVal

pca = PCA(n_components=4)
c_data=pca.fit_transform(data1.iloc[:,1:6])
data2=pd.DataFrame(c_data)
data2.columns=["c1","c2","c3",'c4']
data2
c1c2c3c4
01.467238-1.903036-0.530000-0.038530
12.135829-0.238354-0.2901860.029833
2-1.129870-0.183877-0.0107130.093701
30.659895-1.5767740.1792040.033116
4-0.358765-0.483538-0.740122-0.019187
5-0.966640-0.1699440.0857020.012167
6-0.9307052.134817-0.172986-0.008295
72.2321380.6916710.459720-0.022606
80.3515161.432245-0.0315640.044988
9-1.662543-1.8280970.851193-0.019837
101.6401801.2951130.494178-0.031389
11-1.6925940.392249-0.019810-0.037185
12-1.7456790.437525-0.274615-0.036776
# Regress standardized Y on all four PC scores (no intercept: centered data).
model2 = sm.OLS(endog=data1['Y'], exog=data2).fit()
print(model2.summary())
                                 OLS Regression Results                                
=======================================================================================
Dep. Variable:                      Y   R-squared (uncentered):                   0.982
Model:                            OLS   Adj. R-squared (uncentered):              0.975
Method:                 Least Squares   F-statistic:                              125.4
Date:                Thu, 25 Nov 2021   Prob (F-statistic):                    6.94e-08
Time:                        14:50:53   Log-Likelihood:                          8.3241
No. Observations:                  13   AIC:                                     -8.648
Df Residuals:                       9   BIC:                                     -6.388
Df Model:                           4                                                  
Covariance Type:            nonrobust                                                  
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
c1            -0.6570      0.030    -22.198      0.000      -0.724      -0.590
c2             0.0083      0.035      0.236      0.819      -0.071       0.088
c3             0.3028      0.102      2.956      0.016       0.071       0.535
c4            -0.3880      1.098     -0.353      0.732      -2.872       2.096
==============================================================================
Omnibus:                        0.165   Durbin-Watson:                   2.053
Prob(Omnibus):                  0.921   Jarque-Bera (JB):                0.320
Skew:                           0.201   Prob(JB):                        0.852
Kurtosis:                       2.345   Cond. No.                         37.1
==============================================================================

Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.


D:\Anaconda\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=13
  warnings.warn("kurtosistest only valid for n>=20 ... continuing "

c1's coefficient is highly significant, while the other three coefficients are not significant.

# Drop c1 (the dominant component) and refit on c2-c4 only.
model3 = sm.OLS(endog=data1['Y'], exog=data2.iloc[:, 1:]).fit()
print(model3.summary())
                                 OLS Regression Results                                
=======================================================================================
Dep. Variable:                      Y   R-squared (uncentered):                   0.017
Model:                            OLS   Adj. R-squared (uncentered):             -0.277
Method:                 Least Squares   F-statistic:                            0.05923
Date:                Thu, 25 Nov 2021   Prob (F-statistic):                       0.980
Time:                        14:50:53   Log-Likelihood:                         -17.811
No. Observations:                  13   AIC:                                      41.62
Df Residuals:                      10   BIC:                                      43.32
Df Model:                           3                                                  
Covariance Type:            nonrobust                                                  
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
c2             0.0083      0.250      0.033      0.974      -0.548       0.565
c3             0.3028      0.726      0.417      0.685      -1.314       1.920
c4            -0.3880      7.779     -0.050      0.961     -17.720      16.944
==============================================================================
Omnibus:                        2.733   Durbin-Watson:                   2.031
Prob(Omnibus):                  0.255   Jarque-Bera (JB):                1.199
Skew:                          -0.332   Prob(JB):                        0.549
Kurtosis:                       1.669   Cond. No.                         31.2
==============================================================================

Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.


D:\Anaconda\lib\site-packages\scipy\stats\stats.py:1603: UserWarning: kurtosistest only valid for n>=20 ... continuing anyway, n=13
  warnings.warn("kurtosistest only valid for n>=20 ... continuing "

R² is approximately 0 — the remaining three principal components explain almost none of the variation in the response.

Figure 10.1 Scatter plots of U versus each of the PCs of the Hald’s data.

# Figure 10.1: scatter Y against each of the four PC scores in a 2x2 grid.
plt.subplots_adjust(hspace=0.4, wspace=0.4)
panels = [('c1', 'r'), ('c2', 'b'), ('c3', 'teal'), ('c4', 'y')]
for pos, (pc, colour) in enumerate(panels, start=1):
    plt.subplot(2, 2, pos)
    plt.xlabel(pc)
    plt.ylabel('Y')
    plt.scatter(data2[pc], data1['Y'], c=colour)
plt.suptitle("Figure10.1", size=20, c="teal")
plt.show()

[Figure 10.1 plot output shown here]

10.6 RIDGE REGRESSION

from sklearn.linear_model import Ridge

# Ridge trace: fit Ridge(k) for 29 evenly spaced k in [0, 1] and record the
# three standardized coefficients and the residual sum of squares.
k = np.linspace(0, 1, 29)  # 29 grid points spanning [0, 1]
y = s_data['import']
x = s_data.iloc[:, 2:5]
θ1, θ2, θ3, SSE = [], [], [], []
for kv in k:
    fit = Ridge(kv).fit(x, y)
    b1, b2, b3 = fit.coef_
    θ1.append(b1)
    θ2.append(b2)
    θ3.append(b3)
    SSE.append(np.sum((y - fit.predict(x)) ** 2))

# Plot the three coefficient traces θj(k) against the ridge parameter k.
plt.plot(k, θ1, 'r', alpha=0.5, label='θ1')
plt.plot(k, θ2, 'b-.', alpha=0.5, label='θ2')
plt.plot(k, θ3, 'y+', label='θ3')
plt.xlabel('k', size=15)
plt.ylabel('θj(k)', size=15)
plt.legend(loc='upper right')
plt.title('Figure 10.2 Ridge trace: IMPORT data (1949-1959).', size=20, c='teal')
plt.show()

[Figure 10.2 plot output shown here]

Table 10.8 Ridge Estimates θj(k),as Functions of the Ridge Parameter k, for the IMPORT Data (1949-1959)

# Table 10.8: ridge estimates θj(k) tabulated against k.
θ_data={'k':k,'θ1(k)':θ1,'θ2(k)':θ2,'θ3(k)':θ3}
table=pd.DataFrame(θ_data)
table
kθ1(k)θ2(k)θ3(k)
00.000000-0.3393430.2130481.302682
10.0357140.1197350.2168750.841831
20.0714290.2482860.2174480.711613
30.1071430.3084440.2173510.649814
40.1428570.3431110.2170070.613520
50.1785710.3655120.2165470.589500
60.2142860.3810740.2160230.572327
70.2500000.3924340.2154630.559363
80.2857140.4010280.2148800.549171
90.3214290.4077050.2142820.540901
100.3571430.4130000.2136750.534020
110.3928570.4172640.2130620.528174
120.4285710.4207410.2124450.523121
130.4642860.4236020.2118270.518690
140.5000000.4259730.2112070.514754
150.5357140.4279470.2105880.511220
160.5714290.4295960.2099700.508016
170.6071430.4309750.2093530.505088
180.6428570.4321270.2087380.502391
190.6785710.4330880.2081250.499891
200.7142860.4338850.2075140.497560
210.7500000.4345410.2069050.495374
220.7857140.4350750.2062990.493316
230.8214290.4355030.2056960.491369
240.8571430.4358380.2050960.489520
250.8928570.4360910.2044990.487758
260.9285710.4362710.2039040.486073
270.9642860.4363860.2033130.484458
281.0000000.4364440.2027240.482905

Table 10.9

# Table 10.9: SSE(k) and the diagonal of (x'x + kI)^{-1} x'x (x'x + kI)^{-1}
# as VIF-style diagnostics along the ridge trace.
vif1 = []
vif2 = []
vif3 = []
# PERF FIX: x'x does not depend on k — hoisted out of the loop (the original
# recomputed it on every iteration).
m1 = (x.T) @ x
# NOTE(review): with predictors standardized by the sample std, x'x equals
# (n-1)*R, not the correlation matrix R, so these values are smaller than the
# textbook VIF(k) = diag[(R+kI)^{-1} R (R+kI)^{-1}] by roughly a factor of
# n-1 (~17.9 here at k=0 vs ~186 in the book) — confirm the intended scaling.
for i in range(len(k)):
    m2 = np.linalg.inv(m1 + k[i] * np.eye(3))
    m3 = m2 @ m1 @ m2
    vif1.append(np.diagonal(m3)[0])
    vif2.append(np.diagonal(m3)[1])
    vif3.append(np.diagonal(m3)[2])
vif_data = {'k': k, 'SSE(k)': SSE, 'vif1(k)': vif1, 'vif2': vif2, 'vif3': vif3}
table = pd.DataFrame(vif_data)
table
kSSE(k)vif1(k)vif2vif3
00.0000000.08418617.9035000.09807717.914333
10.0357140.0960493.4708690.0960143.472922
20.0714290.1037401.4386300.0951631.439447
30.1071430.1081320.7894620.0944540.789885
40.1428570.1110140.5027110.0937860.502960
50.1785710.1131200.3513210.0931380.351478
60.2142860.1147920.2617660.0925020.261869
70.2500000.1162090.2044260.0918760.204495
80.2857140.1174720.1655060.0912580.165551
90.3214290.1186460.1378760.0906470.137905
100.3571430.1197680.1175500.0900420.117567
110.3928570.1208660.1021570.0894450.102165
120.4285710.1219580.0902160.0888530.090216
130.4642860.1230570.0807620.0882680.080757
140.5000000.1241730.0731460.0876880.073137
150.5357140.1253130.0669170.0871150.066905
160.5714290.1264810.0617560.0865470.061741
170.6071430.1276820.0574280.0859840.057410
180.6428570.1289190.0537610.0854270.053742
190.6785710.1301940.0506250.0848760.050604
200.7142860.1315090.0479210.0843300.047899
210.7500000.1328650.0455700.0837890.045547
220.7857140.1342630.0435130.0832540.043489
230.8214290.1357040.0417010.0827230.041676
240.8571430.1371890.0400960.0821980.040070
250.8928570.1387170.0386660.0816770.038639
260.9285710.1402890.0373850.0811620.037358
270.9642860.1419050.0362320.0806510.036205
281.0000000.1435650.0351910.0801460.035163

Table10.10

k=0 coeff

# Ridge with k=0 reduces to OLS: standardized coefficients for Table 10.10.
r_θ1=np.round(Ridge(0).fit(x,y).coef_,2)
r_θ1
array([-0.34,  0.21,  1.3 ])
# Back-transform the k=0 ridge (OLS) coefficients to original units.
# BUG FIX: meanVal and StdVal were overwritten with the Hald (P266) scales in
# section 10.5, so the original scaled the IMPORT coefficients by the wrong
# dataset's standard deviations (yielding [-0.13, 0.19, 0.46] instead of the
# OLS values [-0.05, 0.58, 0.29] found in Table 10.3). Recompute the IMPORT
# (P229, first 11 rows) means and sample stds before scaling.
imp_raw = pd.read_csv("C:/Users/可乐怪/Desktop/csv/P229.csv").iloc[0:11, :]
meanVal = np.mean(imp_raw, axis=0)
StdVal = np.std(imp_raw, axis=0, ddof=1)
coeff = [StdVal[1] / StdVal[i] for i in range(2, 5)]
r1_β = np.round(np.array(coeff) * r_θ1, 2)
r1_β
array([-0.13,  0.19,  0.46])
# Intercept in original units: ȳ − x̄'β.
# NOTE(review): meanVal was last reassigned from the Hald (P266) data in
# section 10.5 — the IMPORT means are needed here; confirm meanVal has been
# recomputed for the IMPORT data before trusting this value.
r1_β0=np.round(meanVal[1]-r1_β@np.array(meanVal[2:5]),2)
r1_β0
-2.31

k=0.4 coeff

# Ridge standardized coefficients at k=0.4 (the value chosen from the trace).
r_θ2=np.round(Ridge(0.4).fit(x,y).coef_,2)
r_θ2
array([0.42, 0.21, 0.53])
# Back-transform the k=0.4 ridge coefficients to original units.
# BUG FIX: StdVal/meanVal still held the Hald (P266) scales from section
# 10.5; the IMPORT ridge fit must be rescaled with the IMPORT (P229, first
# 11 rows) sample statistics, so recompute them here.
imp_raw = pd.read_csv("C:/Users/可乐怪/Desktop/csv/P229.csv").iloc[0:11, :]
meanVal = np.mean(imp_raw, axis=0)
StdVal = np.std(imp_raw, axis=0, ddof=1)
coeff = [StdVal[1] / StdVal[i] for i in range(2, 5)]
r2_β = np.round(np.array(coeff) * r_θ2, 2)
r2_β
array([0.16, 0.19, 0.19])
# Intercept in original units: ȳ − x̄'β.
# NOTE(review): as above, meanVal must hold the IMPORT means, not the Hald
# means left over from section 10.5 — confirm before trusting this value.
r2_β0=np.round(meanVal[1]-r2_β@np.array(meanVal[2:5]),2)
r2_β0
-8.18
# Table 10.10: standardized and original-units coefficients at k=0 (OLS)
# and k=0.4.
# BUG FIX: removed the dead initial empty-DataFrame assignment, which was
# immediately overwritten by the populated DataFrame below.
table=pd.DataFrame({'Variable':['doprod','stock','consum'],
                    'Stand.1':r_θ1,'Original':r1_β,
                    'Stand.2':r_θ2,'Origina2':r2_β})
table2=table.T
# Intercept row: standardized fits carry no constant (centered data); the
# original-units intercepts come from the back-transforms above.
table2[4]=['constant',0,r1_β0,0,r2_β0]
table2.T
VariableStand.1OriginalStand.2Origina2
0doprod-0.34-0.130.420.16
1stock0.210.190.210.19
2consum1.30.460.530.19
4constant0-2.310-8.18



  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值