Ex1
table 9.3
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.stats.outliers_influence
from prettytable import PrettyTable
data['constant']=1
data['index']=range(1,71)
print(model.summary())
outliers=model.get_influence()
res=outliers.resid_studentized_internal
plt.xlabel("Predicted",size=15)
plt.ylabel("Residuals",size=15)
plt.title("Figure9.1",size=20,c="teal")
plt.axhline(y=0,ls="-.",c="r",alpha=0.5)
plt.axvline(x=0,ls="-.",c="r",alpha=0.5)
color=['r' if abs(i)>2.5 else 'b' for i in res]
for i in range(len(res)):
if abs(res[i])>2.5:
plt.annotate("(%s,%s)" % (data['index'][i],round(res[i],2)),xy=(predicted[i],res[i]),size=15)
plt.scatter(predicted,res,c=color)
f=f.ppf(0.99,3,66)
f
R^2=0.206;F= 5.717>4.093,
it is accepted that FAM, PEER, and SCHOOL are valid predictor variables. However, the individual t-valuesare all small. In total, the summary statistics say that the three predictors taken together are important but from the t-values, it follows that any one predictor may be deleted from the model provided the other two are retained.
extreme multicollinearity is present
Scatter plots of the three predictor variables
x_mean=np.mean(x)
y_mean=np.mean(y)
fenzi=0
for i in range(len(x)):
fenzi+=(x[i]-x_mean)*(y[i]-y_mean)
fenmu1=0
for i in range(len(x)):
fenmu1+=(x[i]-x_mean)**2
fenmu2=0
for i in range(len(x)):
fenmu2+=(y[i]-y_mean)**2
fenmu=(fenmu1*fenmu2)**0.5
cor=round(fenzi/fenmu,3)
return cor
plt.xticks([])
plt.yticks([])
plt.subplots_adjust(hspace=0,wspace=0)#wspace, hspace:子图之间的横向间距、纵向间距分别与子图平均宽度、平均高度的比值。
plt.text(0.3,0.4,'FAM',c='b',size=15)
plt.subplot(332)
plt.xticks([])
plt.yticks([])
plt.scatter(data['PEER'],data['FAM'],c='r',s=2)
plt.subplot(333)
plt.xticks([])
plt.yticks([])
plt.scatter(data['SCHOOL'],data['FAM'],s=3.5,c='r')
plt.subplot(334)
plt.xticks([])
plt.yticks([])
plt.text(0.3,0.4,cor(data['FAM'],data['PEER']),c='g',size=15)
plt.subplot(335)
plt.xticks([])
plt.yticks([])
plt.text(0.3,0.4,'PEER',c='b',size=15)
plt.subplot(336)
plt.xticks([])
plt.yticks([])
plt.scatter(data['SCHOOL'],data['PEER'],s=3.5,c='r')
plt.subplot(337)
plt.xticks([])
plt.yticks([])
plt.text(0.3,0.5,cor(data['FAM'],data['SCHOOL']),c='g',size=15)
plt.subplot(338)
plt.xticks([])
plt.yticks([])
plt.text(0.3,0.5,cor(data['PEER'],data['SCHOOL']),c='g',size=15)
plt.subplot(339)
plt.xticks([])
plt.yticks([])
plt.text(0.3,0.5,'SCHOOL',c='b',size=15)
plt.suptitle("Figure9.2",size=20,c="teal")
plt.show()
Ex2 aggregate data concerning import activity in the French economy
table9.6
1
data['constant']=1
data['index']=range(1,19)
y=data['import']
x=data.iloc[:,2:5]
x=x.values
y=y.values
ones=np.ones(shape=((X.shape)[0],1))
X=np.hstack([ones,X])
beta=(np.linalg.inv((X.T).dot(X))).dot(X.T).dot(Y)
y_yuce=X.dot(beta)
y_mean=np.mean(Y)
SSE=0
SST=0
length=np.shape(X)[0]
for i in range(length):
SSE+=((Y[i]-y_yuce[i])**2)
SST+=((Y[i]-y_mean)**2)
SST=np.round(SST,4)
SSE=np.round(SSE,4)
SSR=np.round(SST-SSE,4)
df_SSR=np.shape(X)[1]-1
df_SSE=np.shape(X)[0]-np.shape(X)[1]
MSR=np.round(SSR/df_SSR,4)
MSE=np.round(SSE/df_SSE,4)
F=np.round(MSR/MSE,4)
label=['source','sum of square','df','mean square']
table=PrettyTable(label)
table.add_row(['regression',SSR,df_SSR,MSR])
table.add_row(['residuals',SSE,df_SSE,MSE])
table.add_column('F',[F,'--'])
print(table)
return
y=data['import']
x=data.iloc[:,2:5]
x=x.values
y=y.values
F(y,x)
2
print(model.summary())
res=outliers.resid_studentized_internal
plt.plot(data['index'],res,'bo-')
plt.xlabel("Index",size=15)
plt.ylabel("Residuals",size=15)
plt.title("Figure9.3",size=20,c='teal')
plt.axhline(y=0,ls="-.",c="r",alpha=0.5)
plt.legend(['Import data (1949-1966)'],loc=(0.3, 0.9))
plt.show()
1.图9.3出现特别模式,模型不适合.multicollinearity is present(虽然R^2=0.973但t都很小)
2.根据资料我们去改进model如何构造model2
table 9.71___9.72
1
model2=sm.OLS(data2['import'],data2[['constant','doprod','stock','consum']]).fit()
print(model2.summary())
2
x=data2.iloc[:,2:5]
x=x.values
y=y.values
F(y,x)
res2=outliers2.resid_studentized_internal
plt.plot(data2['index'],res2,'bo-')
plt.xlabel("Index",size=15)
plt.ylabel("Residuals",size=15)
plt.title("Figure9.4",size=20,c='teal')
plt.axhline(y=0,ls="-.",c="r",alpha=0.5)
plt.legend(['Import data (1949-1959)'],loc=(0.3, 0.9))
plt.show()
图9.4**没有特别模式了纯随机,R^2=0.9
但是doprod的系数是负的并且无显著性与假设矛盾。推测multicollinearity is present
Regression Coefficients for All Possible Regressions
model2=sm.OLS(data2['import'],data2[['constant','stock']]).fit()
model3=sm.OLS(data2['import'],data2[['constant','consum']]).fit()
model4=sm.OLS(data2['import'],data2[['constant','doprod','stock']]).fit()
model5=sm.OLS(data2['import'],data2[['constant','doprod','consum']]).fit()
model6=sm.OLS(data2['import'],data2[['constant','stock','consum']]).fit()
model7=sm.OLS(data2['import'],data2[['constant','doprod','stock','consum']]).fit()
a1.insert(2,'--')
a1.insert(3,'--')
a1.insert(0,1)
a2=np.round(model2.params,3).values.tolist()
a2.insert(1,'--')
a2.insert(3,'--')
a2.insert(0,2)
a3=np.round(model3.params,3).values.tolist()
a3.insert(1,'--')
a3.insert(2,'--')
a3.insert(0,3)
a4=np.round(model4.params,3).values.tolist()
a4.insert(3,'--')
a4.insert(0,4)
a5=np.round(model5.params,3).values.tolist()
a5.insert(2,'--')
a5.insert(0,5)
a6=np.round(model6.params,3).values.tolist()
a6.insert(1,'--')
a6.insert(0,6)
a7=np.round(model7.params,3).values.tolist()
a7.insert(0,7)
table=PrettyTable(['Regression','Constant','DOPROD','STOCK','CONSUM'])
table.add_row(a1)
table.add_row(a2)
table.add_row(a3)
table.add_row(a4)
table.add_row(a5)
table.add_row(a6)
table.add_row(a7)
print('table 9.8')
print(table)
Ex3 Aggregate sales of a firm in period t
data['constant']=1
data['index']=range(1,23)
model=sm.OLS(data['s_t'],data[['constant','a_t','p_t','e_t','a_t1','p_t1']]).fit()
print(model.summary())
predicted=model.fittedvalues
outliers=model.get_influence()
res=outliers.resid_studentized_internal
plt.xlabel("Predicted",size=15)
plt.ylabel("Residuals",size=15)
plt.title("Figure9.5",size=20,c="teal")
plt.scatter(predicted,res,c='b')
plt.show()
res=outliers.resid_studentized_internal
plt.plot(data['index'],res,'bo-')
plt.xlabel("Index",size=15)
plt.ylabel("Residuals",size=15)
plt.title("Figure9.6",size=20,c='teal')
plt.axhline(y=0,ls="-.",c="r",alpha=0.5)
plt.show()
散点图和残差图没有任何模式随机分布
x=data.iloc[:,1:6]
x=x.values
y=y.values
F(y,x)
VIF
vif3=[]
for i in range(5):
a=round(variance_inflation_factor(data.iloc[:,1:7].values,i),2)
vif3.insert(i,a)
print(vif3)
table=pd.DataFrame({'vif1':vif1,'vif2':vif2,'vif3':vif3})
table