36.高阶回归与分类变量

目录

高阶回归

分类变量


高阶回归

# Y=5 + 2X + 3X^2
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
# Number of observations in the simulated sample.
nsample = 50
# Evenly spaced predictor values on the interval [0, 10].
x = np.linspace(0, 10, nsample)
# Design matrix: the linear term and its square, side by side.
X = np.column_stack((x, np.power(x, 2)))
# Prepend the intercept column of ones.
X = sm.add_constant(X)
X
array([[1.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.00000000e+00, 2.04081633e-01, 4.16493128e-02],
       [1.00000000e+00, 4.08163265e-01, 1.66597251e-01],
       [1.00000000e+00, 6.12244898e-01, 3.74843815e-01],
       [1.00000000e+00, 8.16326531e-01, 6.66389005e-01],
       [1.00000000e+00, 1.02040816e+00, 1.04123282e+00],
       [1.00000000e+00, 1.22448980e+00, 1.49937526e+00],
       [1.00000000e+00, 1.42857143e+00, 2.04081633e+00],
       [1.00000000e+00, 1.63265306e+00, 2.66555602e+00],
       [1.00000000e+00, 1.83673469e+00, 3.37359434e+00],
       [1.00000000e+00, 2.04081633e+00, 4.16493128e+00],
       [1.00000000e+00, 2.24489796e+00, 5.03956685e+00],
       [1.00000000e+00, 2.44897959e+00, 5.99750104e+00],
       [1.00000000e+00, 2.65306122e+00, 7.03873386e+00],
       [1.00000000e+00, 2.85714286e+00, 8.16326531e+00],
       [1.00000000e+00, 3.06122449e+00, 9.37109538e+00],
       [1.00000000e+00, 3.26530612e+00, 1.06622241e+01],
       [1.00000000e+00, 3.46938776e+00, 1.20366514e+01],
       [1.00000000e+00, 3.67346939e+00, 1.34943773e+01],
       [1.00000000e+00, 3.87755102e+00, 1.50354019e+01],
       [1.00000000e+00, 4.08163265e+00, 1.66597251e+01],
       [1.00000000e+00, 4.28571429e+00, 1.83673469e+01],
       [1.00000000e+00, 4.48979592e+00, 2.01582674e+01],
       [1.00000000e+00, 4.69387755e+00, 2.20324865e+01],
       [1.00000000e+00, 4.89795918e+00, 2.39900042e+01],
       [1.00000000e+00, 5.10204082e+00, 2.60308205e+01],
       [1.00000000e+00, 5.30612245e+00, 2.81549354e+01],
       [1.00000000e+00, 5.51020408e+00, 3.03623490e+01],
       [1.00000000e+00, 5.71428571e+00, 3.26530612e+01],
       [1.00000000e+00, 5.91836735e+00, 3.50270721e+01],
       [1.00000000e+00, 6.12244898e+00, 3.74843815e+01],
       [1.00000000e+00, 6.32653061e+00, 4.00249896e+01],
       [1.00000000e+00, 6.53061224e+00, 4.26488963e+01],
       [1.00000000e+00, 6.73469388e+00, 4.53561016e+01],
       [1.00000000e+00, 6.93877551e+00, 4.81466056e+01],
       [1.00000000e+00, 7.14285714e+00, 5.10204082e+01],
       [1.00000000e+00, 7.34693878e+00, 5.39775094e+01],
       [1.00000000e+00, 7.55102041e+00, 5.70179092e+01],
       [1.00000000e+00, 7.75510204e+00, 6.01416077e+01],
       [1.00000000e+00, 7.95918367e+00, 6.33486047e+01],
       [1.00000000e+00, 8.16326531e+00, 6.66389005e+01],
       [1.00000000e+00, 8.36734694e+00, 7.00124948e+01],
       [1.00000000e+00, 8.57142857e+00, 7.34693878e+01],
       [1.00000000e+00, 8.77551020e+00, 7.70095793e+01],
       [1.00000000e+00, 8.97959184e+00, 8.06330696e+01],
       [1.00000000e+00, 9.18367347e+00, 8.43398584e+01],
       [1.00000000e+00, 9.38775510e+00, 8.81299459e+01],
       [1.00000000e+00, 9.59183673e+00, 9.20033319e+01],
       [1.00000000e+00, 9.79591837e+00, 9.59600167e+01],
       [1.00000000e+00, 1.00000000e+01, 1.00000000e+02]])
# True coefficients for Y = 5 + 2*X + 3*X^2.
beta = np.array([5, 2, 3])
# Additive standard-normal noise.
e = np.random.normal(size=nsample)
y = X @ beta + e

# Ordinary least squares: recover the regression coefficients.
model = sm.OLS(y, X)
result = model.fit()
result.params
array([4.91388904, 1.89789012, 3.01647615])
# Render the full text report (coefficients, R-squared, diagnostics).
result.summary()
OLS Regression Results
Dep. Variable: y           R-squared:           1.000
Model: OLS                 Adj. R-squared:      1.000
Method: Least Squares      F-statistic:         2.449e+05
Date: Wed, 19 Sep 2018     Prob (F-statistic):  3.80e-95
Time: 16:48:00             Log-Likelihood:      -68.540
No. Observations: 50       AIC:                 143.1
Df Residuals: 47           BIC:                 148.8
Df Model: 2
Covariance Type: nonrobust
         coef     std err   t        P>|t|   [0.025   0.975]
const    4.9139   0.401     12.257   0.000   4.107    5.720
x1       1.8979   0.185     10.236   0.000   1.525    2.271
x2       3.0165   0.018     168.240  0.000   2.980    3.053
Omnibus: 0.197             Durbin-Watson:       2.100
Prob(Omnibus): 0.906       Jarque-Bera (JB):    0.003
Skew: 0.019                Prob(JB):            0.998
Kurtosis: 3.011            Cond. No.            142.


Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.

# In-sample predictions from the fitted model.
y_fitted = result.fittedvalues
# Overlay the fitted curve on the raw observations.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(x, y, 'o', label='data')
ax.plot(x, y_fitted, 'r--', label='OLS')
ax.legend(loc='best')
plt.show()

分类变量

#假设分类变量有3个取值(a,b,c),创建哑变量:比如考试成绩有3个等级,编码为 a=(1,0,0)、b=(0,1,0)、c=(0,0,1),对应3个系数β0、β1、β2,它们在模型中的贡献为 β0*z0 + β1*z1 + β2*z2(z0,z1,z2 为哑变量,取0或1)
#哑变量的构造方法:分类变量有多少个取值,向量就有多少维;样本所属取值对应的位置为1,其余位置为0。
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
nsamples = 50
# Group labels: samples 0-19 -> group 0, 20-39 -> group 1, 40-49 -> group 2.
groups = np.zeros(nsamples, int)
groups[20:40] = 1  # second group
groups[40:] = 2    # third group
print('groups:', groups)
# One-hot (dummy) encode the labels with NumPy advanced indexing:
# row i is the identity-matrix row of its group, i.e. a 1 in that
# group's column and 0 elsewhere. This replaces
# sm.categorical(groups, drop=True), which was deprecated and then
# removed from statsmodels (0.13+), while producing the same
# (nsamples, n_levels) float array.
dummy = np.eye(groups.max() + 1)[groups]
print('dummy', dummy)
groups: [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 2 2 2 2 2 2 2 2 2 2]
dummy [[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 1. 0.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]
 [0. 0. 1.]]
# Simulate Y = 5 + 2X + 3Z1 + 6Z2 + 9Z3 + e using the dummy columns.
x = np.linspace(0, 10, nsamples)
# Continuous predictor followed by the three group indicators.
X = np.column_stack((x, dummy))
print(X)
# NOTE(review): adding an intercept on top of a complete set of dummy
# columns makes the design rank-deficient (dummy-variable trap) — the
# fit below still runs but its summary warns about a singular matrix.
X = sm.add_constant(X)
beta = [5, 2, 3, 6, 9]
e = np.random.normal(size=nsamples)
y = X @ beta + e
[[ 0.          1.          0.          0.        ]
 [ 0.20408163  1.          0.          0.        ]
 [ 0.40816327  1.          0.          0.        ]
 [ 0.6122449   1.          0.          0.        ]
 [ 0.81632653  1.          0.          0.        ]
 [ 1.02040816  1.          0.          0.        ]
 [ 1.2244898   1.          0.          0.        ]
 [ 1.42857143  1.          0.          0.        ]
 [ 1.63265306  1.          0.          0.        ]
 [ 1.83673469  1.          0.          0.        ]
 [ 2.04081633  1.          0.          0.        ]
 [ 2.24489796  1.          0.          0.        ]
 [ 2.44897959  1.          0.          0.        ]
 [ 2.65306122  1.          0.          0.        ]
 [ 2.85714286  1.          0.          0.        ]
 [ 3.06122449  1.          0.          0.        ]
 [ 3.26530612  1.          0.          0.        ]
 [ 3.46938776  1.          0.          0.        ]
 [ 3.67346939  1.          0.          0.        ]
 [ 3.87755102  1.          0.          0.        ]
 [ 4.08163265  0.          1.          0.        ]
 [ 4.28571429  0.          1.          0.        ]
 [ 4.48979592  0.          1.          0.        ]
 [ 4.69387755  0.          1.          0.        ]
 [ 4.89795918  0.          1.          0.        ]
 [ 5.10204082  0.          1.          0.        ]
 [ 5.30612245  0.          1.          0.        ]
 [ 5.51020408  0.          1.          0.        ]
 [ 5.71428571  0.          1.          0.        ]
 [ 5.91836735  0.          1.          0.        ]
 [ 6.12244898  0.          1.          0.        ]
 [ 6.32653061  0.          1.          0.        ]
 [ 6.53061224  0.          1.          0.        ]
 [ 6.73469388  0.          1.          0.        ]
 [ 6.93877551  0.          1.          0.        ]
 [ 7.14285714  0.          1.          0.        ]
 [ 7.34693878  0.          1.          0.        ]
 [ 7.55102041  0.          1.          0.        ]
 [ 7.75510204  0.          1.          0.        ]
 [ 7.95918367  0.          1.          0.        ]
 [ 8.16326531  0.          0.          1.        ]
 [ 8.36734694  0.          0.          1.        ]
 [ 8.57142857  0.          0.          1.        ]
 [ 8.7755102   0.          0.          1.        ]
 [ 8.97959184  0.          0.          1.        ]
 [ 9.18367347  0.          0.          1.        ]
 [ 9.3877551   0.          0.          1.        ]
 [ 9.59183673  0.          0.          1.        ]
 [ 9.79591837  0.          0.          1.        ]
 [10.          0.          0.          1.        ]]
# Regression analysis.
# The design matrix X is rank-deficient: its intercept column equals the
# sum of the three dummy columns (dummy-variable trap), which is exactly
# what produces the "smallest eigenvalue is 1e-31" / Cond. No. 1.32e+17
# warning in the original output. Dropping the last dummy column makes X
# full rank while spanning the same column space, so the fitted values
# are unchanged; group 3 simply becomes the baseline level absorbed by
# the intercept.
res = sm.OLS(y, X[:, :-1]).fit()
res.summary()
OLS Regression Results
Dep. Variable: y           R-squared:           0.981
Model: OLS                 Adj. R-squared:      0.980
Method: Least Squares      F-statistic:         789.5
Date: Thu, 20 Sep 2018     Prob (F-statistic):  1.50e-39
Time: 10:23:05             Log-Likelihood:      -76.762
No. Observations: 50       AIC:                 161.5
Df Residuals: 46           BIC:                 169.2
Df Model: 3
Covariance Type: nonrobust
         coef     std err   t       P>|t|   [0.025   0.975]
const    7.6313   0.664     11.501  0.000   6.296    8.967
x1       2.1754   0.153     14.247  0.000   1.868    2.483
x2       0.2556   0.421     0.607   0.547   -0.591   1.103
x3       2.2886   0.352     6.508   0.000   1.581    2.997
x4       5.0870   0.792     6.421   0.000   3.492    6.682
Omnibus: 3.522             Durbin-Watson:       2.244
Prob(Omnibus): 0.172       Jarque-Bera (JB):    2.958
Skew: -0.596               Prob(JB):            0.228
Kurtosis: 3.026            Cond. No.            1.32e+17



Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The smallest eigenvalue is 1e-31. This might indicate that there are
strong multicollinearity problems or that the design matrix is singular.

# In-sample predictions from the fitted model.
y_fitted = res.fittedvalues
# Overlay the OLS fit on the raw data.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(x, y, 'o', label='data')
ax.plot(x, y_fitted, 'r--', label='OLS')
ax.legend(loc='best')
plt.show()

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值