构造随机数组:
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
nsample = 20
x = np.linspace(0, 10, nsample)
x
array([ 0. , 0.52631579, 1.05263158, 1.57894737, 2.10526316,
2.63157895, 3.15789474, 3.68421053, 4.21052632, 4.73684211,
5.26315789, 5.78947368, 6.31578947, 6.84210526, 7.36842105,
7.89473684, 8.42105263, 8.94736842, 9.47368421, 10. ])
一元线性回归:
X = sm.add_constant(x)
X
array([[ 1. , 0. ],
[ 1. , 0.52631579],
[ 1. , 1.05263158],
[ 1. , 1.57894737],
[ 1. , 2.10526316],
[ 1. , 2.63157895],
[ 1. , 3.15789474],
[ 1. , 3.68421053],
[ 1. , 4.21052632],
[ 1. , 4.73684211],
[ 1. , 5.26315789],
[ 1. , 5.78947368],
[ 1. , 6.31578947],
[ 1. , 6.84210526],
[ 1. , 7.36842105],
[ 1. , 7.89473684],
[ 1. , 8.42105263],
[ 1. , 8.94736842],
[ 1. , 9.47368421],
[ 1. , 10. ]])
#β0,β1分别设置成2,5
beta = np.array([2, 5])
beta
array([2, 5])
#误差项
e = np.random.normal(size=nsample)
e
array([-1.87955777, -0.49196542, -0.53259826, -0.10675137, 1.17508727,
-1.02293335, 1.75717793, -1.12456907, -0.64294355, 0.44920794,
-1.20471093, 0.02588779, -0.69216987, 1.23018605, 0.59180637,
1.03269883, 0.16712902, 2.06599762, -2.06133608, -0.12899139])
#实际值y
y = np.dot(X, beta) + e
y
array([ 0.12044223, 4.13961353, 6.73055964, 9.78798547,
13.70140306, 14.13496139, 19.54665161, 19.29648356,
22.40968802, 26.13341847, 27.11107855, 30.97325622,
32.8867775 , 37.44071236, 39.43391164, 42.50638304,
44.27239218, 48.80283972, 47.30708497, 51.87100861])
#最小二乘法
model = sm.OLS(y,X)
#拟合数据
res = model.fit()
#回归系数
res.params
#全部结果
res.summary()
#拟合的估计值
y_ = res.fittedvalues
y_
array([ 1.49524076, 4.17261885, 6.84999693, 9.52737502,
12.20475311, 14.8821312 , 17.55950928, 20.23688737,
22.91426546, 25.59164354, 28.26902163, 30.94639972,
33.62377781, 36.30115589, 38.97853398, 41.65591207,
44.33329015, 47.01066824, 49.68804633, 52.36542442])
fig, ax = plt.subplots(figsize=(8,6))
ax.plot(x, y, 'o', label='data')#原始数据
ax.plot(x, y_, 'r--.',label='test')#拟合数据
ax.legend(loc='best')
plt.show()
高阶回归
#Y=5+2⋅X+3⋅X^2
nsample = 50
x = np.linspace(0, 10, nsample)
X = np.column_stack((x, x**2))
X = sm.add_constant(X)
beta = np.array([5, 2, 3])
e = np.random.normal(size=nsample)
y = np.dot(X, beta) + e
model = sm.OLS(y,X)
results = model.fit()
results.params
算出的β0,β1,β2为:
array([5.6633623 , 1.56636248, 3.04483462])
results.summary()
y_fitted = results.fittedvalues
fig, ax = plt.subplots(figsize=(8,6))
ax.plot(x, y, 'o', label='data')
ax.plot(x, y_fitted, 'r--.',label='OLS')
ax.legend(loc='best')
plt.show()
分类变量
假设分类变量有4个取值(a,b,c),比如考试成绩有3个等级。那么a就是(1,0,0),b(0,1,0),c(0,0,1),这个时候就需要3个系数β0,β1,β2,也就是β0x0+β1x1+β2x2
nsample = 50
groups = np.zeros(nsample,int)
groups
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0])
groups[20:40] = 1
groups[40:] = 2
dummy = sm.categorical(groups, drop=True)
dummy
array([[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[1., 0., 0.],
[0., 1., 0.],
[0., 1., 0.],
[0., 1., 0.],
[0., 1., 0.],
[0., 1., 0.],
[0., 1., 0.],
[0., 1., 0.],
[0., 1., 0.],
[0., 1., 0.],
[0., 1., 0.],
[0., 1., 0.],
[0., 1., 0.],
[0., 1., 0.],
[0., 1., 0.],
[0., 1., 0.],
[0., 1., 0.],
[0., 1., 0.],
[0., 1., 0.],
[0., 1., 0.],
[0., 1., 0.],
[0., 0., 1.],
[0., 0., 1.],
[0., 0., 1.],
[0., 0., 1.],
[0., 0., 1.],
[0., 0., 1.],
[0., 0., 1.],
[0., 0., 1.],
[0., 0., 1.],
[0., 0., 1.]])
#Y=5+2X+3Z1+6⋅Z2+9⋅Z3.
x = np.linspace(0, 20, nsample)
X = np.column_stack((x, dummy))
X = sm.add_constant(X)
beta = [5, 2, 3, 6, 9]
e = np.random.normal(size=nsample)
y = np.dot(X, beta) + e
result = sm.OLS(y,X).fit()
result.summary()
fig, ax = plt.subplots(figsize=(8,6))
ax.plot(x, y, 'o', label="data")
ax.plot(x, result.fittedvalues, 'r--.', label="OLS")
ax.legend(loc='best')
plt.show()
ModuleNotFoundError: No module named 'plotly'
在Anaconda Prompt(Anaconda)中安装plotly
pip install plotly
输入conda list查看是否安装成功。
conda list
身高体重案例:
所需数据:
链接:https://pan.baidu.com/s/1yqgu5mNxDz188z8hN6uuGA
提取码:8888
需要安装plotly
import pandas as pd
import statsmodels.api as sm
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)
#Read CSV data
data = pd.read_csv('weight.csv')
data = data[data.height > 120]
画出散点图:
trace = go.Scatter(x=data.height, y=data.weight, mode='markers')
layout = go.Layout(
width=900,
height=500,
xaxis=dict(title='Height',titlefont=dict(family='Consolas, monospace',size=15)),
yaxis=dict(title='Weight',titlefont=dict(family='Consolas, monospace',size=15))
)
data2 = [trace]
fig = go.Figure(data=data2, layout=layout)
iplot(fig)
#Run linear regression
X = data.height
y = data.weight
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
model.summary()
R2的值只有0.594,拟合效果相对较差
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(8,6))
ax.plot(data.height, y, 'o', label="data")
ax.plot(data.height, model.fittedvalues, 'r--.', label="OLS")
ax.legend(loc='best')
plt.show()