回归分析python

构造随机数组:

import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

nsample = 20
x = np.linspace(0, 10, nsample)
x
array([ 0.        ,  0.52631579,  1.05263158,  1.57894737,  2.10526316,
        2.63157895,  3.15789474,  3.68421053,  4.21052632,  4.73684211,
        5.26315789,  5.78947368,  6.31578947,  6.84210526,  7.36842105,
        7.89473684,  8.42105263,  8.94736842,  9.47368421, 10.        ])

一元线性回归:

X = sm.add_constant(x)
X
array([[ 1.        ,  0.        ],
       [ 1.        ,  0.52631579],
       [ 1.        ,  1.05263158],
       [ 1.        ,  1.57894737],
       [ 1.        ,  2.10526316],
       [ 1.        ,  2.63157895],
       [ 1.        ,  3.15789474],
       [ 1.        ,  3.68421053],
       [ 1.        ,  4.21052632],
       [ 1.        ,  4.73684211],
       [ 1.        ,  5.26315789],
       [ 1.        ,  5.78947368],
       [ 1.        ,  6.31578947],
       [ 1.        ,  6.84210526],
       [ 1.        ,  7.36842105],
       [ 1.        ,  7.89473684],
       [ 1.        ,  8.42105263],
       [ 1.        ,  8.94736842],
       [ 1.        ,  9.47368421],
       [ 1.        , 10.        ]])
#β0,β1分别设置成2,5
beta = np.array([2, 5])
beta
array([2, 5])
#误差项
e = np.random.normal(size=nsample)
e
array([-1.87955777, -0.49196542, -0.53259826, -0.10675137,  1.17508727,
       -1.02293335,  1.75717793, -1.12456907, -0.64294355,  0.44920794,
       -1.20471093,  0.02588779, -0.69216987,  1.23018605,  0.59180637,
        1.03269883,  0.16712902,  2.06599762, -2.06133608, -0.12899139])
#实际值y
y = np.dot(X, beta) + e
y
array([  0.12044223,   4.13961353,   6.73055964,   9.78798547,
        13.70140306,  14.13496139,  19.54665161,  19.29648356,
        22.40968802,  26.13341847,  27.11107855,  30.97325622,
        32.8867775 ,  37.44071236,  39.43391164,  42.50638304,
        44.27239218,  48.80283972,  47.30708497,  51.87100861])
#最小二乘法
model = sm.OLS(y,X)
#拟合数据
res = model.fit()
#回归系数
res.params
#全部结果
res.summary()

#拟合的估计值
y_ = res.fittedvalues
y_
array([  1.49524076,   4.17261885,   6.84999693,   9.52737502,
        12.20475311,  14.8821312 ,  17.55950928,  20.23688737,
        22.91426546,  25.59164354,  28.26902163,  30.94639972,
        33.62377781,  36.30115589,  38.97853398,  41.65591207,
        44.33329015,  47.01066824,  49.68804633,  52.36542442])
fig, ax = plt.subplots(figsize=(8,6))
ax.plot(x, y, 'o', label='data')#原始数据
ax.plot(x, y_, 'r--.',label='test')#拟合数据
ax.legend(loc='best')
plt.show()

              

 高阶回归

#Y=5+2⋅X+3⋅X^2
 
nsample = 50
x = np.linspace(0, 10, nsample)
X = np.column_stack((x, x**2))
X = sm.add_constant(X)
beta = np.array([5, 2, 3])
e = np.random.normal(size=nsample)
y = np.dot(X, beta) + e
model = sm.OLS(y,X)
results = model.fit()
results.params

 算出的β0,β1,β2为:

array([5.6633623 , 1.56636248, 3.04483462])
results.summary()

y_fitted = results.fittedvalues
fig, ax = plt.subplots(figsize=(8,6))
ax.plot(x, y, 'o', label='data')
ax.plot(x, y_fitted, 'r--.',label='OLS')
ax.legend(loc='best')
plt.show()

分类变量

假设分类变量有4个取值(a,b,c),比如考试成绩有3个等级。那么a就是(1,0,0),b(0,1,0),c(0,0,1),这个时候就需要3个系数β0,β1,β2,也就是β0x0+β1x1+β2x2

nsample = 50
groups = np.zeros(nsample,int)
groups
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])
groups[20:40] = 1
groups[40:] = 2
dummy = sm.categorical(groups, drop=True)
dummy
array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])
#Y=5+2X+3Z1+6⋅Z2+9⋅Z3.
 
x = np.linspace(0, 20, nsample)
X = np.column_stack((x, dummy))
X = sm.add_constant(X)
beta = [5, 2, 3, 6, 9]
e = np.random.normal(size=nsample)
y = np.dot(X, beta) + e
result = sm.OLS(y,X).fit()
result.summary()

fig, ax = plt.subplots(figsize=(8,6))
ax.plot(x, y, 'o', label="data")
ax.plot(x, result.fittedvalues, 'r--.', label="OLS")
ax.legend(loc='best')
plt.show()

  

 ModuleNotFoundError: No module named 'plotly'

 在Anaconda Prompt(Anaconda)中安装plotly

 

pip install plotly

 输入conda list查看是否安装成功。

conda list

身高体重案例:

所需数据:

 链接:https://pan.baidu.com/s/1yqgu5mNxDz188z8hN6uuGA 
提取码:8888

 需要安装plotly

import pandas as pd
import statsmodels.api as sm
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go

init_notebook_mode(connected=True)
#Read CSV data
data = pd.read_csv('weight.csv')
data = data[data.height > 120]

画出散点图: 

trace = go.Scatter(x=data.height, y=data.weight, mode='markers')

layout = go.Layout(
    width=900,
    height=500,
    xaxis=dict(title='Height',titlefont=dict(family='Consolas, monospace',size=15)),
    yaxis=dict(title='Weight',titlefont=dict(family='Consolas, monospace',size=15))
)

data2 = [trace]
fig = go.Figure(data=data2, layout=layout)

iplot(fig)

#Run linear regression
X = data.height
y = data.weight
X = sm.add_constant(X)

model = sm.OLS(y, X).fit()

model.summary()

 R2的值只有0.594,拟合效果相对较差

import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(8,6))
ax.plot(data.height, y, 'o', label="data")
ax.plot(data.height, model.fittedvalues, 'r--.', label="OLS")
ax.legend(loc='best')
plt.show()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

长沙有肥鱼

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值