回归分析python

长沙有肥鱼

已于 2022-02-12 16:00:35 修改

阅读量1.1k

点赞数 1

分类专栏： AI数学文章标签：回归 python 机器学习

于 2022-02-12 15:58:56 首次发布

本文链接：https://blog.csdn.net/weixin_53660567/article/details/122896584

版权

AI数学专栏收录该内容

20 篇文章 17 订阅

订阅专栏

构造随机数组：

import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

nsample = 20
x = np.linspace(0, 10, nsample)
x

array([ 0.        ,  0.52631579,  1.05263158,  1.57894737,  2.10526316,
        2.63157895,  3.15789474,  3.68421053,  4.21052632,  4.73684211,
        5.26315789,  5.78947368,  6.31578947,  6.84210526,  7.36842105,
        7.89473684,  8.42105263,  8.94736842,  9.47368421, 10.        ])

一元线性回归：

X = sm.add_constant(x)
X

array([[ 1.        ,  0.        ],
       [ 1.        ,  0.52631579],
       [ 1.        ,  1.05263158],
       [ 1.        ,  1.57894737],
       [ 1.        ,  2.10526316],
       [ 1.        ,  2.63157895],
       [ 1.        ,  3.15789474],
       [ 1.        ,  3.68421053],
       [ 1.        ,  4.21052632],
       [ 1.        ,  4.73684211],
       [ 1.        ,  5.26315789],
       [ 1.        ,  5.78947368],
       [ 1.        ,  6.31578947],
       [ 1.        ,  6.84210526],
       [ 1.        ,  7.36842105],
       [ 1.        ,  7.89473684],
       [ 1.        ,  8.42105263],
       [ 1.        ,  8.94736842],
       [ 1.        ,  9.47368421],
       [ 1.        , 10.        ]])

#β0,β1分别设置成2,5
beta = np.array([2, 5])
beta

array([2, 5])

#误差项
e = np.random.normal(size=nsample)
e

array([-1.87955777, -0.49196542, -0.53259826, -0.10675137,  1.17508727,
       -1.02293335,  1.75717793, -1.12456907, -0.64294355,  0.44920794,
       -1.20471093,  0.02588779, -0.69216987,  1.23018605,  0.59180637,
        1.03269883,  0.16712902,  2.06599762, -2.06133608, -0.12899139])

#实际值y
y = np.dot(X, beta) + e
y

array([  0.12044223,   4.13961353,   6.73055964,   9.78798547,
        13.70140306,  14.13496139,  19.54665161,  19.29648356,
        22.40968802,  26.13341847,  27.11107855,  30.97325622,
        32.8867775 ,  37.44071236,  39.43391164,  42.50638304,
        44.27239218,  48.80283972,  47.30708497,  51.87100861])

#最小二乘法
model = sm.OLS(y,X)

#拟合数据
res = model.fit()

#回归系数
res.params

#全部结果
res.summary()

#拟合的估计值
y_ = res.fittedvalues
y_

array([  1.49524076,   4.17261885,   6.84999693,   9.52737502,
        12.20475311,  14.8821312 ,  17.55950928,  20.23688737,
        22.91426546,  25.59164354,  28.26902163,  30.94639972,
        33.62377781,  36.30115589,  38.97853398,  41.65591207,
        44.33329015,  47.01066824,  49.68804633,  52.36542442])

fig, ax = plt.subplots(figsize=(8,6))
ax.plot(x, y, 'o', label='data')#原始数据
ax.plot(x, y_, 'r--.',label='test')#拟合数据
ax.legend(loc='best')
plt.show()

高阶回归

#Y=5+2⋅X+3⋅X^2
 
nsample = 50
x = np.linspace(0, 10, nsample)
X = np.column_stack((x, x**2))
X = sm.add_constant(X)

beta = np.array([5, 2, 3])
e = np.random.normal(size=nsample)
y = np.dot(X, beta) + e
model = sm.OLS(y,X)
results = model.fit()
results.params

算出的β0,β1,β2为：

array([5.6633623 , 1.56636248, 3.04483462])

results.summary()

y_fitted = results.fittedvalues
fig, ax = plt.subplots(figsize=(8,6))
ax.plot(x, y, 'o', label='data')
ax.plot(x, y_fitted, 'r--.',label='OLS')
ax.legend(loc='best')
plt.show()

分类变量

假设分类变量有4个取值（a,b,c）,比如考试成绩有3个等级。那么a就是（1,0,0），b（0,1,0），c(0,0,1),这个时候就需要3个系数β0,β1,β2，也就是β0x0+β1x1+β2x2

nsample = 50
groups = np.zeros(nsample,int)
groups

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])

groups[20:40] = 1
groups[40:] = 2
dummy = sm.categorical(groups, drop=True)
dummy

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

#Y=5+2X+3Z1+6⋅Z2+9⋅Z3.
 
x = np.linspace(0, 20, nsample)
X = np.column_stack((x, dummy))
X = sm.add_constant(X)
beta = [5, 2, 3, 6, 9]
e = np.random.normal(size=nsample)
y = np.dot(X, beta) + e
result = sm.OLS(y,X).fit()
result.summary()

fig, ax = plt.subplots(figsize=(8,6))
ax.plot(x, y, 'o', label="data")
ax.plot(x, result.fittedvalues, 'r--.', label="OLS")
ax.legend(loc='best')
plt.show()

ModuleNotFoundError: No module named 'plotly'

在Anaconda Prompt(Anaconda)中安装plotly

pip install plotly

输入conda list查看是否安装成功。

conda list

身高体重案例：

所需数据：

链接：https://pan.baidu.com/s/1yqgu5mNxDz188z8hN6uuGA
提取码：8888

需要安装plotly

import pandas as pd
import statsmodels.api as sm
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go

init_notebook_mode(connected=True)

#Read CSV data
data = pd.read_csv('weight.csv')
data = data[data.height > 120]

画出散点图：

trace = go.Scatter(x=data.height, y=data.weight, mode='markers')

layout = go.Layout(
    width=900,
    height=500,
    xaxis=dict(title='Height',titlefont=dict(family='Consolas, monospace',size=15)),
    yaxis=dict(title='Weight',titlefont=dict(family='Consolas, monospace',size=15))
)

data2 = [trace]
fig = go.Figure(data=data2, layout=layout)

iplot(fig)

#Run linear regression
X = data.height
y = data.weight
X = sm.add_constant(X)

model = sm.OLS(y, X).fit()

model.summary()

R2的值只有0.594，拟合效果相对较差

import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(8,6))
ax.plot(data.height, y, 'o', label="data")
ax.plot(data.height, model.fittedvalues, 'r--.', label="OLS")
ax.legend(loc='best')
plt.show()