# scikit-learn : 线性回归

# 线性回归背景 从线性回归(Linear regression)开始学习回归分析，线性回归是最早的也是最基本的模型——把数据拟合成一条直线。 — # 数据集 使用scikit-learn里的数据集boston,boston数据集很适合用来演示线性回归。boston数据集包含了波士顿地区的房屋价格中位数。还有一些可能会影响房价的因素，比如犯罪率（crime rate）。 ## 加载数据
# Load the Boston housing dataset (506 samples, 13 features; the target is
# the median house price). NOTE(review): `load_boston` was deprecated and
# later removed from scikit-learn (1.2) — confirm the installed version.
from sklearn import datasets
boston = datasets.load_boston()
## 数据可视化
import pandas as pd
import warnings # 用来忽略seaborn绘图库产生的warnings
warnings.filterwarnings("ignore")
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="white", color_codes=True)
%matplotlib inline
## scikit-learn 数据转换成pandas DataFrame
def skdata2df(skdata):
    """Convert a scikit-learn dataset Bunch into a pandas DataFrame.

    Parameters
    ----------
    skdata : object with ``.data`` (2-D array), ``.feature_names``
        (column labels) and ``.target`` (1-D array), e.g. a sklearn Bunch.

    Returns
    -------
    pandas.DataFrame
        One column per feature plus an extra ``"target"`` column.
    """
    dfdata = pd.DataFrame(skdata.data, columns=skdata.feature_names)
    dfdata["target"] = skdata.target
    return dfdata
# Build the DataFrame view of the Boston dataset and preview the first rows.
bs = skdata2df(boston)
bs.head()
00.0063218.02.310.00.5386.57565.24.09001.0296.015.3396.904.9824.0
10.027310.07.070.00.4696.42178.94.96712.0242.017.8396.909.1421.6
20.027290.07.070.00.4697.18561.14.96712.0242.017.8392.834.0334.7
30.032370.02.180.00.4586.99845.86.06223.0222.018.7394.632.9433.4
40.069050.02.180.00.4587.14754.26.06223.0222.018.7396.905.3336.2
bs.describe()
count506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000
mean3.59376111.36363611.1367790.0691700.5546956.28463468.5749013.7950439.549407408.23715418.455534356.67403212.65306322.532806
std8.59678323.3224536.8603530.2539940.1158780.70261728.1488612.1057108.707259168.5371162.16494691.2948647.1410629.197104
min0.0063200.0000000.4600000.0000000.3850003.5610002.9000001.1296001.000000187.00000012.6000000.3200001.7300005.000000
25%0.0820450.0000005.1900000.0000000.4490005.88550045.0250002.1001754.000000279.00000017.400000375.3775006.95000017.025000
50%0.2565100.0000009.6900000.0000000.5380006.20850077.5000003.2074505.000000330.00000019.050000391.44000011.36000021.200000
75%3.64742312.50000018.1000000.0000000.6240006.62350094.0750005.18842524.000000666.00000020.200000396.22500016.95500025.000000
max88.976200100.00000027.7400001.0000000.8710008.780000100.00000012.12650024.000000711.00000022.000000396.90000037.97000050.000000
# One regression jointplot (scatter + fitted line + marginal histograms) per
# feature against the target, to eyeball which features look linearly related
# to the house price. The scrape had lost the loop indentation; restored here.
fig = plt.figure()
for feature in boston.feature_names:
    # NOTE(review): `size` was renamed to `height` in seaborn >= 0.9 —
    # confirm the installed seaborn version before running.
    sns.jointplot(x=feature, y="target", data=bs, kind='reg', size=6)

# 线性回归模型

# Fit an ordinary least-squares model on the full Boston design matrix
# (no train/test split — this is only illustrating the fit itself).
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

lr.fit(boston.data, boston.target)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)


# In-sample predictions on the training data itself.
predictions = lr.predict(boston.data)

%matplotlib inline
f, ax = plt.subplots(figsize=(7, 5))
f.tight_layout()
# Residuals (actual - predicted) should cluster roughly symmetrically
# around 0 if the linear model fits well.
ax.hist(boston.target-predictions,bins=40, label='Residuals Linear', color='b', alpha=.5);
ax.set_title("Histogram of Residuals")
ax.legend(loc='best');

lr.coef_
array([ -1.07170557e-01,   4.63952195e-02,   2.08602395e-02,
2.68856140e+00,  -1.77957587e+01,   3.80475246e+00,
7.51061703e-04,  -1.47575880e+00,   3.05655038e-01,
-1.23293463e-02,  -9.53463555e-01,   9.39251272e-03,
-5.25466633e-01])

list(zip(boston.feature_names, lr.coef_))
[('CRIM', -0.1071705565603549),
('ZN', 0.046395219529801912),
('INDUS', 0.020860239532175279),
('CHAS', 2.6885613993180009),
('NOX', -17.79575866030935),
('RM', 3.8047524602580065),
('AGE', 0.00075106170332261968),
('DIS', -1.4757587965198196),
('TAX', -0.012329346305275379),
('PTRATIO', -0.95346355469056254),
('B', 0.0093925127221887728),
('LSTAT', -0.52546663290078754)]


## 用条形图直观查看相关系数

def plotCofBar(x_feature, y_cof):
    """Bar chart of regression coefficients, one bar per feature.

    Parameters
    ----------
    x_feature : sequence of str
        Feature names used as x-axis tick labels.
    y_cof : sequence of float
        Coefficients, same length as ``x_feature``.
    """
    x_value = range(len(x_feature))
    plt.bar(x_value, y_cof, alpha=1, color='r', align="center")
    plt.autoscale(tight=True)
    # Rotate tick labels 90 degrees so the 13 feature names do not overlap.
    plt.xticks(list(x_value), x_feature, rotation="90")
    plt.xlabel("feature names")
    plt.ylabel("cof")
    plt.title("The cof of Linear regression")
    plt.show()

plotCofBar(boston.feature_names, lr.coef_)

# 线性回归原理

$\hat{\beta} = (X^{T} X)^{-1} X^{T} y$

## 线性回归可以自动标准正态化(normalize或scale)输入数据

# Same model, but asking LinearRegression to standardize the design matrix
# internally before fitting. NOTE(review): the `normalize` argument was
# deprecated and removed in newer scikit-learn versions — confirm the
# installed version.
lr2 = LinearRegression(normalize=True)
lr2.fit(boston.data, boston.target)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

# Residual histogram for the normalized model, for comparison with the
# earlier un-normalized fit.
predictions2 = lr2.predict(boston.data)
%matplotlib inline
from matplotlib import pyplot as plt
f, ax = plt.subplots(figsize=(7, 5))
f.tight_layout()
ax.hist(boston.target-predictions2,bins=40, label='Residuals Linear', color='b', alpha=.5);
ax.set_title("Histogram of Residuals")
ax.legend(loc='best');

import numpy as np

# Compare the 75th percentile of the residuals with and without input
# normalization. The printed values below are identical: for plain OLS,
# normalizing the inputs rescales the coefficients but leaves the fitted
# values — and hence the residuals — unchanged.
# (Converted from Python 2 print statements to Python 3 print() calls.)
print("after normalize:", np.percentile(boston.target - predictions2, 75))
print("before normalize:", np.percentile(boston.target - predictions, 75))
after normalize: 1.78311579433
before normalize: 1.78311579433


• 点赞 1
• 评论 1
• 分享
x

海报分享

扫一扫，分享海报

• 收藏 1
• 手机看

分享到微信朋友圈

x

扫一扫，手机阅读

• 打赏

打赏

搬砖小工053

你的鼓励将是我创作的最大动力

C币 余额
2C币 4C币 6C币 10C币 20C币 50C币
• 一键三连

点赞Mark关注该博主, 随时了解TA的最新博文

10-23
05-02 5万+
02-13 290
10-12 2315
06-17 4万+
06-18 8812