Linear Regression
# Import modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.datasets import make_regression # generate a regression dataset
from sklearn.model_selection import cross_val_score # cross-validation
from sklearn.model_selection import train_test_split # split data into train and test sets
from sklearn.linear_model import LinearRegression as LR # regression model
from sklearn.metrics import r2_score, explained_variance_score as EVS, mean_squared_error as MSE # evaluation metrics
Simple Linear Regression
# create dataset by make_regression, 100 samples, 1 feature
X, y = make_regression(n_samples=100, n_features=1, noise=10)
# manually hold out the last 5 samples as a test set
X_train = X[:95]
X_test = X[95:]
y_train = y[:95]
y_test = y[95:]
# instantiate and fit the model
linreg = LR().fit(X_train, y_train)
The LinearRegression class exposes two fitted attributes:
coef_ : the regression coefficients, one per feature -> array
intercept_ : the intercept
print("LinearRegression's regression coefficient is {}".format(linreg.coef_))
print("LinearRegression's intercept is {}".format(linreg.intercept_))
LinearRegression's regression coefficient is [77.2557384]
LinearRegression's intercept is 0.775112972088503
y_pred = linreg.predict(X_test)
r2 = linreg.score(X_test, y_test) # R2
r2
0.9894870982059684
# Visualization
plt.scatter(X_train, y_train, color='black', label='train dataset')
plt.scatter(X_test, y_test, color='red', label='test dataset')
plt.plot(X_test, linreg.predict(X_test), color='blue', lw=3, label='fitted line')
plt.legend(loc='best')
# plt.xticks(())
# plt.yticks(())
plt.show()
Multiple Linear Regression
data = pd.read_csv('8.Advertising.csv')
x = data[['TV', 'Radio', 'Newspaper']]
y = data['Sales']
x.shape
(200, 3)
y.head()
0 22.1
1 10.4
2 9.3
3 18.5
4 12.9
Name: Sales, dtype: float64
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
x_train.shape
(160, 3)
linreg = LR().fit(x_train, y_train)
yhat = linreg.predict(x_test)
yhat
array([21.73577184, 16.45693776, 7.65993185, 17.89202679, 18.67730671,
23.86271904, 16.33623628, 13.45649226, 9.177296 , 17.36056228,
14.4677995 , 9.85697601, 17.26057027, 16.71866935, 15.09530285,
15.58923732, 12.45188167, 17.27925151, 11.0944114 , 18.06889853,
9.33433055, 12.91345761, 8.7842804 , 10.46670654, 11.40303174,
15.03104665, 9.78479388, 19.46028647, 18.22954934, 17.1958903 ,
21.60304218, 14.71901407, 16.29205532, 12.36432281, 19.98831261,
15.37556411, 13.96678297, 10.06809496, 20.97197274, 7.45877832])
linreg.coef_
array([0.0468431 , 0.17854434, 0.00258619])
linreg.intercept_
2.907947020816433
The fitted model is therefore: Sales = 2.908 + 0.047*TV + 0.179*Radio + 0.003*Newspaper
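As a quick sanity check, the same prediction can be rebuilt by hand from coef_ and intercept_ and compared with linreg.predict() (an illustrative snippet, reusing x_test from above):
# illustrative sanity check: rebuild one prediction from the fitted parameters
first_row = x_test.iloc[0]
manual_pred = linreg.intercept_ + np.dot(linreg.coef_, first_row.values)
print(manual_pred, linreg.predict(x_test.iloc[[0]])[0]) # the two numbers should match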
Evaluation Metrics
- MSE (mean squared error): the average of the squared differences between the fitted values and the true values; RMSE is its square root.
  $MSE = \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2$
- sklearn implementation:
MSE(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average', squared=True)
- R-squared (R², the coefficient of determination): 1 means a perfect fit, while 0 means the model predicts no better than the mean of y (the score can even go negative for very poor models).
  $R^2 = 1 - \frac{\sum_{i}(\hat{y}_i - y_i)^2}{\sum_{i}(\bar{y} - y_i)^2}$
- sklearn implementation:
r2_score(y_true, y_pred, *, sample_weight=None, multioutput='uniform_average')
LinearRegression.score(X, y, sample_weight=None)
- explained_variance_score: measures how much of the variance in the target the model explains; the best possible score is 1.0, and lower (possibly negative) values indicate a worse fit. A hand-rolled check of all three metrics follows below.
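All three metrics are straightforward to verify by hand. A minimal NumPy sketch, reusing y_test and yhat from above; each printed pair should agree with sklearn's implementation:
# hand-rolled versions of the three metrics, compared with sklearn
err = y_test - yhat
mse_manual = np.mean(err ** 2)
r2_manual = 1 - np.sum(err ** 2) / np.sum((y_test - y_test.mean()) ** 2)
evs_manual = 1 - np.var(err) / np.var(y_test) # explained variance uses the variance of the residuals
print(mse_manual, MSE(y_test, yhat))
print(r2_manual, r2_score(y_test, yhat))
print(evs_manual, EVS(y_test, yhat))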
MSE(y_test, yhat)
1.9918855518287906
np.sqrt(MSE(y_test, yhat)) / y_test.mean() # RMSE as a fraction of the mean of y (relative error, ~9.5% here)
0.09492798088839136
Cross-Validation
cross_val_score(
estimator,
X,
y=None,
*,
groups=None,
scoring=None,
cv=None,
n_jobs=None,
verbose=0,
fit_params=None,
pre_dispatch='2*n_jobs',
error_score=nan,
)
With cv=5, the data set is split into 5 folds, call them a, b, c, d, e. cross_val_score() then:
- holds out one fold (say a) as the test set, trains on the other four, and records the score,
- then holds out b as the test set, trains on the remaining four, and records the score,
- repeats until every fold has served as the test set once.
It finally returns 5 scores, i.e. cv scores in total (with scoring='neg_mean_squared_error', each score is a negated MSE). The loop below sketches the same procedure by hand.
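A manual sketch of cross-validation with KFold, assuming the same x and y as above; since cross_val_score uses an unshuffled KFold for regressors, the mean here should match the cv=5 R² result further down:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5)
fold_scores = []
for train_idx, test_idx in kf.split(x):
    fold_model = LR().fit(x.iloc[train_idx], y.iloc[train_idx])
    fold_scores.append(fold_model.score(x.iloc[test_idx], y.iloc[test_idx])) # R^2 on the held-out fold
print(np.mean(fold_scores))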
-cross_val_score(linreg, x, y, cv=10, scoring='neg_mean_squared_error') # negate the negated-MSE scores to get positive MSEs
array([3.56038438, 3.29767522, 2.08943356, 2.82474283, 1.3027754 ,
1.74163618, 8.17338214, 2.11409746, 3.04273109, 2.45281793])
cross_val_score(linreg, x, y, cv=5, scoring='r2').mean()
0.8871063495438436
r2_score(y_test, yhat)
0.8927605914615384
r2 = linreg.score(x_test, y_test)
r2
0.8927605914615384
cross_val_score(linreg, x, y, cv=5, scoring='explained_variance').mean()
0.8924114066793429
Multivariate Nonlinear Regression
PolynomialFeatures(
degree=2,
*,
interaction_only=False,
include_bias=True,
order='C',
)
- degree: the highest polynomial degree allowed in the expanded features
- interaction_only: if True, only interaction terms (e.g. a*b) are produced, without pure powers; usually left as False
- include_bias: whether to add a bias column of all ones
Example: PolynomialFeatures(degree=2, interaction_only=False, include_bias=False) applies a polynomial transform to the features; with two features a and b, it adds the extra columns a^2, a*b, and b^2 (see the small demo below).
Multivariate nonlinear regression still relies on the LinearRegression class in the end: the nonlinearity lives in the transformed features, while the model stays linear in its coefficients.
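A tiny demo of that expansion on a single sample (get_feature_names_out exists in scikit-learn >= 1.0; older releases expose get_feature_names instead):
from sklearn.preprocessing import PolynomialFeatures
demo = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
print(demo.fit_transform([[2, 3]])) # -> [[2. 3. 4. 6. 9.]], i.e. a, b, a^2, a*b, b^2
print(demo.get_feature_names_out(['a', 'b'])) # ['a' 'b' 'a^2' 'a b' 'b^2']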
from sklearn.preprocessing import PolynomialFeatures
po = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
x_poly = po.fit_transform(x)
pd.DataFrame(x_poly).head()
|   | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 230.1 | 37.8 | 69.2 | 52946.01 | 8697.78 | 15922.92 | 1428.84 | 2615.76 | 4788.64 |
| 1 | 44.5 | 39.3 | 45.1 | 1980.25 | 1748.85 | 2006.95 | 1544.49 | 1772.43 | 2034.01 |
| 2 | 17.2 | 45.9 | 69.3 | 295.84 | 789.48 | 1191.96 | 2106.81 | 3180.87 | 4802.49 |
| 3 | 151.5 | 41.3 | 58.5 | 22952.25 | 6256.95 | 8862.75 | 1705.69 | 2416.05 | 3422.25 |
| 4 | 180.8 | 10.8 | 58.4 | 32688.64 | 1952.64 | 10558.72 | 116.64 | 630.72 | 3410.56 |
x_poly = pd.DataFrame(x_poly, columns=['TV', 'Radio', 'Newspaper', 'TV^2', 'TV_Radio', 'TV_Newspaper',
'Radio^2', 'Radio_Newspaper', 'Newspaper^2'])
x_poly.head()
|   | TV | Radio | Newspaper | TV^2 | TV_Radio | TV_Newspaper | Radio^2 | Radio_Newspaper | Newspaper^2 |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 230.1 | 37.8 | 69.2 | 52946.01 | 8697.78 | 15922.92 | 1428.84 | 2615.76 | 4788.64 |
| 1 | 44.5 | 39.3 | 45.1 | 1980.25 | 1748.85 | 2006.95 | 1544.49 | 1772.43 | 2034.01 |
| 2 | 17.2 | 45.9 | 69.3 | 295.84 | 789.48 | 1191.96 | 2106.81 | 3180.87 | 4802.49 |
| 3 | 151.5 | 41.3 | 58.5 | 22952.25 | 6256.95 | 8862.75 | 1705.69 | 2416.05 | 3422.25 |
| 4 | 180.8 | 10.8 | 58.4 | 32688.64 | 1952.64 | 10558.72 | 116.64 | 630.72 | 3410.56 |
x_train2, x_test2, y_train2, y_test2 = train_test_split(x_poly, y, test_size=0.2, random_state=1)
linreg2 = LR().fit(x_train2, y_train2)
y_pred = linreg2.predict(x_test2)
y_pred
array([24.0997499 , 16.57292734, 9.21526371, 14.0757725 , 17.47080739,
24.9872432 , 17.29597402, 13.41135524, 10.05247531, 17.3492023 ,
15.06303554, 10.22243833, 17.40044869, 16.87140416, 13.36207704,
16.5053683 , 13.23115591, 12.04055116, 8.34500145, 18.71451918,
10.43160256, 13.16714134, 7.9252826 , 11.36865367, 12.28393907,
15.12992011, 9.20847543, 19.1699663 , 19.5813793 , 15.883991 ,
23.25985695, 12.21470254, 17.21216436, 12.41496974, 20.11730255,
15.6946992 , 12.00058669, 11.00100624, 22.85693936, 6.92267035])
linreg2.coef_
array([ 5.44789527e-02, 1.70437621e-02, 9.68390154e-03, -1.18664857e-04,
1.12349240e-03, -5.99683917e-05, 1.87915100e-04, -1.87139022e-05,
7.01990051e-05])
linreg2.intercept_
4.880821075940215
np.sqrt(MSE(y_test2, y_pred)) / y_test2.mean() # relative RMSE drops from ~9.5% to ~3.2% with polynomial features
0.03238013898269037
cross_val_score(linreg2, x_poly, y, cv=5, scoring='r2').mean()
0.98425409815801
cross_val_score(linreg2, x_poly, y, cv=5, scoring='explained_variance').mean()
0.98440792918983