代码实现
import numpy as np
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
# 处理warning
from sklearn.exceptions import ConvergenceWarning
import matplotlib as mpl
import warnings
# Accumulators for the sum-of-squares decomposition, read by the plotting code
# in __main__. Defined here so xss() is usable on its own; __main__ may rebind
# them to fresh lists before the experiment loop.
tss_list = []
rss_list = []
ess_list = []
ess_rss_list = []


def xss(y, y_hat):
    """Compute R^2 and the Pearson correlation between targets and predictions.

    Side effect: appends TSS, RSS, ESS and RSS+ESS to the module-level
    accumulator lists so the caller can plot the decomposition afterwards.

    Parameters
    ----------
    y : array-like
        True target values (any shape; flattened internally).
    y_hat : array-like
        Predicted values, same number of elements as ``y``.

    Returns
    -------
    tuple of float
        ``(r2, corr_coef)``.
    """
    y = y.ravel()
    y_hat = y_hat.ravel()
    tss = ((y - np.average(y)) ** 2).sum()      # total sum of squares
    rss = ((y_hat - y) ** 2).sum()              # residual sum of squares
    ess = ((y_hat - np.average(y)) ** 2).sum()  # explained sum of squares
    r2 = 1 - rss / tss
    # Record for the TSS vs. (RSS + ESS) comparison plot. NOTE: the identity
    # TSS = RSS + ESS holds for ordinary least squares with an intercept; the
    # regularized models are expected to violate it, which is the demo's point.
    tss_list.append(tss)
    rss_list.append(rss)
    ess_list.append(ess)
    ess_rss_list.append(rss + ess)
    corr_coef = np.corrcoef(y, y_hat)[0, 1]
    return r2, corr_coef
if __name__ == "__main__":
# 设置将某类别的警告忽视
warnings.filterwarnings(action='ignore', category=ConvergenceWarning)
# 设置随机数种子,在试验模型中使用,但在工作中很少使用
np.random.seed(0)
# 设置输出样式-参数linewidth设置显示宽度
np.set_printoptions(linewidth=1000)
# 设置9个点
N = 9
# 设置x,0-6等间隔的数加上高斯噪声
x = np.linspace(0, 6, N) + np.random.randn(N)
x = np.sort(x)
# 设置y
y = x**2 - 4*x - 3 + np.random.randn(N)
# 设置成列向量,将行变成列
x.shape = -1, 1
y.shape = -1, 1
models = [Pipeline([
# PolynomialFeatures用来生成关于x的矩阵,其中degree表示多项式的次数,include_bias默认为True表示会包含1
('poly', PolynomialFeatures()),
# fit_intercept = False表示不去计算截距项,避免与上面的include_bias=True重复
('linear', LinearRegression(fit_intercept=False))]),
Pipeline([
('poly', PolynomialFeatures()),
# RidgeCV封装了网格搜索调优,相当于RidgeCV + GridsearchCV,alphas表示正则项
('linear', RidgeCV(alphas=np.logspace(-3, 2, 50), fit_intercept=False))]),
Pipeline([
('poly', PolynomialFeatures()),
('linear', LassoCV(alphas=np.logspace(-3, 2, 50), fit_intercept=False))]),
Pipeline([
('poly', PolynomialFeatures()),
# l1_ratio表示L1与L2的比例
('linear', ElasticNetCV(alphas=np.logspace(-3, 2, 50), l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
fit_intercept=False))])
]
mpl.rcParams['font.sans-serif'] = [u'simHei']
mpl.rcParams['axes.unicode_minus'] = False
# 不用科学计数法,用小数点来显示
np.set_printoptions(suppress=True)
plt.figure(figsize=(18, 12), facecolor='w')
# 阶数的数组
d_pool = np.arange(1, N, 1) # 阶
m = d_pool.size
# 设置渐变色
clrs = []
for c in np.linspace(16711680,255,m):
c = int(c)
clrs.append('#%06x' % c)
line_width = np.linspace(5, 2, m)
titles = u'线性回归', u'Ridge回归', u'LASSO', u'ElasticNet'
tss_list = []
rss_list = []
ess_list = []
ess_rss_list = []
# 迭代画4个图
for t in range(4):
model = models[t]
plt.subplot(2, 2, t+1)
# zorder=N表示放置到最上层
plt.plot(x, y, 'ro', ms=10, zorder=N)
for i, d in enumerate(d_pool):
# poly__degree=d表示设置PolynomialFeatures()中degree参数,矩阵的阶数或多项式的次数
model.set_params(poly__degree=d)
# numpy.ravel表示将多维数组降为一维
model.fit(x, y.ravel())
# get_params表示获得线性数据本身、值等
lin = model.get_params('linear')['linear']
output = u'%s:%d阶,系数为:' % (titles[t], d)
if hasattr(lin, 'alpha_'):
idx = output.find(u'系数')
output = output[:idx] + (u'alpha=%.6f,' % lin.alpha_) + output[idx:]
if hasattr(lin, 'l1_ratio_'): # 根据交叉验证结果,从输入l1_ratio(list)中选择的最优l1_ratio_(float)
idx = output.find(u'系数')
output = output[:idx] + (u'l1_ratio=%.6f,' % lin.l1_ratio_) + output[idx:]
print(output, lin.coef_.ravel())
x_hat = np.linspace(x.min(), x.max(), num=100)
x_hat.shape = -1, 1
y_hat = model.predict(x_hat)
s = model.score(x, y)
r2, corr_coef = xss(y, model.predict(x))
# print 'R2和相关系数:', r2, corr_coef
# print 'R2:', s, '\n'
z = N - 1 if (d == 2) else 0
label = u'%d阶,$R^2$=%.3f' % (d, s)
if hasattr(lin, 'l1_ratio_'):
label += u',L1 ratio=%.2f' % lin.l1_ratio_
# lc与lw
plt.plot(x_hat, y_hat, color=clrs[i], lw=line_width[i],alpha=0.75, label=label, zorder=z)
plt.legend(loc='upper left')
plt.grid(True)
plt.title(titles[t], fontsize=18)
plt.xlabel('X', fontsize=16)
plt.ylabel('Y', fontsize=16)
plt.tight_layout(1, rect=(0, 0, 1, 0.95))
plt.suptitle(u'多项式曲线拟合比较', fontsize=22)
plt.show()
y_max = max(max(tss_list), max(ess_rss_list)) * 1.05
plt.figure(figsize=(9, 7), facecolor='w')
t = np.arange(len(tss_list))
plt.plot(t, tss_list, 'ro-', lw=2, label=u'TSS(Total Sum of Squares)')
plt.plot(t, ess_list, 'mo-', lw=1, label=u'ESS(Explained Sum of Squares)')
plt.plot(t, rss_list, 'bo-', lw=1, label=u'RSS(Residual Sum of Squares)')
plt.plot(t, ess_rss_list, 'go-', lw=2, label=u'ESS+RSS')
plt.ylim((0, y_max))
plt.legend(loc='center right')
plt.xlabel(u'实验:线性回归/Ridge/LASSO/Elastic Net', fontsize=15)
plt.ylabel(u'XSS值', fontsize=15)
plt.title(u'总平方和TSS=?', fontsize=18)
plt.grid(True)
plt.show()
运行结果
线性回归:1阶,系数为: [-12.12113792 3.05477422]
线性回归:2阶,系数为: [-3.23812184 -3.36390661 0.90493645]
线性回归:3阶,系数为: [-3.90207326 -2.61163034 0.66422328 0.02290431]
线性回归:4阶,系数为: [-8.20599769 4.20778207 -2.85304163 0.73902338 -0.05008557]
线性回归:5阶,系数为: [ 21.59733285 -54.12232017 38.43116219 -12.68651476 1.98134176 -0.11572371]
线性回归:6阶,系数为: [ 14.73304785 -37.87317494 23.67462342 -6.07037979 0.42536833 0.06803132 -0.00859246]
线性回归:7阶,系数为: [ 314.30344622 -827.89446924 857.33293186 -465.46543638 144.21883851 -25.67294678 2.44658612 -0.09675941]
线性回归:8阶,系数为: [-1189.50149198 3643.69109456 -4647.92941149 3217.22814712 -1325.87384337 334.32869072 -50.57119119 4.21251817 -0.148521 ]
Ridge回归:1阶,alpha=0.109854,系数为: [-11.21592213 2.85121516]
Ridge回归:2阶,alpha=0.138950,系数为: [-2.90423989 -3.49931368 0.91803171]
Ridge回归:3阶,alpha=0.068665,系数为: [-3.47165245 -2.85078293 0.69245987 0.02314415]
Ridge回归:4阶,alpha=0.222300,系数为: [-2.84560266 -1.99887417 -0.40628792 0.33863868 -0.02674442]
Ridge回归:5阶,alpha=1.151395,系数为: [-1.68160373 -1.52726943 -0.8382036 0.2329258 0.03934251 -0.00663323]
Ridge回归:6阶,alpha=0.001000,系数为: [ 0.53724068 -6.00552086 -3.75961826 5.64559118 -2.21569695 0.36872911 -0.0222134 ]
Ridge回归:7阶,alpha=0.033932,系数为: [-2.38021238 -2.26383055 -1.47715232 0.00763115 1.12242917 -0.52769633 0.09199202 -0.00560197]
Ridge回归:8阶,alpha=0.138950,系数为: [-2.19287493 -1.9189901 -1.21620537 -0.19324078 0.49303633 0.05458084 -0.09693178 0.02114898 -0.00140213]
LASSO:1阶,alpha=0.175751,系数为: [-10.77156998 2.7439868 ]
LASSO:2阶,alpha=0.001000,系数为: [-3.29932625 -3.31989869 0.89878903]
LASSO:3阶,alpha=0.021210,系数为: [-4.7961774 -1.4641442 0.27751645 0.06064991]
LASSO:4阶,alpha=0.026827,系数为: [-4.80161479 -1.48746734 0.29643034 0.05838007 -0.0000783 ]
LASSO:5阶,alpha=0.449843,系数为: [-0. -3.48456685 -0. 0.18939592 0.00573927 -0.00234922]
LASSO:6阶,alpha=0.001000,系数为: [-4.53546398 -1.70335188 0.29896515 0.05237738 0.00489432 0.00007551 -0.00010944]
LASSO:7阶,alpha=0.004095,系数为: [-4.45986956 -1.6062364 0.22968675 0.05092706 0.00593169 0.00043962 -0.00002739 -0.00002142]
LASSO:8阶,alpha=0.008286,系数为: [-4.50222791 -1.42298846 0.15849996 0.0476696 0.00628362 0.00067038 0.00003278 -0.00000896 -0.00000386]
ElasticNet:1阶,alpha=0.021210,l1_ratio=0.100000,系数为: [-10.74762959 2.74580662]
ElasticNet:2阶,alpha=0.042919,l1_ratio=0.100000,系数为: [-2.6843532 -3.51577212 0.91244926]
ElasticNet:3阶,alpha=0.005179,l1_ratio=0.100000,系数为: [-4.30026078 -1.98209666 0.42999316 0.04710447]
ElasticNet:4阶,alpha=0.054287,l1_ratio=0.100000,系数为: [-2.63969126 -2.8400611 0.43377604 0.09099957 -0.00474193]
ElasticNet:5阶,alpha=0.568987,l1_ratio=0.100000,系数为: [-0.96437374 -1.29713401 -0.91814848 0.29096106 0.01038465 -0.00323145]
ElasticNet:6阶,alpha=0.001000,l1_ratio=1.000000,系数为: [-4.53546398 -1.70335188 0.29896515 0.05237738 0.00489432 0.00007551 -0.00010944]
ElasticNet:7阶,alpha=0.004095,l1_ratio=1.000000,系数为: [-4.45986956 -1.6062364 0.22968675 0.05092706 0.00593169 0.00043962 -0.00002739 -0.00002142]
ElasticNet:8阶,alpha=0.008286,l1_ratio=1.000000,系数为: [-4.50222791 -1.42298846 0.15849996 0.0476696 0.00628362 0.00067038 0.00003278 -0.00000896 -0.00000386]