机器学习回归算法拟合多项式

code:

import numpy as np
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.preprocessing import PolynomialFeatures
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
import matplotlib as mpl
import warnings
# 计算统计参数TSS RSS R
def xss(y, y_hat):
	"""Print and record sum-of-squares statistics, then return (R^2, correlation).

	Both inputs are flattened to 1-D before use. TSS, RSS, ESS and RSS + ESS are
	appended to the module-level lists ``tss_list``, ``rss_list``, ``ess_list``
	and ``ess_rss_list``, which must already exist when this function is called.

	:param y: array of observed target values.
	:param y_hat: array of predicted values with the same number of elements.
	:return: tuple ``(r2, corr_coef)`` where ``r2 = 1 - RSS / TSS`` and
		``corr_coef`` is the correlation coefficient between ``y`` and ``y_hat``.
	"""
	# Flatten both arrays so the element-wise arithmetic lines up.
	observed = y.ravel()
	predicted = y_hat.ravel()
	observed_mean = np.average(observed)

	# Each statistic is a plain sum of squared differences.
	tss = np.square(observed - observed_mean).sum()
	rss = np.square(predicted - observed).sum()
	ess = np.square(predicted - observed_mean).sum()
	rss_plus_ess = rss + ess

	# Coefficient of determination.
	r2 = 1 - rss/tss

	for caption, value in (('Rss:', rss), ('Ess:', ess), ('Rss + Ess:', rss_plus_ess)):
		print(caption, value)

	# Record this trial in the module-level accumulators.
	tss_list.append(tss)
	rss_list.append(rss)
	ess_list.append(ess)
	ess_rss_list.append(rss_plus_ess)

	# Off-diagonal entry of the 2x2 correlation matrix of (observed, predicted).
	correlation_matrix = np.corrcoef(observed, predicted)
	corr_coef = correlation_matrix[0, 1]
	return r2, corr_coef

if __name__ == '__main__':
	# Demo script: fit polynomials of degree 1..N-1 to noisy samples of a quadratic
	# with four linear models, draw every fitted curve (one subplot per model), then
	# plot the TSS/ESS/RSS values that xss() recorded for every (model, degree) trial.
	warnings.filterwarnings("ignore")
	np.random.seed(0)
	np.set_printoptions(linewidth=1000)
	N = 9
	# Jittered, sorted sample positions and noisy targets of y = x^2 - 4x - 3.
	x = np.linspace(0, 6, N) + np.random.randn(N)
	x = np.sort(x)
	y = x**2 - 4*x - 3 + np.random.randn(N)
	x.shape = -1, 1
	y.shape = -1, 1

	# One pipeline per model: polynomial feature expansion followed by a linear
	# estimator (plain least squares, RidgeCV, LassoCV, ElasticNetCV).
	alphas = np.logspace(-3, 2, 50)
	models = [
		Pipeline([('poly', PolynomialFeatures()), ('linear', LinearRegression(fit_intercept=False))]),
		Pipeline([('poly', PolynomialFeatures()), ('linear', RidgeCV(alphas=alphas, fit_intercept=False))]),
		Pipeline([('poly', PolynomialFeatures()), ('linear', LassoCV(alphas=alphas, fit_intercept=False))]),
		Pipeline([('poly', PolynomialFeatures()), ('linear', ElasticNetCV(alphas=alphas, l1_ratio=[.1, .5, .7, .9, .95, .99, 1], fit_intercept=False))]),
	]

	np.set_printoptions(suppress=True)
	plt.figure(figsize=(15, 15), facecolor='w')
	d_pool = np.arange(1, N, 1)
	m = d_pool.size
	# One colour per degree: the integer RGB value is interpolated linearly
	# from 16711680 (0xFF0000) down to 255 (0x0000FF).
	clrs = ['#%06x' % int(c) for c in np.linspace(16711680, 255, m)]
	line_width = np.linspace(5, 2, m)
	titles = 'linear regression', 'Ridge regression', 'Lasso', 'ElasticNet'
	# xss() appends to these lists as globals, so they must stay module-level names.
	tss_list = []
	rss_list = []
	ess_list = []
	ess_rss_list = []

	# Dense grid used to draw each fitted curve; it does not depend on the model
	# or the degree, so it is built once.
	x_hat = np.linspace(x.min(), x.max(), num=100)
	x_hat.shape = -1, 1

	for t, (title, model) in enumerate(zip(titles, models)):
		plt.subplot(2, 2, t+1)
		plt.plot(x, y, 'ro', ms=10, zorder=N)
		for i, d in enumerate(d_pool):
			model.set_params(poly__degree=d)
			model.fit(x, y.ravel())
			# Fetch the fitted estimator of the 'linear' step directly.
			lin = model.named_steps['linear']

			# Build "<title>:<d> level, [alpha = ..., ][l1_ratio = ..., ]parameters:".
			output = '%s:%d level, ' % (title, d)
			if hasattr(lin, 'alpha_'):
				output += 'alpha = %.6f, ' % lin.alpha_
			# The CV estimator picks one l1_ratio (float) out of the candidate list.
			if hasattr(lin, 'l1_ratio_'):
				output += 'l1_ratio = %.6f, ' % lin.l1_ratio_
			output += 'parameters:'
			print("output:\n", output)
			print("lin.coef_.ravel():\n", lin.coef_.ravel())

			y_hat = model.predict(x_hat)
			s = model.score(x, y)
			r2, corr_coef = xss(y, model.predict(x))
			print("R2 and corrlated params:", r2, corr_coef)
			print('R2:', s, '\n')

			# Draw the degree-2 curve above the other curves.
			z = N - 1 if (d == 2) else 0
			label = '%d level, $R^2 $=%.3f' % (d, s)
			if hasattr(lin, 'l1_ratio_'):
				# Fixed legend typo: was 'L1 ration'.
				label += ', L1 ratio=%.2f' % lin.l1_ratio_

			plt.plot(x_hat, y_hat, color=clrs[i], lw=line_width[i], alpha=0.75, label=label, zorder=z)
		plt.legend(loc='upper left')
		plt.grid(True)
		plt.title(title, fontsize=18)
		plt.xlabel("X", fontsize=15)
		plt.ylabel("Y", fontsize=15)

	# Leave the top 5% of the figure free for the super-title.
	plt.tight_layout(pad=2.5, w_pad=0.5, rect=(0, 0, 1, 0.95))
	plt.suptitle('multiply curve fitness compare', fontsize=22)
	plt.show()

	# Second figure: the statistics recorded by xss(), one point per trial in the
	# order the trials were run.
	y_max = max(max(tss_list), max(ess_rss_list)) * 1.05
	plt.figure(figsize=(15,15), facecolor='w')
	t = np.arange(len(tss_list))
	plt.plot(t, tss_list, 'ro-', lw=2, label='Tss(Total Sum of Squares)')
	plt.plot(t, ess_list, 'mo-', lw=1, label='Ess(Explained Sum of Squares)')
	# Fixed legend: this curve is RSS, it was mislabelled as 'Ess(...)'.
	plt.plot(t, rss_list, 'bo-', lw=1, label='Rss(Residual Sum of Squares)')
	plt.plot(t, ess_rss_list, 'go-', lw=2, label='ESS + RSS')
	plt.ylim((0, y_max))
	plt.legend(loc='center right')
	plt.xlabel('trial:linear regression/RIdge?Lasso?ElasticNet', fontsize=15)
	plt.ylabel('XSS value', fontsize=15)
	plt.title('Total Sum Of Tss = ?', fontsize=18)
	plt.grid(True)
	plt.show()


第一张图中,四种回归算法的区别仅在于损失函数(loss function)不同,但从拟合效果来看,后两种(Lasso 和 ElasticNet)明显更好。


接着,我们使用决策树以及bagging的决策树进行拟合看看效果:(bagging的决策树与随机森林的思想相近;注意它并不是GBDT,GBDT属于boosting方法。原理部分本文不再赘述)

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.linear_model import RidgeCV
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

def f(x):
	"""Evaluate the target curve: three Gaussian-shaped bumps at -3, 0 and 3.

	The side bumps have weight 0.5 and the central bump has weight 1.

	:param x: scalar or NumPy array of positions.
	:return: curve value(s) at ``x``.
	"""
	left_bump = 0.5*np.exp(-(x + 3)**2)
	centre_bump = np.exp(-x**2)
	right_bump = 0.5*np.exp(-(x - 3)**2)
	return left_bump + centre_bump + right_bump

if __name__ == '__main__':
	# Demo script: fit noisy samples of f(x) with a decision tree, a polynomial
	# ridge regression and bagged versions of both, then plot the four fitted
	# curves against the training points and the true curve.
	np.random.seed(0)
	N = 500
	# Draw N samples in [-5, 5) and sort them.
	x = np.random.rand(N) *10 - 5
	x = np.sort(x)
	y = f(x) + 0.05*np.random.randn(N)
	x.shape = -1, 1

	# Single source of truth for the polynomial degree so that the legend
	# labels cannot drift from the model (they used to say "6 Degree").
	degree = 10
	ridge = RidgeCV(alphas=np.logspace(-3, 2, 10), fit_intercept=False)
	ridged = Pipeline([('poly', PolynomialFeatures(degree=degree)), ('Ridge', ridge)])
	bagging_ridged = BaggingRegressor(ridged, n_estimators=100, max_samples=0.3)
	dtr = DecisionTreeRegressor(max_depth=6)
	regs = [
		('DecisionTree Regressor', dtr),
		('Ridge Regressor(%d Degree)' % degree, ridged),
		('Bagging Ridge(%d Degree)' % degree, bagging_ridged),
		('Bagging DecisionTree Regressor', BaggingRegressor(dtr, n_estimators=100, max_samples=0.3)),
	]

	# Evaluation grid stretched 10% beyond the training range on both sides.
	x_test = np.linspace(1.1*x.min(), 1.1*x.max(), 1000)
	# Column-vector view for predict(); built once instead of once per model.
	x_test_column = x_test.reshape(-1, 1)

	plt.figure(figsize=(12, 8), facecolor='w')
	plt.plot(x, y, 'ro', label='train datas')
	plt.plot(x_test, f(x_test), color='k', lw=4, label='real datas')
	clrs = 'bmyg'
	for i, (name, reg) in enumerate(regs):
		reg.fit(x, y)
		y_test = reg.predict(x_test_column)
		# Later models get thicker lines and are drawn underneath the earlier ones.
		plt.plot(x_test, y_test.ravel(), color=clrs[i], lw=i+1, label=name, zorder=6-i)
	plt.legend(loc='upper left')
	plt.xlabel('x', fontsize=15)
	plt.ylabel('y', fontsize=15)
	plt.title('regression curve fittness', fontsize=20)
	plt.ylim((-0.2, 1.2))
	# tight_layout() only accepts keyword arguments in current Matplotlib, so the
	# former positional call tight_layout(True) raises TypeError; use the defaults.
	plt.tight_layout()
	plt.grid(True)
	plt.show()

显然,使用线性拟合(这里是多项式岭回归)时,对训练数据的拟合效果是很好的,大部分数据点都能落在拟合曲线附近;而单棵决策树的拟合曲线呈锯齿状,拟合效果也不及线性拟合,这也是为什么常说线性模型是强学习器,而单棵决策树是弱学习器。加入bagging(注意:bagging并不是GBDT,GBDT属于boosting方法)后,决策树的拟合效果改善很大,不过其效果也不会达到线性拟合那么“完美”。
  • 2
    点赞
  • 15
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值