Cross-Validation on Fiscal Revenue Data

A quick write-up; the fiscal revenue dataset is easy to find with a search.
This post mainly compares three regularized regressions: ridge, lasso, and elastic net.
PS: please don't ask questions, because I don't know the answers either (in case anyone does ask).

Descriptive analysis

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
sns.set() #if you want to use seaborn themes with matplotlib functions
import warnings
warnings.filterwarnings('ignore')
rand_state= 1000
df=pd.read_csv(r'D:\桌面\python代码+数据\chapter6\demo\data\data.csv')
df.head()

[Output: first five rows of the dataset]

sns.scatterplot(x='x1', y='y', data=df)
plt.show()

[Figure: scatter plot of x1 vs. y]

# Standardize the features before training
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Keep the result as a DataFrame so columns can still be selected by name below
df_sc = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
df_sc.iloc[0]

[Output: first row of the standardized data]

df.describe()

[Output: summary statistics of the dataset]

sns.scatterplot(x='x2', y='y', data=df_sc)
plt.show()

[Figure: scatter plot of x2 vs. y on the standardized data]
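
One caveat on the preprocessing: StandardScaler above is fit on the full dataset, so the rows that will later form the test set leak into the scaling statistics. A minimal sketch of the more cautious alternative (not what this post does) is to split first and fit the scaler on the training rows only:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split the raw data first, then fit the scaler on the training rows only
train_raw, test_raw = train_test_split(df, test_size=0.2, random_state=rand_state)
scaler_tr = StandardScaler().fit(train_raw)
train_sc = pd.DataFrame(scaler_tr.transform(train_raw), columns=df.columns, index=train_raw.index)
test_sc = pd.DataFrame(scaler_tr.transform(test_raw), columns=df.columns, index=test_raw.index)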

Splitting the data

y = df_sc['y']
X = df_sc.drop('y', axis=1)  # note: drop returns a copy; inplace=False is the default

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=rand_state)
X_train.head()

[Output: first five rows of X_train]

The predictors are highly correlated with one another, as the correlation heatmap below confirms. Let's first run an ordinary linear regression (using statsmodels.api) and treat its results as the baseline.

X_test_wc = sm.add_constant(X_test)    # add an intercept column
X_train_wc = sm.add_constant(X_train)  # add an intercept column

Correlation heatmap

corrdf=df.corr()
corrdf
plt.figure(figsize=(10,10))
sns.heatmap(corrdf,annot=True,cmap='RdGy')
plt.show()

[Figure: correlation heatmap of all variables]
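
Beyond pairwise correlations, variance inflation factors give a per-variable measure of multicollinearity. A minimal sketch using statsmodels (not part of the original analysis):

from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF for each predictor; values well above 10 signal strong multicollinearity
X_vif = sm.add_constant(df.drop('y', axis=1))
vif = pd.Series([variance_inflation_factor(X_vif.values, i) for i in range(X_vif.shape[1])],
                index=X_vif.columns)
print(vif.drop('const'))  # the intercept's VIF is not meaningful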

label=['y']
gdp=(list(df.y))
plt.figure(figsize=(6,4))
plt.boxplot(gdp,notch=True,labels=label,meanline=True)
plt.show()

[Figure: box plot of y]

OLS regression results

model = sm.OLS(y_train, X_train_wc).fit()  # fit OLS on the training data (intercept column added above)
model.summary()

[Output: OLS regression summary]

from sklearn.linear_model import LinearRegression, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
from sklearn.linear_model import Ridge
model_linear = LinearRegression()
model_ridge = Ridge(alpha=10)
model_lasso = Lasso(alpha=10)
model_net = ElasticNet(alpha=10)
y_hat_linear= model_linear.fit(X_train, y_train).predict(X_test)
y_hat_ridge = model_ridge.fit(X_train, y_train).predict(X_test)
y_hat_lasso = model_lasso.fit(X_train, y_train).predict(X_test)
y_hat_net = model_net.fit(X_train, y_train).predict(X_test)
df_predictions = pd.DataFrame({'y_test': y_test,
                               'y_hat_linear': y_hat_linear,
                               'y_hat_ridge': y_hat_ridge,
                               'y_hat_lasso': y_hat_lasso,
                               'y_hat_net': y_hat_net})
df_predictions.head()

[Output: first five rows of df_predictions]
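
A quick way to compare the four sets of predictions visually is to plot each against the actual test values; the dashed diagonal marks perfect prediction (a sketch, reusing df_predictions):

# Predicted vs. actual values on the test set for each model
fig, ax = plt.subplots(figsize=(6, 6))
for col in ['y_hat_linear', 'y_hat_ridge', 'y_hat_lasso', 'y_hat_net']:
    ax.scatter(df_predictions['y_test'], df_predictions[col], label=col, alpha=0.7)
lims = [df_predictions['y_test'].min(), df_predictions['y_test'].max()]
ax.plot(lims, lims, 'k--')  # perfect-prediction reference line
ax.set_xlabel('actual y (test)')
ax.set_ylabel('predicted y')
ax.legend()
plt.show()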

df.drop('y', axis=1, inplace=False).columns

[Output: names of the predictor columns]

coefficients = pd.DataFrame({'Features':df.drop('y', axis=1, inplace=False).columns})
coefficients['model_lin']= model_linear.coef_
coefficients['model_ridge']= model_ridge.coef_
coefficients['model_lasso']= model_lasso.coef_
coefficients['model_net']= model_net.coef_
coefficients

[Output: coefficient table for the four models]
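
To see the shrinkage side by side, the coefficient table can be plotted directly; with alpha=10 on standardized data the lasso typically drives most coefficients to exactly zero (a minimal sketch):

# Bar plot of the coefficients of the four models, grouped by feature
coefficients.set_index('Features').plot(kind='bar', figsize=(12, 5))
plt.ylabel('coefficient value')
plt.title('Coefficients by model (alpha=10 for the regularized fits)')
plt.show()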

Evaluating each model's RMSE on the test set

MSE_test = np.mean(np.square(df_predictions['y_test'] - df_predictions['y_hat_linear']))
RMSE_test = np.sqrt(MSE_test)
np.round(RMSE_test,3)

0.159

MSE_test = np.mean(np.square(df_predictions['y_test'] - df_predictions['y_hat_ridge']))
RMSE_test = np.sqrt(MSE_test)
np.round(RMSE_test,3)

0.094

MSE_test = np.mean(np.square(df_predictions['y_test'] - df_predictions['y_hat_net']))
RMSE_test = np.sqrt(MSE_test)
np.round(RMSE_test,3)

1.048
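
The lasso RMSE was skipped above. A small helper (a sketch, reusing df_predictions) computes the same metric for all four prediction columns in one loop:

def rmse(y_true, y_pred):
    # Root mean squared error between two aligned series
    return np.sqrt(np.mean(np.square(y_true - y_pred)))

for col in ['y_hat_linear', 'y_hat_ridge', 'y_hat_lasso', 'y_hat_net']:
    print(col, np.round(rmse(df_predictions['y_test'], df_predictions[col]), 3))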

Plotting the regression coefficients against alpha

1) Ridge regression coefficients vs. alpha

alpha_ridge = 10**np.linspace(-2,4,100)
plt.figure(figsize=(12,5))
plt.plot(alpha_ridge,'or' )
plt.xlabel('steps')
plt.ylabel('alpha (lambda)')
plt.show()

[Figure: the alpha grid values plotted against step index]

ridge = Ridge()
coefs_ridge = []

for i in alpha_ridge:
    ridge.set_params(alpha = i)
    ridge.fit(X_train, y_train)
    coefs_ridge.append(ridge.coef_)
    
np.shape(coefs_ridge)

(100, 13)

plt.figure(figsize=(12,10))
ax = plt.gca()
ax.plot(alpha_ridge, coefs_ridge)
ax.set_xscale('log')
plt.axis('tight')
plt.xlabel('alpha')
plt.ylabel('weights: scaled coefficients')
plt.title('Ridge regression coefficients Vs. alpha')
plt.legend(df.drop('y',axis=1, inplace=False).columns)
plt.show()

[Figure: ridge regression coefficients vs. alpha]

2) Lasso regression coefficients vs. alpha

alpha_lasso = 10**np.linspace(-3,1,100)
lasso = Lasso()
coefs_lasso = []
for i in alpha_lasso:
    lasso.set_params(alpha = i)
    lasso.fit(X_train, y_train)
    coefs_lasso.append(lasso.coef_)
 
np.shape(coefs_lasso)

(100, 13)

plt.figure(figsize=(12,10))
ax = plt.gca()
ax.plot(alpha_lasso, coefs_lasso)
ax.set_xscale('log')
plt.axis('tight')
plt.xlabel('alpha')
plt.ylabel('weights: scaled coefficients')
plt.title('Lasso regression coefficients Vs. alpha')
plt.legend(df.drop('y',axis=1, inplace=False).columns)
plt.show()

[Figure: lasso regression coefficients vs. alpha]

3) Elastic net regression coefficients vs. alpha

alpha_elasticnet = 10**np.linspace(-3,2,100)
elasticnet = ElasticNet()
coefs_elasticnet = []
for i in alpha_elasticnet:
    elasticnet.set_params(alpha = i)
    elasticnet.fit(X_train, y_train)
    coefs_elasticnet.append(elasticnet.coef_)
 
np.shape(coefs_elasticnet)

(100, 13)

plt.figure(figsize=(12,10))
ax = plt.gca()
ax.plot(alpha_elasticnet, coefs_elasticnet)
ax.set_xscale('log')
plt.axis('tight')
plt.xlabel('alpha')
plt.ylabel('weights: scaled coefficients')
plt.title('Elastic Net regression coefficients Vs. alpha')
plt.legend(df.drop('y',axis=1, inplace=False).columns)
plt.show()

[Figure: elastic net regression coefficients vs. alpha]

Cross-validation (finding lambda)

1) Optimal alpha for ridge regression

ridgecv = RidgeCV()
ridgecv.fit(X_train, y_train)
ridgecv.alpha_

0.1

alpha_ridge_opt = ridgecv.alpha_
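
Note that RidgeCV() with no arguments only evaluates the three default alphas (0.1, 1.0, 10.0), and the selected value sits at the edge of that grid. A minimal sketch that reuses the wider alpha_ridge grid defined earlier (cv=5 is an illustrative choice):

# Search the wider log-spaced grid instead of the three default values
ridgecv_wide = RidgeCV(alphas=alpha_ridge, cv=5)
ridgecv_wide.fit(X_train, y_train)
print(ridgecv_wide.alpha_)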

2) Optimal alpha for lasso regression

lassocv = LassoCV()
lassocv.fit(X_train, y_train)
lassocv.alpha_
alpha_lasso_opt = lassocv.alpha_

0.0013485478432168933

3) Optimal alpha for elastic net regression

elasticnetcv = ElasticNetCV()
elasticnetcv.fit(X_train, y_train)
elasticnetcv.alpha_

0.0019027476880882372

elasticnetcv.l1_ratio_  # the CV-selected L1/L2 mix; only the default 0.5 was searched

0.5
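
The value above is simply the ElasticNetCV default mixing parameter. A sketch that also searches over l1_ratio (the grid below is illustrative, not from the original post):

# Tune both alpha and the L1/L2 mix
elasticnetcv_mix = ElasticNetCV(l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1.0], cv=5)
elasticnetcv_mix.fit(X_train, y_train)
print(elasticnetcv_mix.alpha_, elasticnetcv_mix.l1_ratio_)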

alpha_elasticnet_opt = elasticnetcv.alpha_

# Visualize cross-validated RMSE over a grid of alpha (lambda) values
from sklearn.model_selection import cross_val_score
RMSE_CV = []
iterator = np.arange(0.0, 0.02, 0.001)
for i in iterator:
    MSE = -cross_val_score(estimator=ElasticNet(alpha=i), X=X_train, y=y_train,
                           cv=5, scoring="neg_mean_squared_error")
    RMSE_CV.append(np.sqrt(MSE).mean())

output = pd.DataFrame(list(iterator), columns=['lambda_ElasticNet'])
output['RMSE_CV'] = RMSE_CV

output.head()

[Output: first five rows of the lambda/RMSE_CV table]

output['RMSE_CV'].idxmin()

3

output['lambda_ElasticNet'][output['RMSE_CV'].idxmin()]

0.003

sns.lineplot(x='lambda_ElasticNet', y='RMSE_CV', data=output, color='r', label="RMSE_CV vs. lambda_ElasticNet")
plt.show()

[Figure: cross-validated RMSE vs. lambda for the elastic net]
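
As a closing check, each regularized model could be refit with its CV-selected alpha instead of the fixed alpha=10 used earlier, and the test RMSE compared; a minimal sketch (results not shown here):

# Refit with the CV-selected alphas and report test RMSE
models_opt = {'ridge': Ridge(alpha=alpha_ridge_opt),
              'lasso': Lasso(alpha=alpha_lasso_opt),
              'elastic_net': ElasticNet(alpha=alpha_elasticnet_opt)}
for name, m in models_opt.items():
    pred = m.fit(X_train, y_train).predict(X_test)
    print(name, np.round(np.sqrt(np.mean(np.square(y_test - pred))), 3))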
