集成学习-task15

集成学习案例二(蒸汽量预测)

导入库和数据

import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns
# 模型
import  pandas as pd 
import numpy as np 
from scipy import stats
import statsmodels
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV,RepeatedKFold,cross_val_predict,KFold
from sklearn.metrics import make_scorer,mean_squared_error
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.svm import LinearSVR,SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures,MinMaxScaler,StandardScaler
# Both competition files sit in the same local folder; spell the folder out
# once so that only one line needs editing when running on another machine.
DATA_DIR = r'C:\Users\LiXiang\OneDrive\文档\WeChat Files\lx12633036\FileStorage\File\2021-05\CH6-集成学习之案例分享\集成学习案例分析2'
data_train = pd.read_csv(DATA_DIR + '\\train.txt', sep='\t')
data_test = pd.read_csv(DATA_DIR + '\\test.txt', sep='\t')
data_train
V0V1V2V3V4V5V6V7V8V9...V29V30V31V32V33V34V35V36V37target
00.5660.016-0.1430.4070.452-0.901-1.812-2.360-0.436-2.114...0.1360.109-0.6150.327-4.627-4.789-5.101-2.608-3.5080.175
10.9680.4370.0660.5660.194-0.893-1.566-2.3600.332-2.114...-0.1280.1240.0320.600-0.8430.1600.364-0.335-0.7300.676
21.0130.5680.2350.3700.112-0.797-1.367-2.3600.396-2.114...-0.0090.3610.277-0.116-0.8430.1600.3640.765-0.5890.633
30.7330.3680.2830.1650.599-0.679-1.200-2.0860.403-2.114...0.0150.4170.2790.603-0.843-0.0650.3640.333-0.1120.206
40.6840.6380.2600.2090.337-0.454-1.073-2.0860.314-2.114...0.1831.0780.3280.418-0.843-0.2150.364-0.280-0.0280.384
..................................................................
28830.190-0.025-0.1380.1610.600-0.2120.7570.584-0.0260.904...0.128-0.2080.809-0.1730.247-0.027-0.3490.5760.6860.235
28840.5070.5570.2960.1830.530-0.2370.7490.5840.5370.904...0.291-0.2870.465-0.3100.7630.498-0.349-0.615-0.3801.042
2885-0.394-0.721-0.4850.0840.1360.0340.6550.614-0.8180.904...0.291-0.1790.2680.5520.7630.498-0.3490.9510.7480.005
2886-0.219-0.282-0.344-0.0490.449-0.1400.5600.583-0.5960.904...0.2161.061-0.0511.0230.8780.610-0.230-0.3010.5550.350
28870.3680.380-0.225-0.0490.3790.0920.5500.5510.2440.904...0.0470.057-0.0420.8470.534-0.009-0.190-0.5670.3880.417

2888 rows × 39 columns

data_test
V0V1V2V3V4V5V6V7V8V9...V28V29V30V31V32V33V34V35V36V37
00.3680.380-0.225-0.0490.3790.0920.5500.5510.2440.904...-0.4490.0470.057-0.0420.8470.534-0.009-0.190-0.5670.388
10.1480.489-0.247-0.0490.122-0.2010.4870.493-0.1270.904...-0.4430.0470.5600.1760.5510.046-0.2200.008-0.2940.104
2-0.166-0.062-0.3110.046-0.0550.0630.4850.493-0.2270.904...-0.458-0.3980.1010.1990.6340.017-0.2340.0080.3730.569
30.1020.294-0.2590.051-0.1830.1480.4740.5040.0100.904...-0.456-0.3981.0070.1371.042-0.040-0.2900.008-0.6660.391
40.3000.4280.2080.051-0.0330.1160.4080.4970.1550.904...-0.458-0.7760.2910.3700.181-0.040-0.2900.008-0.140-0.497
..................................................................
1920-1.362-1.553-3.096-0.4440.3811.375-4.854-5.331-4.074-3.838...0.5250.171-4.488-5.793-4.050-1.187-0.852-2.131-2.5640.597
1921-2.698-3.452-3.620-1.066-1.3851.378-4.927-5.103-4.393-1.683...-0.4461.297-0.613-7.698-0.674-1.187-0.852-2.131-2.5641.215
1922-2.615-3.564-3.402-0.422-1.2721.121-4.223-4.315-5.196-3.407...-0.4470.5520.125-6.1110.275-1.851-1.548-1.537-2.5441.612
1923-2.661-3.646-3.271-0.699-1.2701.116-3.716-3.809-4.735-2.976...-0.4470.3181.086-5.2680.683-1.645-1.471-1.537-2.5491.431
1924-2.321-3.037-3.214-1.594-0.9101.259-3.616-3.747-4.368-2.976...-0.4420.323-0.774-5.2111.618-1.703-1.471-1.537-1.1231.988

1925 rows × 38 columns

# Tag every row with the split it came from so that train and test rows can
# still be told apart after the two frames are stacked into one.
# (The column name "oringin" is kept exactly as spelled: later cells index by it.)
for frame, split_name in ((data_train, 'train'), (data_test, 'test')):
    frame["oringin"] = split_name
data_all = pd.concat([data_train, data_test], axis=0, ignore_index=True)
data_all.head()
V0V1V2V3V4V5V6V7V8V9...V30V31V32V33V34V35V36V37targetoringin
00.5660.016-0.1430.4070.452-0.901-1.812-2.360-0.436-2.114...0.109-0.6150.327-4.627-4.789-5.101-2.608-3.5080.175train
10.9680.4370.0660.5660.194-0.893-1.566-2.3600.332-2.114...0.1240.0320.600-0.8430.1600.364-0.335-0.7300.676train
21.0130.5680.2350.3700.112-0.797-1.367-2.3600.396-2.114...0.3610.277-0.116-0.8430.1600.3640.765-0.5890.633train
30.7330.3680.2830.1650.599-0.679-1.200-2.0860.403-2.114...0.4170.2790.603-0.843-0.0650.3640.333-0.1120.206train
40.6840.6380.2600.2090.337-0.454-1.073-2.0860.314-2.114...1.0780.3280.418-0.843-0.2150.364-0.280-0.0280.384train

5 rows × 40 columns

数据分布查看

  • 这里因为是传感器的数据,即连续变量,所以使用 kdeplot(核密度估计图) 进行数据的初步分析,即EDA。
# Overlay the train and test kernel-density estimates of every feature column
# (all columns except the trailing 'target' and 'oringin').
is_train = data_all['oringin'] == 'train'
is_test = data_all['oringin'] == 'test'
for column in data_all.columns[:-2]:
    feature = data_all[column]
    g = sns.kdeplot(feature[is_train], color='Red', shade=True)
    # Draw the test curve on the same axes as the train curve.
    g = sns.kdeplot(feature[is_test], ax=g, color='Blue', shade=True)
    g.set_xlabel(column)
    g.set_ylabel("Frequency")
    g = g.legend(['Train', 'Test'], loc=2)
    plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-wuvSWEMa-1621777034424)(output_10_0.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-dRETn60k-1621777034426)(output_10_1.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-z5rc3Nou-1621777034428)(output_10_2.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-LzOCDaam-1621777034429)(output_10_3.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-rcYCxR7f-1621777034430)(output_10_4.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-SUOB1Bzy-1621777034430)(output_10_5.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-dWrytqgD-1621777034431)(output_10_6.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-RXpC3Wt5-1621777034431)(output_10_7.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-rAJ7CgpJ-1621777034432)(output_10_8.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-JH4zOnr1-1621777034432)(output_10_9.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-y1D5bOJ4-1621777034432)(output_10_10.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-lR4uCCEO-1621777034433)(output_10_11.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-B5fZjJAt-1621777034433)(output_10_12.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-HbFJ96qx-1621777034434)(output_10_13.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-JvIo2NP6-1621777034434)(output_10_14.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-00q5icxP-1621777034435)(output_10_15.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-xLogYQF2-1621777034435)(output_10_16.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-mLeSHQxO-1621777034436)(output_10_17.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Jgw8lFCu-1621777034436)(output_10_18.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-3Kv7Ixkr-1621777034437)(output_10_19.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-vsX5vAYb-1621777034437)(output_10_20.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-vk1rHbak-1621777034438)(output_10_21.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-KWspkzTC-1621777034438)(output_10_22.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-ExB8v58l-1621777034439)(output_10_23.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-uzFdhoMr-1621777034439)(output_10_24.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-pOks8Kwc-1621777034439)(output_10_25.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-EhdMbTfX-1621777034440)(output_10_26.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-aTbVKka3-1621777034440)(output_10_27.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-dVPdjcaD-1621777034440)(output_10_28.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-COH69zTs-1621777034441)(output_10_29.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-qPJ3QrwQ-1621777034441)(output_10_30.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-hc01tlQ1-1621777034442)(output_10_31.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-Knyi8aWC-1621777034442)(output_10_32.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-I9yBSgu4-1621777034443)(output_10_33.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-urgF9lDE-1621777034444)(output_10_34.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-po7sKRNS-1621777034444)(output_10_35.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-VKaCsEsn-1621777034445)(output_10_36.svg)]

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-1RVslQvP-1621777034445)(output_10_37.svg)]

  • 从上图可以看出,V5、V9、V11、V17、V22 这几个特征在训练集和测试集上的分布明显不一致,保留它们会影响模型在测试集上的泛化能力,所以我们删除这些特征数据
# Remove, in place, the features whose train/test distributions were judged
# to differ in the KDE plots.
mismatched_features = ['V5', 'V9', 'V11', 'V17', 'V22']
data_all.drop(columns=mismatched_features, inplace=True)

查看特征间的相关性

spearman方法

  • 不用归一化,先查看相关性
# Spearman correlation heat map over the training rows (features + target).
relation_train = data_all[data_all['oringin'] == 'train'].drop("oringin", axis=1)
plt.figure(figsize=(20, 16))
colnm = relation_train.columns.tolist()
# Pairwise rank-correlation matrix between every two columns.
mcorr = relation_train[colnm].corr(method='spearman')
# Boolean mask with the same shape as mcorr. The builtin `bool` is used
# because the `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24.
mask = np.zeros_like(mcorr, dtype=bool)
# Hide the upper triangle (diagonal included) so each pair is shown only once.
mask[np.triu_indices_from(mask)] = True
# Diverging matplotlib colormap for the heat map.
cmap = sns.diverging_palette(220, 10, as_cmap=True)
g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-lq16nHxU-1621777034446)(output_15_0.svg)]

# Dimensionality reduction: list the features whose absolute correlation with
# the target falls below the threshold.
# NOTE(review): DataFrame.corr() is called without `method`, so this uses the
# pandas default (Pearson) even though the section is headed "spearman".
# This drop_col is only displayed; it is recomputed before data_all is changed.
threshold = 0.1
corr_matrix = relation_train.corr().abs()
weakly_correlated = corr_matrix["target"] < threshold
drop_col = corr_matrix.loc[weakly_correlated].index
drop_col
Index(['V14', 'V21', 'V25', 'V26', 'V32', 'V33', 'V34'], dtype='object')

皮尔森方法

  • 先进行归一化,再查看相关性(注意:Min-Max 归一化是线性变换,皮尔森相关系数在线性变换下保持不变,因此这里筛选出的特征与归一化之前完全相同)
# Fresh frame holding only the training rows, without the split-marker column.
train_rows = data_all['oringin'] == 'train'
relation_train_1 = data_all.loc[train_rows].drop(columns="oringin")
cols_numeric = relation_train_1.columns.tolist()
def scale_minmax(col):
    """Rescale *col* linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        col: Object exposing ``min()``/``max()`` and arithmetic, e.g. a pandas
            Series or a NumPy array.

    Returns:
        ``(col - min) / (max - min)``. When every value is identical the range
        is zero; zeros are returned in that case instead of the NaNs that a
        0/0 division would produce.
    """
    lo = col.min()
    span = col.max() - lo
    # Only guard the scalar case; a non-scalar span (e.g. from a DataFrame)
    # keeps the plain element-wise division.
    if np.ndim(span) == 0 and span == 0:
        return (col - lo) * 0.0
    return (col - lo) / span
# Min-max scale every column except the target, then summarise the result.
scale_cols = [name for name in cols_numeric if name != 'target']
scaled_features = relation_train_1[scale_cols].apply(scale_minmax)
relation_train_1[scale_cols] = scaled_features
relation_train_1[scale_cols].describe()
V0V1V2V3V4V6V7V8V10V12...V28V29V30V31V32V33V34V35V36V37
count2888.0000002888.0000002888.0000002888.0000002888.0000002888.0000002888.0000002888.0000002888.0000002888.000000...2888.0000002888.0000002888.0000002888.0000002888.0000002888.0000002888.0000002888.0000002888.0000002888.000000
mean0.6905280.7355210.5937450.6063010.6398760.7354180.7413370.7020120.3531590.663280...0.3620480.4017150.6340300.7605170.6317940.4592550.4844650.7348500.3363060.527854
std0.1437470.1337380.1458440.1513020.1195500.1418720.1371110.1290820.1306010.114305...0.1308610.1416440.1253380.1109030.1399790.0997820.1013530.1229170.1237330.153423
min0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
25%0.6254650.6954190.4972790.5151650.5863280.6592490.6823140.6534530.2918130.606750...0.2786380.3000530.5869930.7226560.5654080.4090370.4544900.6849360.2797600.427112
50%0.7270760.7662640.6091550.6099330.6529400.7671150.7740450.7285570.3697060.676042...0.2797640.3856110.6337550.7823300.6346150.4545180.4999490.7555800.3498600.519532
75%0.7839220.8126420.6943420.7141740.7121850.8356130.8369580.7810290.4320540.739069...0.4453980.4881540.6941360.8249490.7149500.5042610.5113650.7852600.4145110.622210
max1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000...1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000

8 rows × 33 columns

# Pearson correlation heat map over the min-max scaled training rows.
plt.figure(figsize=(20, 16))
colnm = relation_train_1.columns.tolist()
# Pairwise linear-correlation matrix between every two columns.
mcorr = relation_train_1[colnm].corr(method='pearson')
# Boolean mask with the same shape as mcorr. The builtin `bool` is used
# because the `np.bool` alias was deprecated in NumPy 1.20 and removed in 1.24.
mask = np.zeros_like(mcorr, dtype=bool)
# Hide the upper triangle (diagonal included) so each pair is shown only once.
mask[np.triu_indices_from(mask)] = True
# Diverging matplotlib colormap for the heat map.
cmap = sns.diverging_palette(220, 10, as_cmap=True)
g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')
plt.show()

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-bx1l4Akf-1621777034446)(output_19_0.svg)]

# List the features whose absolute correlation with the target is below the
# threshold (DataFrame.corr() default method), show them, then remove them
# from the combined train+test frame in place.
threshold = 0.1
corr_matrix = relation_train_1.corr().abs()
weakly_correlated = corr_matrix["target"] < threshold
drop_col = corr_matrix.loc[weakly_correlated].index
drop_col
Index(['V14', 'V21', 'V25', 'V26', 'V32', 'V33', 'V34'], dtype='object')
data_all.drop(columns=drop_col, inplace=True)

归一化

# Every remaining column except the split marker: the features plus 'target'.
cols_numeric = data_all.columns.tolist()
cols_numeric.remove("oringin")
def scale_minmax(col):
    """Rescale *col* linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        col: Object exposing ``min()``/``max()`` and arithmetic, e.g. a pandas
            Series or a NumPy array.

    Returns:
        ``(col - min) / (max - min)``. When every value is identical the range
        is zero; zeros are returned in that case instead of the NaNs that a
        0/0 division would produce.
    """
    lo = col.min()
    span = col.max() - lo
    # Only guard the scalar case; a non-scalar span (e.g. from a DataFrame)
    # keeps the plain element-wise division.
    if np.ndim(span) == 0 and span == 0:
        return (col - lo) * 0.0
    return (col - lo) / span
# Min-max scale every feature of the combined train+test frame (the target is
# left untouched), then summarise the scaled columns.
scale_cols = [name for name in cols_numeric if name != 'target']
scaled_features = data_all[scale_cols].apply(scale_minmax)
data_all[scale_cols] = scaled_features
data_all[scale_cols].describe()
V0V1V2V3V4V6V7V8V10V12...V23V24V27V28V29V30V31V35V36V37
count4813.0000004813.0000004813.0000004813.0000004813.0000004813.0000004813.0000004813.0000004813.0000004813.000000...4813.0000004813.0000004813.0000004813.0000004813.0000004813.0000004813.0000004813.0000004813.0000004813.000000
mean0.6941720.7213570.6023000.6031390.5237430.7488230.7457400.7156070.3485180.578507...0.7444380.3567120.8814010.3426530.3886830.5894590.7927090.7628730.3323850.545795
std0.1441980.1314430.1406280.1524620.1064300.1325600.1325770.1181050.1348820.105088...0.1340850.2655120.1282210.1407310.1334750.1307860.1029760.1020370.1274560.150356
min0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
25%0.6266760.6794160.5144140.5038880.4781820.6833240.6969380.6649340.2843270.532892...0.7193620.0406160.8885750.2787780.2924450.5500920.7618160.7272730.2705840.445647
50%0.7294880.7524970.6170720.6142700.5358660.7741250.7719740.7428840.3664690.591635...0.7888170.3817360.9160150.2799040.3757340.5944280.8150550.8000200.3470560.539317
75%0.7901950.7995530.7004640.7104740.5850360.8422590.8364050.7908350.4329650.641971...0.7927060.5747280.9325550.4130310.4718370.6507980.8522290.8000200.4148610.643061
max1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000...1.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.0000001.000000

8 rows × 26 columns

特征工程

  • 绘图显示Box-Cox变换对数据分布影响,Box-Cox用于连续的响应变量不满足正态分布的情况。在进行Box-Cox变换之后,可以一定程度上减小不可观测的误差和预测变量的相关性。
# Box-Cox diagnostics: one row of six panels per feature -- distribution,
# probability plot and scatter against the target, first for the feature as
# it is and then for its Box-Cox transformed version.
fcols = 6
frows = len(cols_numeric) - 1  # cols_numeric holds the features plus 'target'
plt.figure(figsize=(4 * fcols, 4 * frows))
i = 0

for var in cols_numeric:
    if var == 'target':
        continue

    # dropna() discards the rows that have no target value.
    dat = data_all[[var, 'target']].dropna()

    # Panel 1: distribution of the feature with a fitted normal curve.
    i += 1
    plt.subplot(frows, fcols, i)
    sns.distplot(dat[var], fit=stats.norm)
    plt.title(var + ' Original')
    plt.xlabel('')

    # Panel 2: probability plot, titled with the skewness.
    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(dat[var], plot=plt)
    plt.title('skew={:.4f}'.format(stats.skew(dat[var])))
    plt.xlabel('')
    plt.ylabel('')

    # Panel 3: feature against target, titled with their correlation.
    i += 1
    plt.subplot(frows, fcols, i)
    plt.plot(dat[var], dat['target'], '.', alpha=0.5)
    plt.title('corr={:.2f}'.format(np.corrcoef(dat[var], dat['target'])[0][1]))

    # Panel 4: distribution after Box-Cox. The +1 shift keeps the input
    # strictly positive, which Box-Cox requires; `dat` is already NaN-free,
    # so no further dropna() is needed.
    i += 1
    plt.subplot(frows, fcols, i)
    trans_var, lambda_var = stats.boxcox(dat[var] + 1)
    trans_var = scale_minmax(trans_var)
    sns.distplot(trans_var, fit=stats.norm)
    plt.title(var + ' Transformed')
    plt.xlabel('')

    # Panel 5: probability plot of the transformed feature.
    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(trans_var, plot=plt)
    plt.title('skew={:.4f}'.format(stats.skew(trans_var)))
    plt.xlabel('')
    plt.ylabel('')

    # Panel 6: transformed feature against target.
    i += 1
    plt.subplot(frows, fcols, i)
    plt.plot(trans_var, dat['target'], '.', alpha=0.5)
    plt.title('corr={:.2f}'.format(np.corrcoef(trans_var, dat['target'])[0][1]))
# NOTE(review): this cell repeats the Box-Cox diagnostic grid drawn by the
# cell directly above it; it is kept so the document content is unchanged.
fcols = 6
frows = len(cols_numeric) - 1  # one grid row per non-target column
plt.figure(figsize=(4 * fcols, 4 * frows))
i = 0
feature_names = [name for name in cols_numeric if name != 'target']
for var in feature_names:
    # Keep only the rows that have a target value.
    dat = data_all[[var, 'target']].dropna()
    raw_values = dat[var]
    target_values = dat['target']

    # Left half of the row: the feature before transformation.
    i += 1
    plt.subplot(frows, fcols, i)
    sns.distplot(raw_values, fit=stats.norm)
    plt.title(var + ' Original')
    plt.xlabel('')

    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(raw_values, plot=plt)
    plt.title('skew={:.4f}'.format(stats.skew(raw_values)))
    plt.xlabel('')
    plt.ylabel('')

    i += 1
    plt.subplot(frows, fcols, i)
    plt.plot(raw_values, target_values, '.', alpha=0.5)
    plt.title('corr={:.2f}'.format(np.corrcoef(raw_values, target_values)[0][1]))

    # Right half of the row: the same three views after Box-Cox. The +1 shift
    # keeps the input strictly positive; `dat` is already NaN-free, so the
    # original second dropna() was redundant.
    trans_var, lambda_var = stats.boxcox(raw_values + 1)
    trans_var = scale_minmax(trans_var)

    i += 1
    plt.subplot(frows, fcols, i)
    sns.distplot(trans_var, fit=stats.norm)
    plt.title(var + ' Transformed')
    plt.xlabel('')

    i += 1
    plt.subplot(frows, fcols, i)
    _ = stats.probplot(trans_var, plot=plt)
    plt.title('skew={:.4f}'.format(stats.skew(trans_var)))
    plt.xlabel('')
    plt.ylabel('')

    i += 1
    plt.subplot(frows, fcols, i)
    plt.plot(trans_var, target_values, '.', alpha=0.5)
    plt.title('corr={:.2f}'.format(np.corrcoef(trans_var, target_values)[0][1]))

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值