集成学习案例二(蒸汽量预测)
导入库和数据
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns
# 模型
import pandas as pd
import numpy as np
from scipy import stats
import statsmodels
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV,RepeatedKFold,cross_val_predict,KFold
from sklearn.metrics import make_scorer,mean_squared_error
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn.svm import LinearSVR,SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor,AdaBoostRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures,MinMaxScaler,StandardScaler
# Load the tab-separated training and test sets.
# NOTE(review): both paths are absolute and specific to one Windows machine.
train_path = r'C:\Users\LiXiang\OneDrive\文档\WeChat Files\lx12633036\FileStorage\File\2021-05\CH6-集成学习之案例分享\集成学习案例分析2\train.txt'
test_path = r'C:\Users\LiXiang\OneDrive\文档\WeChat Files\lx12633036\FileStorage\File\2021-05\CH6-集成学习之案例分享\集成学习案例分析2\test.txt'
data_train = pd.read_csv(train_path, sep='\t')
data_test = pd.read_csv(test_path, sep='\t')
# Notebook display of the training frame.
data_train
V0 | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V29 | V30 | V31 | V32 | V33 | V34 | V35 | V36 | V37 | target | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.566 | 0.016 | -0.143 | 0.407 | 0.452 | -0.901 | -1.812 | -2.360 | -0.436 | -2.114 | ... | 0.136 | 0.109 | -0.615 | 0.327 | -4.627 | -4.789 | -5.101 | -2.608 | -3.508 | 0.175 |
1 | 0.968 | 0.437 | 0.066 | 0.566 | 0.194 | -0.893 | -1.566 | -2.360 | 0.332 | -2.114 | ... | -0.128 | 0.124 | 0.032 | 0.600 | -0.843 | 0.160 | 0.364 | -0.335 | -0.730 | 0.676 |
2 | 1.013 | 0.568 | 0.235 | 0.370 | 0.112 | -0.797 | -1.367 | -2.360 | 0.396 | -2.114 | ... | -0.009 | 0.361 | 0.277 | -0.116 | -0.843 | 0.160 | 0.364 | 0.765 | -0.589 | 0.633 |
3 | 0.733 | 0.368 | 0.283 | 0.165 | 0.599 | -0.679 | -1.200 | -2.086 | 0.403 | -2.114 | ... | 0.015 | 0.417 | 0.279 | 0.603 | -0.843 | -0.065 | 0.364 | 0.333 | -0.112 | 0.206 |
4 | 0.684 | 0.638 | 0.260 | 0.209 | 0.337 | -0.454 | -1.073 | -2.086 | 0.314 | -2.114 | ... | 0.183 | 1.078 | 0.328 | 0.418 | -0.843 | -0.215 | 0.364 | -0.280 | -0.028 | 0.384 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
2883 | 0.190 | -0.025 | -0.138 | 0.161 | 0.600 | -0.212 | 0.757 | 0.584 | -0.026 | 0.904 | ... | 0.128 | -0.208 | 0.809 | -0.173 | 0.247 | -0.027 | -0.349 | 0.576 | 0.686 | 0.235 |
2884 | 0.507 | 0.557 | 0.296 | 0.183 | 0.530 | -0.237 | 0.749 | 0.584 | 0.537 | 0.904 | ... | 0.291 | -0.287 | 0.465 | -0.310 | 0.763 | 0.498 | -0.349 | -0.615 | -0.380 | 1.042 |
2885 | -0.394 | -0.721 | -0.485 | 0.084 | 0.136 | 0.034 | 0.655 | 0.614 | -0.818 | 0.904 | ... | 0.291 | -0.179 | 0.268 | 0.552 | 0.763 | 0.498 | -0.349 | 0.951 | 0.748 | 0.005 |
2886 | -0.219 | -0.282 | -0.344 | -0.049 | 0.449 | -0.140 | 0.560 | 0.583 | -0.596 | 0.904 | ... | 0.216 | 1.061 | -0.051 | 1.023 | 0.878 | 0.610 | -0.230 | -0.301 | 0.555 | 0.350 |
2887 | 0.368 | 0.380 | -0.225 | -0.049 | 0.379 | 0.092 | 0.550 | 0.551 | 0.244 | 0.904 | ... | 0.047 | 0.057 | -0.042 | 0.847 | 0.534 | -0.009 | -0.190 | -0.567 | 0.388 | 0.417 |
2888 rows × 39 columns
# Notebook display of the test frame (same feature columns, no 'target').
data_test
V0 | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V28 | V29 | V30 | V31 | V32 | V33 | V34 | V35 | V36 | V37 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.368 | 0.380 | -0.225 | -0.049 | 0.379 | 0.092 | 0.550 | 0.551 | 0.244 | 0.904 | ... | -0.449 | 0.047 | 0.057 | -0.042 | 0.847 | 0.534 | -0.009 | -0.190 | -0.567 | 0.388 |
1 | 0.148 | 0.489 | -0.247 | -0.049 | 0.122 | -0.201 | 0.487 | 0.493 | -0.127 | 0.904 | ... | -0.443 | 0.047 | 0.560 | 0.176 | 0.551 | 0.046 | -0.220 | 0.008 | -0.294 | 0.104 |
2 | -0.166 | -0.062 | -0.311 | 0.046 | -0.055 | 0.063 | 0.485 | 0.493 | -0.227 | 0.904 | ... | -0.458 | -0.398 | 0.101 | 0.199 | 0.634 | 0.017 | -0.234 | 0.008 | 0.373 | 0.569 |
3 | 0.102 | 0.294 | -0.259 | 0.051 | -0.183 | 0.148 | 0.474 | 0.504 | 0.010 | 0.904 | ... | -0.456 | -0.398 | 1.007 | 0.137 | 1.042 | -0.040 | -0.290 | 0.008 | -0.666 | 0.391 |
4 | 0.300 | 0.428 | 0.208 | 0.051 | -0.033 | 0.116 | 0.408 | 0.497 | 0.155 | 0.904 | ... | -0.458 | -0.776 | 0.291 | 0.370 | 0.181 | -0.040 | -0.290 | 0.008 | -0.140 | -0.497 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1920 | -1.362 | -1.553 | -3.096 | -0.444 | 0.381 | 1.375 | -4.854 | -5.331 | -4.074 | -3.838 | ... | 0.525 | 0.171 | -4.488 | -5.793 | -4.050 | -1.187 | -0.852 | -2.131 | -2.564 | 0.597 |
1921 | -2.698 | -3.452 | -3.620 | -1.066 | -1.385 | 1.378 | -4.927 | -5.103 | -4.393 | -1.683 | ... | -0.446 | 1.297 | -0.613 | -7.698 | -0.674 | -1.187 | -0.852 | -2.131 | -2.564 | 1.215 |
1922 | -2.615 | -3.564 | -3.402 | -0.422 | -1.272 | 1.121 | -4.223 | -4.315 | -5.196 | -3.407 | ... | -0.447 | 0.552 | 0.125 | -6.111 | 0.275 | -1.851 | -1.548 | -1.537 | -2.544 | 1.612 |
1923 | -2.661 | -3.646 | -3.271 | -0.699 | -1.270 | 1.116 | -3.716 | -3.809 | -4.735 | -2.976 | ... | -0.447 | 0.318 | 1.086 | -5.268 | 0.683 | -1.645 | -1.471 | -1.537 | -2.549 | 1.431 |
1924 | -2.321 | -3.037 | -3.214 | -1.594 | -0.910 | 1.259 | -3.616 | -3.747 | -4.368 | -2.976 | ... | -0.442 | 0.323 | -0.774 | -5.211 | 1.618 | -1.703 | -1.471 | -1.537 | -1.123 | 1.988 |
1925 rows × 38 columns
# Tag every row with the split it came from, so train and test rows can still
# be told apart once both frames are stacked into one.
for frame, tag in ((data_train, 'train'), (data_test, 'test')):
    frame["oringin"] = tag
# Stack train on top of test with a fresh 0..n-1 index.
data_all = pd.concat([data_train, data_test], ignore_index=True)
data_all.head()
V0 | V1 | V2 | V3 | V4 | V5 | V6 | V7 | V8 | V9 | ... | V30 | V31 | V32 | V33 | V34 | V35 | V36 | V37 | target | oringin | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.566 | 0.016 | -0.143 | 0.407 | 0.452 | -0.901 | -1.812 | -2.360 | -0.436 | -2.114 | ... | 0.109 | -0.615 | 0.327 | -4.627 | -4.789 | -5.101 | -2.608 | -3.508 | 0.175 | train |
1 | 0.968 | 0.437 | 0.066 | 0.566 | 0.194 | -0.893 | -1.566 | -2.360 | 0.332 | -2.114 | ... | 0.124 | 0.032 | 0.600 | -0.843 | 0.160 | 0.364 | -0.335 | -0.730 | 0.676 | train |
2 | 1.013 | 0.568 | 0.235 | 0.370 | 0.112 | -0.797 | -1.367 | -2.360 | 0.396 | -2.114 | ... | 0.361 | 0.277 | -0.116 | -0.843 | 0.160 | 0.364 | 0.765 | -0.589 | 0.633 | train |
3 | 0.733 | 0.368 | 0.283 | 0.165 | 0.599 | -0.679 | -1.200 | -2.086 | 0.403 | -2.114 | ... | 0.417 | 0.279 | 0.603 | -0.843 | -0.065 | 0.364 | 0.333 | -0.112 | 0.206 | train |
4 | 0.684 | 0.638 | 0.260 | 0.209 | 0.337 | -0.454 | -1.073 | -2.086 | 0.314 | -2.114 | ... | 1.078 | 0.328 | 0.418 | -0.843 | -0.215 | 0.364 | -0.280 | -0.028 | 0.384 | train |
5 rows × 40 columns
数据分布查看
- 这里因为是传感器的数据,即连续变量,所以使用 kdeplot(核密度估计图) 进行数据的初步分析,即EDA。
# Overlay the train and test kernel density estimates of each feature, one
# figure per column. The slice skips the last two columns ('target' and
# 'oringin').
is_train = data_all['oringin'] == 'train'
is_test = data_all['oringin'] == 'test'
for column in data_all.columns[0:-2]:
    ax = sns.kdeplot(data_all.loc[is_train, column], color='Red', shade=True)
    # Draw the test density on the same axes for a direct comparison.
    sns.kdeplot(data_all.loc[is_test, column], ax=ax, color='Blue', shade=True)
    ax.set_xlabel(column)
    ax.set_ylabel("Frequency")
    ax.legend(['Train', 'Test'], loc=2)
    plt.show()
![svg](output_10_0.svg)
![svg](output_10_1.svg)
![svg](output_10_2.svg)
![svg](output_10_3.svg)
![svg](output_10_4.svg)
![svg](output_10_5.svg)
![svg](output_10_6.svg)
![svg](output_10_7.svg)
![svg](output_10_8.svg)
![svg](output_10_9.svg)
![svg](output_10_10.svg)
![svg](output_10_11.svg)
![svg](output_10_12.svg)
![svg](output_10_13.svg)
![svg](output_10_14.svg)
![svg](output_10_15.svg)
![svg](output_10_16.svg)
![svg](output_10_17.svg)
![svg](output_10_18.svg)
![svg](output_10_19.svg)
![svg](output_10_20.svg)
![svg](output_10_21.svg)
![svg](output_10_22.svg)
![svg](output_10_23.svg)
![svg](output_10_24.svg)
![svg](output_10_25.svg)
![svg](output_10_26.svg)
![svg](output_10_27.svg)
![svg](output_10_28.svg)
![svg](output_10_29.svg)
![svg](output_10_30.svg)
![svg](output_10_31.svg)
![svg](output_10_32.svg)
![svg](output_10_33.svg)
![svg](output_10_34.svg)
![svg](output_10_35.svg)
![svg](output_10_36.svg)
![svg](output_10_37.svg)
- 从上图可以看出,V5、V9、V11、V17、V22 这几个特征在训练集和测试集上的数据分布不一致,所以我们删除这些特征。
# Remove, in place, the features whose train and test distributions differ.
shifted_features = ['V5', 'V9', 'V11', 'V17', 'V22']
data_all.drop(columns=shifted_features, inplace=True)
查看特征间的相关性
spearman方法
- 不用归一化,先查看相关性
# Spearman rank-correlation heatmap over the training rows (features + target).
relation_train = data_all[data_all['oringin'] == 'train'].drop("oringin", axis=1)
plt.figure(figsize=(20, 16))
colnm = relation_train.columns.tolist()
# Matrix of pairwise Spearman correlation coefficients.
mcorr = relation_train[colnm].corr(method='spearman')
# Boolean mask with the same shape as mcorr. The builtin `bool` is used because
# the `np.bool` alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
mask = np.zeros_like(mcorr, dtype=bool)
# Mask the upper triangle (diagonal included) so each pair is shown only once.
mask[np.triu_indices_from(mask)] = True
# Diverging matplotlib colormap for the heatmap.
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Annotated heatmap of the remaining lower triangle.
g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')
plt.show()
![svg](output_15_0.svg)
# Dimensionality reduction: collect the features whose absolute correlation
# with the target is below the threshold (they are only listed here, not yet
# dropped).
# NOTE(review): DataFrame.corr() is called without `method`, so this uses the
# pandas default (Pearson) although this section is headed "spearman" —
# confirm which coefficient is intended.
threshold = 0.1
corr_matrix = relation_train.corr().abs()
weak_target_corr = corr_matrix["target"] < threshold
drop_col = corr_matrix.loc[weak_target_corr].index
drop_col
Index(['V14', 'V21', 'V25', 'V26', 'V32', 'V33', 'V34'], dtype='object')
皮尔森方法
- 先进行归一化,再查看相关性
# Fresh copy of the training rows without the origin marker column.
train_rows = data_all['oringin'] == 'train'
relation_train_1 = data_all.loc[train_rows].drop(columns="oringin")
cols_numeric = relation_train_1.columns.tolist()
def scale_minmax(col):
    """Rescale *col* linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        col: A pandas Series or NumPy array of numeric values.

    Returns:
        An object of the same kind as *col* holding ``(col - min) / (max - min)``.
        When every value is identical (zero range) the result is all zeros
        rather than the NaN that a 0/0 division would produce.
    """
    low = col.min()
    span = col.max() - low
    # Guard only the scalar zero-range case; non-scalar ranges (e.g. a whole
    # DataFrame passed in) keep the plain element-wise division.
    if np.ndim(span) == 0 and span == 0:
        return col - low
    return (col - low) / span
# Min-max scale every feature column of the training copy (the target is left
# untouched), then summarise the scaled columns.
scale_cols = [name for name in cols_numeric if name != 'target']
scaled_features = relation_train_1[scale_cols].apply(scale_minmax)
relation_train_1[scale_cols] = scaled_features
relation_train_1[scale_cols].describe()
V0 | V1 | V2 | V3 | V4 | V6 | V7 | V8 | V10 | V12 | ... | V28 | V29 | V30 | V31 | V32 | V33 | V34 | V35 | V36 | V37 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | ... | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 | 2888.000000 |
mean | 0.690528 | 0.735521 | 0.593745 | 0.606301 | 0.639876 | 0.735418 | 0.741337 | 0.702012 | 0.353159 | 0.663280 | ... | 0.362048 | 0.401715 | 0.634030 | 0.760517 | 0.631794 | 0.459255 | 0.484465 | 0.734850 | 0.336306 | 0.527854 |
std | 0.143747 | 0.133738 | 0.145844 | 0.151302 | 0.119550 | 0.141872 | 0.137111 | 0.129082 | 0.130601 | 0.114305 | ... | 0.130861 | 0.141644 | 0.125338 | 0.110903 | 0.139979 | 0.099782 | 0.101353 | 0.122917 | 0.123733 | 0.153423 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.625465 | 0.695419 | 0.497279 | 0.515165 | 0.586328 | 0.659249 | 0.682314 | 0.653453 | 0.291813 | 0.606750 | ... | 0.278638 | 0.300053 | 0.586993 | 0.722656 | 0.565408 | 0.409037 | 0.454490 | 0.684936 | 0.279760 | 0.427112 |
50% | 0.727076 | 0.766264 | 0.609155 | 0.609933 | 0.652940 | 0.767115 | 0.774045 | 0.728557 | 0.369706 | 0.676042 | ... | 0.279764 | 0.385611 | 0.633755 | 0.782330 | 0.634615 | 0.454518 | 0.499949 | 0.755580 | 0.349860 | 0.519532 |
75% | 0.783922 | 0.812642 | 0.694342 | 0.714174 | 0.712185 | 0.835613 | 0.836958 | 0.781029 | 0.432054 | 0.739069 | ... | 0.445398 | 0.488154 | 0.694136 | 0.824949 | 0.714950 | 0.504261 | 0.511365 | 0.785260 | 0.414511 | 0.622210 |
max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 33 columns
# Pearson correlation heatmap over the min-max scaled training rows.
plt.figure(figsize=(20, 16))
colnm = relation_train_1.columns.tolist()
# Matrix of pairwise Pearson correlation coefficients.
mcorr = relation_train_1[colnm].corr(method='pearson')
# Boolean mask with the same shape as mcorr. The builtin `bool` is used because
# the `np.bool` alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
mask = np.zeros_like(mcorr, dtype=bool)
# Mask the upper triangle (diagonal included) so each pair is shown only once.
mask[np.triu_indices_from(mask)] = True
# Diverging matplotlib colormap for the heatmap.
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Annotated heatmap of the remaining lower triangle.
g = sns.heatmap(mcorr, mask=mask, cmap=cmap, square=True, annot=True, fmt='0.2f')
plt.show()
![svg](output_19_0.svg)
# Collect the features whose absolute Pearson correlation with the target
# (computed on the scaled training rows) is below the threshold.
threshold = 0.1
corr_matrix = relation_train_1.corr().abs()
weak_target_corr = corr_matrix["target"] < threshold
drop_col = corr_matrix.loc[weak_target_corr].index
drop_col
Index(['V14', 'V21', 'V25', 'V26', 'V32', 'V33', 'V34'], dtype='object')
# Remove the weakly correlated features from the combined frame, in place.
data_all.drop(columns=drop_col, inplace=True)
归一化
# Numeric columns of the combined frame: everything except the origin marker.
cols_numeric = [name for name in data_all.columns if name != "oringin"]
def scale_minmax(col):
    """Rescale *col* linearly so its minimum maps to 0 and its maximum to 1.

    Args:
        col: A pandas Series or NumPy array of numeric values.

    Returns:
        An object of the same kind as *col* holding ``(col - min) / (max - min)``.
        When every value is identical (zero range) the result is all zeros
        rather than the NaN that a 0/0 division would produce.
    """
    low = col.min()
    span = col.max() - low
    # Guard only the scalar zero-range case; non-scalar ranges (e.g. a whole
    # DataFrame passed in) keep the plain element-wise division.
    if np.ndim(span) == 0 and span == 0:
        return col - low
    return (col - low) / span
# Min-max scale every feature column of the combined train+test frame (the
# target is left untouched), then summarise the scaled columns.
scale_cols = [name for name in cols_numeric if name != 'target']
scaled_features = data_all[scale_cols].apply(scale_minmax)
data_all[scale_cols] = scaled_features
data_all[scale_cols].describe()
V0 | V1 | V2 | V3 | V4 | V6 | V7 | V8 | V10 | V12 | ... | V23 | V24 | V27 | V28 | V29 | V30 | V31 | V35 | V36 | V37 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 4813.000000 | 4813.000000 | 4813.000000 | 4813.000000 | 4813.000000 | 4813.000000 | 4813.000000 | 4813.000000 | 4813.000000 | 4813.000000 | ... | 4813.000000 | 4813.000000 | 4813.000000 | 4813.000000 | 4813.000000 | 4813.000000 | 4813.000000 | 4813.000000 | 4813.000000 | 4813.000000 |
mean | 0.694172 | 0.721357 | 0.602300 | 0.603139 | 0.523743 | 0.748823 | 0.745740 | 0.715607 | 0.348518 | 0.578507 | ... | 0.744438 | 0.356712 | 0.881401 | 0.342653 | 0.388683 | 0.589459 | 0.792709 | 0.762873 | 0.332385 | 0.545795 |
std | 0.144198 | 0.131443 | 0.140628 | 0.152462 | 0.106430 | 0.132560 | 0.132577 | 0.118105 | 0.134882 | 0.105088 | ... | 0.134085 | 0.265512 | 0.128221 | 0.140731 | 0.133475 | 0.130786 | 0.102976 | 0.102037 | 0.127456 | 0.150356 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
25% | 0.626676 | 0.679416 | 0.514414 | 0.503888 | 0.478182 | 0.683324 | 0.696938 | 0.664934 | 0.284327 | 0.532892 | ... | 0.719362 | 0.040616 | 0.888575 | 0.278778 | 0.292445 | 0.550092 | 0.761816 | 0.727273 | 0.270584 | 0.445647 |
50% | 0.729488 | 0.752497 | 0.617072 | 0.614270 | 0.535866 | 0.774125 | 0.771974 | 0.742884 | 0.366469 | 0.591635 | ... | 0.788817 | 0.381736 | 0.916015 | 0.279904 | 0.375734 | 0.594428 | 0.815055 | 0.800020 | 0.347056 | 0.539317 |
75% | 0.790195 | 0.799553 | 0.700464 | 0.710474 | 0.585036 | 0.842259 | 0.836405 | 0.790835 | 0.432965 | 0.641971 | ... | 0.792706 | 0.574728 | 0.932555 | 0.413031 | 0.471837 | 0.650798 | 0.852229 | 0.800020 | 0.414861 | 0.643061 |
max | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
8 rows × 26 columns
特征工程
- 绘图显示Box-Cox变换对数据分布影响,Box-Cox用于连续的响应变量不满足正态分布的情况。在进行Box-Cox变换之后,可以一定程度上减小不可观测的误差和预测变量的相关性。
# Box-Cox diagnostics: each feature gets one row of six panels — distribution
# with a normal fit, probability plot, and scatter against the target — first
# for the current values, then for the Box-Cox transformed values.
fcols = 6
frows = len(cols_numeric) - 1
plt.figure(figsize=(4 * fcols, 4 * frows))
i = 0
for var in (name for name in cols_numeric if name != 'target'):
    # Keep only rows that have both the feature and a target value.
    dat = data_all[[var, 'target']].dropna()
    y_obs = dat['target']
    # boxcox needs strictly positive input; the +1 shift presumably ensures
    # that for the min-max scaled features.
    trans_var, lambda_var = stats.boxcox(dat[var].dropna() + 1)
    trans_var = scale_minmax(trans_var)

    panels = ((var + 'Original', dat[var]), (var + ' Tramsformed', trans_var))
    for heading, values in panels:
        # Panel 1: histogram/KDE with a fitted normal curve.
        i += 1
        plt.subplot(frows, fcols, i)
        sns.distplot(values, fit=stats.norm)
        plt.title(heading)
        plt.xlabel('')

        # Panel 2: normal probability plot, titled with the sample skewness.
        i += 1
        plt.subplot(frows, fcols, i)
        _ = stats.probplot(values, plot=plt)
        plt.title('skew=' + '{:.4f}'.format(stats.skew(values)))
        plt.xlabel('')
        plt.ylabel('')

        # Panel 3: scatter against the target, titled with the correlation.
        i += 1
        plt.subplot(frows, fcols, i)
        plt.plot(values, y_obs, '.', alpha=0.5)
        plt.title('corr=' + '{:.2f}'.format(np.corrcoef(values, y_obs)[0][1]))
# Box-Cox diagnostics: each feature gets one row of six panels — distribution
# with a normal fit, probability plot, and scatter against the target — first
# for the current values, then for the Box-Cox transformed values.
# NOTE(review): this cell appears to repeat the preceding plotting cell; the
# only visible difference is the space in the ' Original' title.
fcols = 6
frows = len(cols_numeric) - 1
plt.figure(figsize=(4 * fcols, 4 * frows))
i = 0
for var in (name for name in cols_numeric if name != 'target'):
    # Keep only rows that have both the feature and a target value.
    dat = data_all[[var, 'target']].dropna()
    y_obs = dat['target']
    # boxcox needs strictly positive input; the +1 shift presumably ensures
    # that for the min-max scaled features.
    trans_var, lambda_var = stats.boxcox(dat[var].dropna() + 1)
    trans_var = scale_minmax(trans_var)

    panels = ((var + ' Original', dat[var]), (var + ' Tramsformed', trans_var))
    for heading, values in panels:
        # Panel 1: histogram/KDE with a fitted normal curve.
        i += 1
        plt.subplot(frows, fcols, i)
        sns.distplot(values, fit=stats.norm)
        plt.title(heading)
        plt.xlabel('')

        # Panel 2: normal probability plot, titled with the sample skewness.
        i += 1
        plt.subplot(frows, fcols, i)
        _ = stats.probplot(values, plot=plt)
        plt.title('skew=' + '{:.4f}'.format(stats.skew(values)))
        plt.xlabel('')
        plt.ylabel('')

        # Panel 3: scatter against the target, titled with the correlation.
        i += 1
        plt.subplot(frows, fcols, i)
        plt.plot(values, y_obs, '.', alpha=0.5)
        plt.title('corr=' + '{:.2f}'.format(np.corrcoef(values, y_obs)[0][1]))