集成学习案例一(幸福感预测)
import os
import time
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC,LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron,SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error,mean_absolute_error, f1_score
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.ensemble import ExtraTreesRegressor as etr
from sklearn.ensemble import GradientBoostingRegressor as gbr
from sklearn.linear_model import BayesianRidge as br
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression as lr
from sklearn.linear_model import ElasticNet as en
from sklearn.kernel_ridge import KernelRidge as kr
from sklearn.model_selection import KFold,StratifiedKFold,GroupKFold,RepeatedStratifiedKFold,train_test_split,GridSearchCV
from sklearn import preprocessing
import logging
import warnings
warnings.filterwarnings('ignore') #消除warning
# Load the raw GBK-encoded survey files; survey_time is parsed as datetime on load.
_DATA_DIR = r'C:\Users\LiXiang\OneDrive\文档\WeChat Files\lx12633036\FileStorage\File\2021-05\CH6-集成学习之案例分享\集成学习案例分析1'
train = pd.read_csv(_DATA_DIR + '\\train.csv', parse_dates=['survey_time'], encoding='gbk')
test = pd.read_csv(_DATA_DIR + '\\test.csv', parse_dates=['survey_time'], encoding='gbk')
train
id | happiness | survey_type | province | city | county | survey_time | gender | birth | nationality | ... | neighbor_familiarity | public_service_1 | public_service_2 | public_service_3 | public_service_4 | public_service_5 | public_service_6 | public_service_7 | public_service_8 | public_service_9 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 4 | 1 | 12 | 32 | 59 | 2015-08-04 14:18:00 | 1 | 1959 | 1 | ... | 4 | 50 | 60 | 50 | 50 | 30.0 | 30 | 50 | 50 | 50 |
1 | 2 | 4 | 2 | 18 | 52 | 85 | 2015-07-21 15:04:00 | 1 | 1992 | 1 | ... | 3 | 90 | 70 | 70 | 80 | 85.0 | 70 | 90 | 60 | 60 |
2 | 3 | 4 | 2 | 29 | 83 | 126 | 2015-07-21 13:24:00 | 2 | 1967 | 1 | ... | 4 | 90 | 80 | 75 | 79 | 80.0 | 90 | 90 | 90 | 75 |
3 | 4 | 5 | 2 | 10 | 28 | 51 | 2015-07-25 17:33:00 | 2 | 1943 | 1 | ... | 3 | 100 | 90 | 70 | 80 | 80.0 | 90 | 90 | 80 | 80 |
4 | 5 | 4 | 1 | 7 | 18 | 36 | 2015-08-10 09:50:00 | 2 | 1994 | 1 | ... | 2 | 50 | 50 | 50 | 50 | 50.0 | 50 | 50 | 50 | 50 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
7995 | 7996 | 2 | 2 | 29 | 82 | 124 | 2015-07-21 19:36:00 | 1 | 1981 | 1 | ... | 3 | 40 | 50 | 50 | 50 | 40.0 | 50 | 50 | 60 | 50 |
7996 | 7997 | 3 | 1 | 12 | 32 | 61 | 2015-07-31 16:00:00 | 2 | 1945 | 1 | ... | 4 | 80 | 80 | 80 | 80 | 80.0 | 60 | 60 | 80 | 80 |
7997 | 7998 | 4 | 1 | 16 | 46 | 78 | 2015-08-01 17:48:00 | 2 | 1967 | 1 | ... | 4 | 75 | 70 | 70 | 80 | 80.0 | 70 | 75 | 70 | 75 |
7998 | 7999 | 3 | 1 | 1 | 1 | 8 | 2015-09-22 18:52:00 | 2 | 1978 | 1 | ... | 2 | 56 | 67 | 70 | 69 | 78.0 | 60 | 70 | 80 | 70 |
7999 | 8000 | 4 | 1 | 1 | 1 | 3 | 2015-09-28 20:22:00 | 2 | 1991 | 1 | ... | 3 | 80 | 80 | 80 | 80 | 80.0 | 80 | 80 | 80 | 80 |
8000 rows × 140 columns
# Target distribution; -8 encodes an invalid/refused answer and is dropped below.
train["happiness"].value_counts()
# remove outlier rows (happiness == -8)
4 4818
5 1410
3 1159
2 497
1 104
-8 12
Name: happiness, dtype: int64
# Drop rows with the invalid target code (-8), split off the target column,
# and stack train + test so all preprocessing is applied jointly.
target_col = "happiness"
valid_mask = train[target_col] != -8
train = train[valid_mask].reset_index(drop=True)
train_data_copy = train.copy()
target = train_data_copy.pop(target_col)  # removes the column and keeps it as the label
data = pd.concat([train_data_copy, test], axis=0, ignore_index=True)
数据预处理
# List every column name, one per line.
print('\n'.join(data.columns))
id
survey_type
province
city
county
survey_time
gender
birth
nationality
religion
religion_freq
edu
edu_other
edu_status
edu_yr
income
political
join_party
floor_area
property_0
property_1
property_2
property_3
property_4
property_5
property_6
property_7
property_8
property_other
height_cm
weight_jin
health
health_problem
depression
hukou
hukou_loc
media_1
media_2
media_3
media_4
media_5
media_6
leisure_1
leisure_2
leisure_3
leisure_4
leisure_5
leisure_6
leisure_7
leisure_8
leisure_9
leisure_10
leisure_11
leisure_12
socialize
relax
learn
social_neighbor
social_friend
socia_outing
equity
class
class_10_before
class_10_after
class_14
work_exper
work_status
work_yr
work_type
work_manage
insur_1
insur_2
insur_3
insur_4
family_income
family_m
family_status
house
car
invest_0
invest_1
invest_2
invest_3
invest_4
invest_5
invest_6
invest_7
invest_8
invest_other
son
daughter
minor_child
marital
marital_1st
s_birth
marital_now
s_edu
s_political
s_hukou
s_income
s_work_exper
s_work_status
s_work_type
f_birth
f_edu
f_political
f_work_14
m_birth
m_edu
m_political
m_work_14
status_peer
status_3_before
view
inc_ability
inc_exp
trust_1
trust_2
trust_3
trust_4
trust_5
trust_6
trust_7
trust_8
trust_9
trust_10
trust_11
trust_12
trust_13
neighbor_familiarity
public_service_1
public_service_2
public_service_3
public_service_4
public_service_5
public_service_6
public_service_7
public_service_8
public_service_9
# Only the three free-text columns are object dtype; they are dropped later.
data.dtypes[data.dtypes=='object']
edu_other object
property_other object
invest_other object
dtype: object
# Sanity check: 139 columns at this point (the target has been removed).
data.columns
Index(['id', 'survey_type', 'province', 'city', 'county', 'survey_time',
'gender', 'birth', 'nationality', 'religion',
...
'neighbor_familiarity', 'public_service_1', 'public_service_2',
'public_service_3', 'public_service_4', 'public_service_5',
'public_service_6', 'public_service_7', 'public_service_8',
'public_service_9'],
dtype='object', length=139)
对年龄进行分组
首先是将“连续”的年龄,进行分层处理,即划分年龄段,具体地在这里我们将年龄分为了6个区间。其次是计算具体的年龄,在Excel表格中,只有出生年月以及调查时间等信息,我们根据此计算出每一位调查者的真实年龄。
# Reduce the survey timestamp to its year, derive respondent age, and bucket
# ages into six ordered bins labelled 0-5.
data['survey_time'] = pd.to_datetime(data['survey_time'], format='%Y-%m-%d',
                                     errors='coerce')  # coerce guards against malformed dates
data['survey_time'] = data['survey_time'].dt.year  # only the year is needed for the age
data['age'] = data['survey_time'] - data['birth']
age_edges = [0, 17, 26, 34, 50, 63, 100]
data['age_bin'] = pd.cut(data['age'], age_edges, labels=[0, 1, 2, 3, 4, 5])
处理异常值
- 对于含有负数的值进行处理
- 对缺失值进行处理
处理负数
- 想法是将所有负值替换为特征的均值或者中位数
def getres1(row):
    """Count the negative Python-int entries in a DataFrame row.

    The exact-type check deliberately ignores floats, strings and numpy
    scalars, matching the original exploratory intent.
    """
    count = 0
    for value in row.values:
        if type(value) == int and value < 0:
            count += 1
    return count
# Distribution of negative answers per row.
data.apply(getres1, axis=1)
0 5
1 0
2 3
3 2
4 2
..
10951 0
10952 2
10953 8
10954 4
10955 11
Length: 10956, dtype: int64
# Frequency table of the per-row negative-answer counts.
data.apply(getres1, axis=1).value_counts()
0 2006
2 1868
3 1397
1 1350
4 1055
5 913
6 676
7 457
8 359
9 226
10 176
11 100
12 68
13 66
14 60
15 45
16 24
18 23
17 21
20 13
19 12
21 12
22 5
23 5
24 5
27 4
28 2
25 2
29 1
26 1
30 1
33 1
48 1
31 1
dtype: int64
# Count negative entries per column.  Positions 5, 12, 28, 88, 89 are skipped
# (survey_time plus the free-text *_other columns, where "< 0" is meaningless
# -- TODO confirm the index mapping against data.columns).
# The unused accumulator list from the original has been removed.
skip_idx = {5, 12, 28, 88, 89}
for i in range(139):
    if i not in skip_idx:
        print(data.columns[i],' ',(data.iloc[:,i]<0).sum())
id 0
survey_type 0
province 0
city 0
county 0
gender 0
birth 0
nationality 20
religion 142
religion_freq 22
edu 17
edu_status 36
edu_yr 1673
income 598
political 45
join_party 120
floor_area 0
property_0 0
property_1 0
property_2 0
property_3 0
property_4 0
property_5 0
property_6 0
property_7 0
property_8 0
height_cm 0
weight_jin 0
health 5
health_problem 50
depression 24
hukou 0
hukou_loc 0
media_1 1
media_2 3
media_3 3
media_4 3
media_5 16
media_6 19
leisure_1 5
leisure_2 19
leisure_3 19
leisure_4 14
leisure_5 90
leisure_6 19
leisure_7 42
leisure_8 22
leisure_9 25
leisure_10 45
leisure_11 46
leisure_12 35
socialize 4
relax 20
learn 26
social_neighbor 14
social_friend 58
socia_outing 67
equity 61
class 101
class_10_before 143
class_10_after 632
class_14 186
work_exper 0
work_status 34
work_yr 207
work_type 91
work_manage 85
insur_1 50
insur_2 160
insur_3 217
insur_4 245
family_income 922
family_m 32
family_status 60
house 171
car 11
invest_0 0
invest_1 0
invest_2 0
invest_3 0
invest_4 0
invest_5 0
invest_6 0
invest_7 0
invest_8 0
daughter 15
minor_child 11
marital 0
marital_1st 766
s_birth 0
marital_now 486
s_edu 42
s_political 22
s_hukou 21
s_income 688
s_work_exper 0
s_work_status 38
s_work_type 56
f_birth 4654
f_edu 831
f_political 280
f_work_14 415
m_birth 4438
m_edu 680
m_political 210
m_work_14 402
status_peer 62
status_3_before 55
view 300
inc_ability 1319
inc_exp 1494
trust_1 107
trust_2 311
trust_3 1393
trust_4 1407
trust_5 61
trust_6 1559
trust_7 307
trust_8 1378
trust_9 644
trust_10 2447
trust_11 5450
trust_12 3377
trust_13 211
neighbor_familiarity 11
public_service_1 316
public_service_2 230
public_service_3 586
public_service_4 442
public_service_5 508
public_service_6 319
public_service_7 403
public_service_8 583
public_service_9 402
# --- Replace negative answer codes (refused / don't know) with defaults ---
# nationality: negatives -> 1 (Han).  BUG FIX: the original used chained
# indexing (data['col'][mask] = v), which pandas flags with
# SettingWithCopyWarning and may silently fail to write; .loc is the
# guaranteed in-place form.
data.loc[data['nationality'] < 0, 'nationality'] = 1
# religion: 1 = not religious (original set this twice; once is enough)
data.loc[data['religion'] < 0, 'religion'] = 1
data.loc[data['religion_freq'] < 0, 'religion_freq'] = 1  # 1 = never attends
# education: default to 4 (junior high); unknown status/year -> 0
data.loc[data['edu'] < 0, 'edu'] = 4
data.loc[data['edu_status'] < 0, 'edu_status'] = 0
data.loc[data['edu_yr'] < 0, 'edu_yr'] = 0
# personal income: negatives treated as no income
data.loc[data['income'] < 0, 'income'] = 0
# political affiliation: default 1 (ordinary citizen)
data.loc[data['political'] < 0, 'political'] = 1
# weight is in jin (0.5 kg); implausibly low adult weights are doubled
data.loc[(data['weight_jin'] <= 80) & (data['height_cm'] >= 160), 'weight_jin'] = data['weight_jin'] * 2
data.loc[data['weight_jin'] <= 60, 'weight_jin'] = data['weight_jin'] * 2
# clamp adult height to at least 150 cm
data.loc[data['height_cm'] < 150, 'height_cm'] = 150
# health / health problems: default 4 (fairly healthy)
data.loc[data['health'] < 0, 'health'] = 4
data.loc[data['health_problem'] < 0, 'health_problem'] = 4
# depression: default 4
data.loc[data['depression'] < 0, 'depression'] = 4
# media_1..media_6: default 1 (never)
for i in range(1, 7):
    col = 'media_' + str(i)
    data.loc[data[col] < 0, col] = 1
# leisure activities: three hand-picked defaults ...
data.loc[data['leisure_1'] < 0, 'leisure_1'] = 1
data.loc[data['leisure_2'] < 0, 'leisure_2'] = 5
data.loc[data['leisure_3'] < 0, 'leisure_3'] = 3
# ... and the rest take the column mode.  BUG FIX: the original assigned the
# Series returned by .mode(); .loc aligns that Series on index, so nearly
# every selected row received NaN instead of the mode.  .mode()[0] assigns
# the scalar most-frequent value as intended.
for i in range(4, 13):
    col = 'leisure_' + str(i)
    data.loc[data[col] < 0, col] = data[col].mode()[0]
data.loc[data['socialize'] < 0, 'socialize'] = 2  # rarely
data.loc[data['relax'] < 0, 'relax'] = 4          # often
data.loc[data['learn'] < 0, 'learn'] = 1          # never
# social contact: 0 = none
data.loc[data['social_neighbor'] < 0, 'social_neighbor'] = 0
data.loc[data['social_friend'] < 0, 'social_friend'] = 0
data.loc[data['socia_outing'] < 0, 'socia_outing'] = 1
# BUG FIX: the original wrote 4 into 'social_neighbor' here, leaving negative
# neighbor_familiarity values untouched.
data.loc[data['neighbor_familiarity'] < 0, 'neighbor_familiarity'] = 4
# perceived social fairness
data.loc[data['equity'] < 0, 'equity'] = 4
# social class (10 years ago / now / in 10 years / at age 14)
data.loc[data['class_10_before'] < 0, 'class_10_before'] = 3
data.loc[data['class'] < 0, 'class'] = 5
data.loc[data['class_10_after'] < 0, 'class_10_after'] = 5
data.loc[data['class_14'] < 0, 'class_14'] = 2
# work: 0 = not applicable
data.loc[data['work_status'] < 0, 'work_status'] = 0
data.loc[data['work_yr'] < 0, 'work_yr'] = 0
data.loc[data['work_manage'] < 0, 'work_manage'] = 0
data.loc[data['work_type'] < 0, 'work_type'] = 0
# social insurance: negatives -> 1 (the original's "== 0 -> 0" lines were
# no-ops and have been dropped)
for i in range(1, 5):
    col = 'insur_' + str(i)
    data.loc[data[col] < 0, col] = 1
处理缺失值
# Columns that still contain NaN after the negative-code fixes above.
list_null = data.columns[data.isnull().any()].tolist()
list_null
['edu_other',
'edu_status',
'edu_yr',
'join_party',
'property_other',
'hukou_loc',
'social_neighbor',
'social_friend',
'work_status',
'work_yr',
'work_type',
'work_manage',
'family_income',
'invest_other',
'minor_child',
'marital_1st',
's_birth',
'marital_now',
's_edu',
's_political',
's_hukou',
's_income',
's_work_exper',
's_work_status',
's_work_type']
# --- Family / marriage / spouse / parents: default-fill negative codes ---
# family_income: negatives replaced with the column mean.
# NOTE(review): the mean is computed before the negatives are replaced, so it
# is slightly biased by them -- kept to preserve the original pipeline.
family_income_mean = data['family_income'].mean()
data.loc[data['family_income'] < 0, 'family_income'] = family_income_mean
data.loc[data['family_m'] < 0, 'family_m'] = 2
data.loc[data['family_status'] < 0, 'family_status'] = 3
data.loc[data['house'] < 0, 'house'] = 1
data.loc[data['car'] < 0, 'car'] = 0
data.loc[data['car'] == 2, 'car'] = 0  # recode {1, 2} -> {1, 0}
data.loc[data['son'] < 0, 'son'] = 1
data.loc[data['daughter'] < 0, 'daughter'] = 0
data.loc[data['minor_child'] < 0, 'minor_child'] = 0
# marriage years: 0 = never married / unknown
data.loc[data['marital_1st'] < 0, 'marital_1st'] = 0
data.loc[data['marital_now'] < 0, 'marital_now'] = 0
# spouse attributes: 0 = not applicable / unknown
for col in ('s_birth', 's_edu', 's_political', 's_hukou', 's_income',
            's_work_type', 's_work_status', 's_work_exper'):
    data.loc[data[col] < 0, col] = 0
# parents: plausible default birth years and lowest education / ordinary status
data.loc[data['f_birth'] < 0, 'f_birth'] = 1945
data.loc[data['f_edu'] < 0, 'f_edu'] = 1
data.loc[data['f_political'] < 0, 'f_political'] = 1
data.loc[data['f_work_14'] < 0, 'f_work_14'] = 2
data.loc[data['m_birth'] < 0, 'm_birth'] = 1940
data.loc[data['m_edu'] < 0, 'm_edu'] = 1
data.loc[data['m_political'] < 0, 'm_political'] = 1
data.loc[data['m_work_14'] < 0, 'm_work_14'] = 2
# socio-economic status vs. peers / vs. three years ago
data.loc[data['status_peer'] < 0, 'status_peer'] = 2
data.loc[data['status_3_before'] < 0, 'status_3_before'] = 2
# outlook on society
data.loc[data['view'] < 0, 'view'] = 4
# income ability / expected income: nonpositive -> default / column mean
data.loc[data['inc_ability'] <= 0, 'inc_ability'] = 2
inc_exp_mean = data['inc_exp'].mean()  # NOTE(review): includes nonpositive values, as in the original
data.loc[data['inc_exp'] <= 0, 'inc_exp'] = inc_exp_mean
# public_service_* / trust_*: fill negatives with the column mode.
# BUG FIX: int(series.mode().values) fails when a column is multimodal
# (multi-element array -> int is an error); .mode()[0] always takes the
# first (smallest) mode.
for i in range(1, 9 + 1):
    col = 'public_service_' + str(i)
    data.loc[data[col] < 0, col] = int(data[col].dropna().mode()[0])
for i in range(1, 13 + 1):
    col = 'trust_' + str(i)
    data.loc[data[col] < 0, col] = int(data[col].dropna().mode()[0])
- 因为家庭的收入是连续值,使用均值进行缺失值的补全。针对其他特征需要使用日常生活中的真实情况,例如“宗教信息”特征为负数的认为是“不信仰宗教”,并认为“参加宗教活动的频率”为1,即没有参加过宗教活动,主观的进行补全
特征工程
数据增广
- 这一步,我们需要进一步分析每一个特征之间的关系,从而进行数据增广。经过思考,这里我添加了如下的特征:第一次结婚年龄、最近结婚年龄、是否再婚、配偶年龄、配偶年龄差、各种收入比(与配偶之间的收入比、十年后预期收入与现在收入之比等等)、收入与住房面积比(其中也包括10年后期望收入等等各种情况)、社会阶级(10年后的社会阶级、14年后的社会阶级等等)、悠闲指数、满意指数、信任指数等等。除此之外,我还考虑了对于同一省、市、县进行了归一化。例如同一省市内的收入的平均值等以及一个个体相对于同省、市、县其他人的各个指标的情况。同时也考虑了对于同龄人之间的相互比较,即在同龄人中的收入情况、健康情况等等。
# Sanity check: survey_time is now a bare year and age / age_bin are present.
data.head()
id | survey_type | province | city | county | survey_time | gender | birth | nationality | religion | ... | public_service_2 | public_service_3 | public_service_4 | public_service_5 | public_service_6 | public_service_7 | public_service_8 | public_service_9 | age | age_bin | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 1 | 12 | 32 | 59 | 2015 | 1 | 1959 | 1 | 1 | ... | 60.0 | 50 | 50 | 30.0 | 30 | 50 | 50 | 50 | 56 | 4 |
1 | 2 | 2 | 18 | 52 | 85 | 2015 | 1 | 1992 | 1 | 1 | ... | 70.0 | 70 | 80 | 85.0 | 70 | 90 | 60 | 60 | 23 | 1 |
2 | 3 | 2 | 29 | 83 | 126 | 2015 | 2 | 1967 | 1 | 0 | ... | 80.0 | 75 | 79 | 80.0 | 90 | 90 | 90 | 75 | 48 | 3 |
3 | 4 | 2 | 10 | 28 | 51 | 2015 | 2 | 1943 | 1 | 1 | ... | 90.0 | 70 | 80 | 80.0 | 90 | 90 | 80 | 80 | 72 | 5 |
4 | 5 | 1 | 7 | 18 | 36 | 2015 | 2 | 1994 | 1 | 1 | ... | 50.0 | 50 | 50 | 50.0 | 50 | 50 | 50 | 50 | 21 | 1 |
5 rows × 141 columns
# Age at first marriage (feature 147)
data['marital_1stbir'] = data['marital_1st'] - data['birth']
# Age at current marriage (148)
data['marital_nowtbir'] = data['marital_now'] - data['birth']
# Remarriage indicator (149): nonzero iff the two marriage ages differ
data['mar'] = data['marital_nowtbir'] - data['marital_1stbir']
# Spouse age (150)
data['marital_sbir'] = data['marital_now']-data['s_birth']
# Spouse age gap (151)
data['age_'] = data['marital_nowtbir'] - data['marital_sbir']
# Income ratios, 151+7=158 (the +1 / +0.01 terms guard against division by zero)
data['income/s_income'] = data['income']/(data['s_income']+1) # vs. cohabiting partner
data['income+s_income'] = data['income']+(data['s_income']+1)
data['income/family_income'] = data['income']/(data['family_income']+1)
data['all_income/family_income'] = (data['income']+data['s_income'])/(data['family_income']+1)
data['income/inc_exp'] = data['income']/(data['inc_exp']+1)
data['family_income/m'] = data['family_income']/(data['family_m']+0.01)
data['income/m'] = data['income']/(data['family_m']+0.01)
# Income / floor-area ratios, 158+4=162
data['income/floor_area'] = data['income']/(data['floor_area']+0.01)
data['all_income/floor_area'] = (data['income']+data['s_income'])/(data['floor_area']+0.01)
data['family_income/floor_area'] = data['family_income']/(data['floor_area']+0.01)
data['floor_area/m'] = data['floor_area']/(data['family_m']+0.01)
# Social-class deltas, 162+3=165
data['class_10_diff'] = (data['class_10_after'] - data['class'])
data['class_diff'] = data['class'] - data['class_10_before']
data['class_14_diff'] = data['class'] - data['class_14']
# Leisure index (166): sum of the 12 leisure items (right-skewed)
leisure_fea_lis = ['leisure_'+str(i) for i in range(1,13)]
data['leisure_sum'] = data[leisure_fea_lis].sum(axis=1) # skewed
# Public-service satisfaction index (167)
public_service_fea_lis = ['public_service_'+str(i) for i in range(1,10)]
data['public_service_sum'] = data[public_service_fea_lis].sum(axis=1) # skewed
# Trust index (168)
trust_fea_lis = ['trust_'+str(i) for i in range(1,14)]
data['trust_sum'] = data[trust_fea_lis].sum(axis=1) # skewed
# Group means by administrative level: for province, city and county attach
# the group average of 13 well-being related columns (3 x 13 = 39 features,
# 168+39=207).  Column creation order matches the original: all province
# means, then city, then county.
_mean_cols = ['income', 'family_income', 'equity', 'depression', 'floor_area',
              'health', 'class_10_diff', 'class', 'health_problem',
              'family_status', 'leisure_sum', 'public_service_sum', 'trust_sum']
for _region in ('province', 'city', 'county'):
    for _col in _mean_cols:
        data[_region + '_' + _col + '_mean'] = (
            data.groupby([_region])[_col].transform('mean').values
        )
# Individual-to-group ratios: how each respondent compares with the average
# of their province / city / county (3 x 13 = 39 features, 207+39=246).
_ratio_cols = ['income', 'family_income', 'equity', 'depression', 'floor_area',
               'health', 'class_10_diff', 'class', 'health_problem',
               'family_status', 'leisure_sum', 'public_service_sum', 'trust_sum']
for _region in ('province', 'city', 'county'):
    for _col in _ratio_cols:
        _denom = data[_region + '_' + _col + '_mean']
        if _region == 'province' and _col == 'trust_sum':
            # the original pipeline adds 1 to the denominator for exactly
            # this one combination -- preserved as-is
            _denom = _denom + 1
        data[_col + '/' + _region] = data[_col] / _denom
# Same idea within age cohorts: per-age means of the 13 columns (246+13=259),
# followed by each respondent's ratio to their cohort mean (259+13=272).
_age_cols = ['income', 'family_income', 'equity', 'depression', 'floor_area',
             'health', 'class_10_diff', 'class', 'health_problem',
             'family_status', 'leisure_sum', 'public_service_sum', 'trust_sum']
for _col in _age_cols:
    data['age_' + _col + '_mean'] = (
        data.groupby(['age'])[_col].transform('mean').values
    )
for _col in _age_cols:
    data[_col + '/age'] = data[_col] / data['age_' + _col + '_mean']
- 这一部分创造特征的过程需要自己好好理解!
# (10956, 267): 139 raw columns + age/age_bin + the engineered features above.
data.shape
(10956, 267)
特征筛选
- 删去有效样本数很少的特征,例如负值太多的特征或者是缺失值太多的特征,这里一共删除了包括“目前的最高教育程度”在内的9类特征,得到了最终的258维的特征
# Feature selection: drop identifiers, the timestamp, the free-text columns,
# the mostly-missing join_party, and the raw region codes already captured
# by the group-mean features.  267 columns - 9 dropped = 258 features.
# (The original comment "272-9=263" did not match the actual shape.)
del_list = ['id', 'survey_time', 'edu_other', 'invest_other', 'property_other',
            'join_party', 'province', 'city', 'county']
use_feature = [col for col in data.columns if col not in del_list]
data.fillna(0, inplace=True)  # any remaining NaN becomes 0
train_shape = train.shape[0]  # number of training rows
features = data[use_feature].columns  # surviving feature names
X_train_258 = data[:train_shape][use_feature].values
y_train = target
X_test_258 = data[train_shape:][use_feature].values
X_train_258.shape  # final 258-feature matrix
(7988, 258)
# The 49 most important features (hand-picked), used as a second feature set
# alongside the full 258-dimensional one.
imp_fea_49 = ['equity','depression','health','class','family_status','health_problem','class_10_after',
'equity/province','equity/city','equity/county',
'depression/province','depression/city','depression/county',
'health/province','health/city','health/county',
'class/province','class/city','class/county',
'family_status/province','family_status/city','family_status/county',
'family_income/province','family_income/city','family_income/county',
'floor_area/province','floor_area/city','floor_area/county',
'leisure_sum/province','leisure_sum/city','leisure_sum/county',
'public_service_sum/province','public_service_sum/city','public_service_sum/county',
'trust_sum/province','trust_sum/city','trust_sum/county',
'income/m','public_service_sum','class_diff','status_3_before','age_income_mean','age_floor_area_mean',
'weight_jin','height_cm',
'health/age','depression/age','equity/age','leisure_sum/age'
]
train_shape = train.shape[0]
X_train_49 = data[:train_shape][imp_fea_49].values
X_test_49 = data[train_shape:][imp_fea_49].values
X_train_49.shape # the 49 most important features
(7988, 49)
特征编码
- 对离散(分类)变量进行one-hot编码,再与数值特征合成为第三类特征,共378维(见下方 X_train_378 的形状)
# Categorical columns to one-hot encode; everything else stays numeric.
cat_fea = ['survey_type', 'gender', 'nationality', 'edu_status', 'political',
           'hukou', 'hukou_loc', 'work_exper', 'work_status', 'work_type',
           'work_manage', 'marital', 's_political', 's_hukou', 's_work_exper',
           's_work_status', 's_work_type', 'f_political', 'f_work_14',
           'm_political', 'm_work_14']
noc_fea = [col for col in use_feature if col not in cat_fea]
enc = preprocessing.OneHotEncoder(categories='auto')
onehot_data = data[cat_fea].values
oh_data = enc.fit_transform(onehot_data).toarray()
oh_data.shape
(10956, 141)
# Split the one-hot matrix back into train/test rows ...
X_train_oh = oh_data[:train_shape,:]
X_test_oh = oh_data[train_shape:,:]
X_train_oh.shape # training part
# ... then append it to the numeric block: numeric (noc) columns first,
# one-hot-encoded categoricals second.
X_train_378 = np.column_stack([data[:train_shape][noc_fea].values,X_train_oh])# noc first, then cat_fea
X_test_378 = np.column_stack([data[train_shape:][noc_fea].values,X_test_oh])
X_train_378.shape
(7988, 378)
总结
基于此,构建完成了三种特征工程(训练数据集)
其一是上面提取的最重要的49种特征,其中包括健康程度、社会阶级、在同龄人中的收入情况等等特征
其二是扩充后的258维特征(这里可以认为是初始特征)
其三是使用One-hot编码后的特征,这里要使用One-hot进行编码的原因在于,有部分特征为分离值,例如性别中男女,男为1,女为2,我们想使用One-hot将其变为男为0,女为1,来增强机器学习算法的鲁棒性能;再如民族这个特征,原本是1-56这56个数值,如果直接当作连续数值分类会让分类器的鲁棒性变差,所以使用One-hot编码将其按实际出现的取值展开为若干个非零即一的特征
特征建模
258维特征-LightGBM
# LightGBM regression on the 258-feature set: 5-fold stratified CV with
# early stopping; out-of-fold predictions are kept for stacking later.
lgb_258_param = {
    'num_leaves': 7,
    'min_data_in_leaf': 20,       # minimum records per leaf
    'objective': 'regression',
    'max_depth': -1,
    'learning_rate': 0.003,
    'boosting': 'gbdt',
    'feature_fraction': 0.18,     # 18% of the features sampled per tree
    'bagging_freq': 1,
    'bagging_fraction': 0.55,     # row subsample used each iteration
    'bagging_seed': 14,
    'metric': 'mse',
    'lambda_l1': 0.1,
    'lambda_l2': 0.2,
    'verbosity': -1,
}
# Stratified on the discrete 1-5 target so each fold keeps the class balance.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)
oof_lgb_258 = np.zeros(len(X_train_258))
predictions_lgb_258 = np.zeros(len(X_test_258))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_258, y_train)):
    print("fold n°{}".format(fold_ + 1))
    trn_data = lgb.Dataset(X_train_258[trn_idx], y_train[trn_idx])
    val_data = lgb.Dataset(X_train_258[val_idx], y_train[val_idx])  # train:val = 4:1
    num_round = 10000
    # NOTE(review): verbose_eval / early_stopping_rounds are keyword args only
    # in lightgbm < 4.0; newer versions need callbacks -- confirm the pinned version.
    lgb_258 = lgb.train(lgb_258_param, trn_data, num_round,
                        valid_sets=[trn_data, val_data],
                        verbose_eval=500, early_stopping_rounds=800)
    oof_lgb_258[val_idx] = lgb_258.predict(X_train_258[val_idx],
                                           num_iteration=lgb_258.best_iteration)
    predictions_lgb_258 += lgb_258.predict(X_test_258,
                                           num_iteration=lgb_258.best_iteration) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_lgb_258, target)))
fold n°1
Training until validation scores don't improve for 800 rounds
[500] training's l2: 0.499818 valid_1's l2: 0.533221
[1000] training's l2: 0.451707 valid_1's l2: 0.499844
[1500] training's l2: 0.425898 valid_1's l2: 0.486767
[2000] training's l2: 0.408007 valid_1's l2: 0.480298
[2500] training's l2: 0.393733 valid_1's l2: 0.477141
[3000] training's l2: 0.381463 valid_1's l2: 0.475275
[3500] training's l2: 0.370826 valid_1's l2: 0.474055
[4000] training's l2: 0.361047 valid_1's l2: 0.472887
[4500] training's l2: 0.352115 valid_1's l2: 0.472506
[5000] training's l2: 0.343621 valid_1's l2: 0.471832
[5500] training's l2: 0.335582 valid_1's l2: 0.471131
[6000] training's l2: 0.328122 valid_1's l2: 0.470749
[6500] training's l2: 0.320892 valid_1's l2: 0.47059
[7000] training's l2: 0.313987 valid_1's l2: 0.470634
[7500] training's l2: 0.307389 valid_1's l2: 0.470533
Early stopping, best iteration is:
[6740] training's l2: 0.317573 valid_1's l2: 0.470414
fold n°2
Training until validation scores don't improve for 800 rounds
[500] training's l2: 0.504473 valid_1's l2: 0.514811
[1000] training's l2: 0.455052 valid_1's l2: 0.481234
[1500] training's l2: 0.428983 valid_1's l2: 0.468668
[2000] training's l2: 0.411119 valid_1's l2: 0.461711
[2500] training's l2: 0.397221 valid_1's l2: 0.457882
[3000] training's l2: 0.385201 valid_1's l2: 0.45503
[3500] training's l2: 0.374764 valid_1's l2: 0.453004
[4000] training's l2: 0.365132 valid_1's l2: 0.451299
[4500] training's l2: 0.356309 valid_1's l2: 0.450109
[5000] training's l2: 0.34796 valid_1's l2: 0.449312
[5500] training's l2: 0.339984 valid_1's l2: 0.44876
[6000] training's l2: 0.332445 valid_1's l2: 0.448203
[6500] training's l2: 0.325149 valid_1's l2: 0.447664
[7000] training's l2: 0.318186 valid_1's l2: 0.447208
[7500] training's l2: 0.31158 valid_1's l2: 0.446783
[8000] training's l2: 0.305225 valid_1's l2: 0.446317
[8500] training's l2: 0.299184 valid_1's l2: 0.446151
[9000] training's l2: 0.293248 valid_1's l2: 0.446027
[9500] training's l2: 0.287522 valid_1's l2: 0.445908
Early stopping, best iteration is:
[8848] training's l2: 0.295068 valid_1's l2: 0.445831
fold n°3
Training until validation scores don't improve for 800 rounds
[500] training's l2: 0.503579 valid_1's l2: 0.518038
[1000] training's l2: 0.455417 valid_1's l2: 0.480482
[1500] training's l2: 0.429912 valid_1's l2: 0.463958
[2000] training's l2: 0.412154 valid_1's l2: 0.455182
[2500] training's l2: 0.398137 valid_1's l2: 0.449975
[3000] training's l2: 0.386141 valid_1's l2: 0.446615
[3500] training's l2: 0.375558 valid_1's l2: 0.444606
[4000] training's l2: 0.365787 valid_1's l2: 0.442896
[4500] training's l2: 0.356847 valid_1's l2: 0.44197
[5000] training's l2: 0.348356 valid_1's l2: 0.441244
[5500] training's l2: 0.340203 valid_1's l2: 0.441077
[6000] training's l2: 0.332627 valid_1's l2: 0.440688
[6500] training's l2: 0.325309 valid_1's l2: 0.440257
[7000] training's l2: 0.318346 valid_1's l2: 0.440069
[7500] training's l2: 0.31163 valid_1's l2: 0.440109
[8000] training's l2: 0.305292 valid_1's l2: 0.440163
Early stopping, best iteration is:
[7305] training's l2: 0.314265 valid_1's l2: 0.439831
fold n°4
Training until validation scores don't improve for 800 rounds
[500] training's l2: 0.505109 valid_1's l2: 0.513266
[1000] training's l2: 0.455855 valid_1's l2: 0.477426
[1500] training's l2: 0.429525 valid_1's l2: 0.464624
[2000] training's l2: 0.411286 valid_1's l2: 0.458494
[2500] training's l2: 0.397074 valid_1's l2: 0.455342
[3000] training's l2: 0.385011 valid_1's l2: 0.453251
[3500] training's l2: 0.374451 valid_1's l2: 0.452055
[4000] training's l2: 0.364639 valid_1's l2: 0.450808
[4500] training's l2: 0.355638 valid_1's l2: 0.449982
[5000] training's l2: 0.347158 valid_1's l2: 0.449599
[5500] training's l2: 0.339228 valid_1's l2: 0.448976
[6000] training's l2: 0.331672 valid_1's l2: 0.448705
[6500] training's l2: 0.324403 valid_1's l2: 0.448291
[7000] training's l2: 0.317535 valid_1's l2: 0.44802
[7500] training's l2: 0.310971 valid_1's l2: 0.447916
[8000] training's l2: 0.304546 valid_1's l2: 0.447797
[8500] training's l2: 0.298446 valid_1's l2: 0.448099
Early stopping, best iteration is:
[7936] training's l2: 0.30535 valid_1's l2: 0.447657
fold n°5
Training until validation scores don't improve for 800 rounds
[500] training's l2: 0.502758 valid_1's l2: 0.519301
[1000] training's l2: 0.454472 valid_1's l2: 0.483804
[1500] training's l2: 0.428696 valid_1's l2: 0.470063
[2000] training's l2: 0.41049 valid_1's l2: 0.46336
[2500] training's l2: 0.396181 valid_1's l2: 0.459649
[3000] training's l2: 0.383888 valid_1's l2: 0.457967
[3500] training's l2: 0.373178 valid_1's l2: 0.45681
[4000] training's l2: 0.363201 valid_1's l2: 0.456368
[4500] training's l2: 0.354163 valid_1's l2: 0.455923
[5000] training's l2: 0.345734 valid_1's l2: 0.456248
Early stopping, best iteration is:
[4544] training's l2: 0.353382 valid_1's l2: 0.455761
CV score: 0.45189895
Feature-importance visualization
# ---------------- Feature importance ----------------
# Show every column and every row, and widen the column display so long
# feature names are not truncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
# Use the fully-qualified option name; the bare 'max_colwidth' alias was
# deprecated and later removed in pandas.
pd.set_option('display.max_colwidth', 100)
df = pd.DataFrame(data[use_feature].columns.tolist(), columns=['feature'])
# Importances come from the last LightGBM fold model trained above.
df['importance'] = list(lgb_258.feature_importance())
df = df.sort_values(by='importance', ascending=False)
# Plot only the top-50 features to keep the chart readable.
plt.figure(figsize=(14, 28))
sns.barplot(x="importance", y="feature", data=df.head(50))
plt.title('Features importance (averaged/folds)')
plt.tight_layout()
plt.show()
[External image failed to transfer — the source site may have hotlink protection; save the image locally and re-upload it (img-IHy2X9I8-1621339633711, output_43_0.svg)]
[External image failed to transfer — the source site may have hotlink protection; save the image locally and re-upload it (img-jArVnFez-1621339633713, output_43_1.svg)]
258-dimensional features — XGBoost
# XGBoost hyper-parameters for the 258-feature model.
xgb_258_params = {
    'eta': 0.02,                  # learning rate
    'max_depth': 6,
    'min_child_weight': 3,        # minimum sum of instance weights required in a leaf
    'gamma': 0,                   # minimum loss reduction required to split a node
    'subsample': 0.7,             # fraction of rows sampled per tree
    'colsample_bytree': 0.3,      # fraction of columns (features) sampled per tree
    'lambda': 2,                  # L2 regularisation weight
    # 'reg:linear' is deprecated (the training log warned about it);
    # 'reg:squarederror' is the modern equivalent.
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    # 'silent' is no longer a recognised parameter (the log flagged it as
    # unused); 'verbosity': 0 is the current way to suppress messages.
    'verbosity': 0,
    'nthread': -1}
# 5-fold stratified CV for XGBoost on the 258-feature set.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
oof_xgb_258 = np.zeros(len(X_train_258))
predictions_xgb_258 = np.zeros(len(X_test_258))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_258, y_train)):
    print("fold n°{}".format(fold_ + 1))
    fold_train = xgb.DMatrix(X_train_258[trn_idx], y_train[trn_idx])
    fold_valid = xgb.DMatrix(X_train_258[val_idx], y_train[val_idx])
    # Evaluate on both sets so early stopping watches the validation RMSE.
    watchlist = [(fold_train, 'train'), (fold_valid, 'valid_data')]
    xgb_258 = xgb.train(dtrain=fold_train, num_boost_round=3000,
                        evals=watchlist, early_stopping_rounds=600,
                        verbose_eval=500, params=xgb_258_params)
    # Predict with only the trees up to the best early-stopping round.
    oof_xgb_258[val_idx] = xgb_258.predict(xgb.DMatrix(X_train_258[val_idx]),
                                           ntree_limit=xgb_258.best_ntree_limit)
    predictions_xgb_258 += xgb_258.predict(xgb.DMatrix(X_test_258),
                                           ntree_limit=xgb_258.best_ntree_limit) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_xgb_258, target)))
fold n°1
[16:06:31] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/objective/regression_obj.cu:171: reg:linear is now deprecated in favor of reg:squarederror.
[16:06:31] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573:
Parameters: { "silent" } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[0] train-rmse:3.39998 valid_data-rmse:3.40011
[500] train-rmse:0.40983 valid_data-rmse:0.68528
[1000] train-rmse:0.27662 valid_data-rmse:0.68620
[1133] train-rmse:0.24855 valid_data-rmse:0.68668
fold n°2
[16:06:39] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/objective/regression_obj.cu:171: reg:linear is now deprecated in favor of reg:squarederror.
[16:06:39] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573:
Parameters: { "silent" } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[0] train-rmse:3.39998 valid_data-rmse:3.40038
[500] train-rmse:0.40892 valid_data-rmse:0.68206
[1000] train-rmse:0.27649 valid_data-rmse:0.68281
[1252] train-rmse:0.22529 valid_data-rmse:0.68423
fold n°3
[16:06:48] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/objective/regression_obj.cu:171: reg:linear is now deprecated in favor of reg:squarederror.
[16:06:48] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573:
Parameters: { "silent" } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[0] train-rmse:3.39997 valid_data-rmse:3.40034
[500] train-rmse:0.40853 valid_data-rmse:0.67091
[1000] train-rmse:0.27626 valid_data-rmse:0.67246
[1317] train-rmse:0.21417 valid_data-rmse:0.67286
fold n°4
[16:06:57] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/objective/regression_obj.cu:171: reg:linear is now deprecated in favor of reg:squarederror.
[16:06:57] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573:
Parameters: { "silent" } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[0] train-rmse:3.40002 valid_data-rmse:3.40040
[500] train-rmse:0.41018 valid_data-rmse:0.67207
[1000] train-rmse:0.28009 valid_data-rmse:0.66934
[1500] train-rmse:0.18878 valid_data-rmse:0.67045
[1663] train-rmse:0.16601 valid_data-rmse:0.67046
fold n°5
[16:07:09] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/objective/regression_obj.cu:171: reg:linear is now deprecated in favor of reg:squarederror.
[16:07:09] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573:
Parameters: { "silent" } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[0] train-rmse:3.40025 valid_data-rmse:3.40000
[500] train-rmse:0.41245 valid_data-rmse:0.66407
[1000] train-rmse:0.27879 valid_data-rmse:0.66627
[1001] train-rmse:0.27854 valid_data-rmse:0.66625
CV score: 0.45394497
Random forest — 258-dimensional features
# 5-fold CV for the random-forest regressor on the 258-feature set.
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
oof_rfr_258 = np.zeros(len(X_train_258))
predictions_rfr_258 = np.zeros(len(X_test_258))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_258, y_train)):
    print("fold n°{}".format(fold_ + 1))
    fold_x = X_train_258[trn_idx]
    fold_y = y_train[trn_idx]
    # verbose: 0 = no log output, 1 = progress messages, 2 = one line per step;
    # n_jobs=-1 parallelises tree building across all cores.
    rfr_258 = rfr(n_estimators=1600, max_depth=9, min_samples_leaf=9,
                  min_weight_fraction_leaf=0.0, max_features=0.25,
                  verbose=1, n_jobs=-1)
    rfr_258.fit(fold_x, fold_y)
    # Out-of-fold predictions plus the 5-fold average on the test set.
    oof_rfr_258[val_idx] = rfr_258.predict(X_train_258[val_idx])
    predictions_rfr_258 += rfr_258.predict(X_test_258) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_rfr_258, target)))
fold n°1
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done 26 tasks | elapsed: 0.2s
[Parallel(n_jobs=-1)]: Done 176 tasks | elapsed: 1.3s
[Parallel(n_jobs=-1)]: Done 426 tasks | elapsed: 3.1s
[Parallel(n_jobs=-1)]: Done 776 tasks | elapsed: 5.7s
[Parallel(n_jobs=-1)]: Done 1226 tasks | elapsed: 9.0s
[Parallel(n_jobs=-1)]: Done 1600 out of 1600 | elapsed: 11.7s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done 26 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 176 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 426 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 776 tasks | elapsed: 0.1s
[Parallel(n_jobs=12)]: Done 1226 tasks | elapsed: 0.2s
[Parallel(n_jobs=12)]: Done 1600 out of 1600 | elapsed: 0.3s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done 26 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 176 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 426 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 776 tasks | elapsed: 0.1s
[Parallel(n_jobs=12)]: Done 1226 tasks | elapsed: 0.2s
[Parallel(n_jobs=12)]: Done 1600 out of 1600 | elapsed: 0.3s finished
fold n°2
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done 26 tasks | elapsed: 0.2s
[Parallel(n_jobs=-1)]: Done 176 tasks | elapsed: 1.3s
[Parallel(n_jobs=-1)]: Done 426 tasks | elapsed: 3.1s
[Parallel(n_jobs=-1)]: Done 776 tasks | elapsed: 5.7s
[Parallel(n_jobs=-1)]: Done 1226 tasks | elapsed: 8.9s
[Parallel(n_jobs=-1)]: Done 1600 out of 1600 | elapsed: 11.6s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done 26 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 176 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 426 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 776 tasks | elapsed: 0.1s
[Parallel(n_jobs=12)]: Done 1226 tasks | elapsed: 0.2s
[Parallel(n_jobs=12)]: Done 1600 out of 1600 | elapsed: 0.2s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done 26 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 176 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 426 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 776 tasks | elapsed: 0.1s
[Parallel(n_jobs=12)]: Done 1226 tasks | elapsed: 0.2s
[Parallel(n_jobs=12)]: Done 1600 out of 1600 | elapsed: 0.3s finished
fold n°3
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done 26 tasks | elapsed: 0.2s
[Parallel(n_jobs=-1)]: Done 176 tasks | elapsed: 1.2s
[Parallel(n_jobs=-1)]: Done 426 tasks | elapsed: 3.1s
[Parallel(n_jobs=-1)]: Done 776 tasks | elapsed: 5.7s
[Parallel(n_jobs=-1)]: Done 1226 tasks | elapsed: 8.9s
[Parallel(n_jobs=-1)]: Done 1600 out of 1600 | elapsed: 11.6s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done 26 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 176 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 426 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 776 tasks | elapsed: 0.1s
[Parallel(n_jobs=12)]: Done 1226 tasks | elapsed: 0.2s
[Parallel(n_jobs=12)]: Done 1600 out of 1600 | elapsed: 0.2s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done 26 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 176 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 426 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 776 tasks | elapsed: 0.1s
[Parallel(n_jobs=12)]: Done 1226 tasks | elapsed: 0.2s
[Parallel(n_jobs=12)]: Done 1600 out of 1600 | elapsed: 0.3s finished
fold n°4
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done 26 tasks | elapsed: 0.2s
[Parallel(n_jobs=-1)]: Done 176 tasks | elapsed: 1.2s
[Parallel(n_jobs=-1)]: Done 426 tasks | elapsed: 3.0s
[Parallel(n_jobs=-1)]: Done 776 tasks | elapsed: 5.5s
[Parallel(n_jobs=-1)]: Done 1226 tasks | elapsed: 8.7s
[Parallel(n_jobs=-1)]: Done 1600 out of 1600 | elapsed: 11.5s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done 26 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 176 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 426 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 776 tasks | elapsed: 0.1s
[Parallel(n_jobs=12)]: Done 1226 tasks | elapsed: 0.2s
[Parallel(n_jobs=12)]: Done 1600 out of 1600 | elapsed: 0.3s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done 26 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 176 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 426 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 776 tasks | elapsed: 0.1s
[Parallel(n_jobs=12)]: Done 1226 tasks | elapsed: 0.2s
[Parallel(n_jobs=12)]: Done 1600 out of 1600 | elapsed: 0.3s finished
fold n°5
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done 26 tasks | elapsed: 0.2s
[Parallel(n_jobs=-1)]: Done 176 tasks | elapsed: 1.3s
[Parallel(n_jobs=-1)]: Done 426 tasks | elapsed: 3.1s
[Parallel(n_jobs=-1)]: Done 776 tasks | elapsed: 5.7s
[Parallel(n_jobs=-1)]: Done 1226 tasks | elapsed: 9.0s
[Parallel(n_jobs=-1)]: Done 1600 out of 1600 | elapsed: 11.8s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done 26 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 176 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 426 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 776 tasks | elapsed: 0.1s
[Parallel(n_jobs=12)]: Done 1226 tasks | elapsed: 0.2s
[Parallel(n_jobs=12)]: Done 1600 out of 1600 | elapsed: 0.3s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done 26 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 176 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 426 tasks | elapsed: 0.0s
[Parallel(n_jobs=12)]: Done 776 tasks | elapsed: 0.1s
[Parallel(n_jobs=12)]: Done 1226 tasks | elapsed: 0.2s
[Parallel(n_jobs=12)]: Done 1600 out of 1600 | elapsed: 0.3s finished
CV score: 0.47827693