This material is from the 25th DataWhale team-learning session; the open-source content is available at: https://github.com/datawhalechina/team-learning-data-mining/tree/master/EnsembleLearning
import os
import time
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve, mean_squared_error, mean_absolute_error, f1_score
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor as rfr
from sklearn.ensemble import ExtraTreesRegressor as etr
from sklearn.linear_model import BayesianRidge as br
from sklearn.ensemble import GradientBoostingRegressor as gbr
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression as lr
from sklearn.linear_model import ElasticNet as en
from sklearn.kernel_ridge import KernelRidge as kr
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold, RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
import logging
import warnings
warnings.filterwarnings('ignore')  #suppress warnings
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_info_columns', 500)
pd.set_option('display.float_format', lambda x: '%.5f' % x)
Import the dataset
train = pd.read_csv('train.csv', parse_dates=['survey_time'], encoding='latin-1')
test = pd.read_csv("test.csv", parse_dates=['survey_time'], encoding='latin-1')  #latin-1 is backward compatible with ASCII
train.shape, test.shape
((8000, 140), (2968, 139))
train.head()
[train.head() output omitted: the first five rows across all 140 columns, from id and happiness through public_service_9; too wide to reproduce here.]
train['happiness'].value_counts()
4 4818
5 1410
3 1159
2 497
1 104
-8 12
Name: happiness, dtype: int64
train = train[train["happiness"]!=-8].reset_index(drop=True)  #drop the rows where "happiness" is -8
train_data_copy = train.copy()
train_data_copy.shape
(7988, 140)
target_col = "happiness"  #target column
target = train_data_copy[target_col]
del train_data_copy[target_col]  #remove the target column
data = pd.concat([train_data_copy, test], axis=0, ignore_index=True)
train.shape, target.shape, data.shape
((7988, 140), (7988,), (10956, 139))
train.happiness.describe()  #basic statistics of the target
count 7988.00000
mean 3.86793
std 0.81872
min 1.00000
25% 4.00000
50% 4.00000
75% 4.00000
max 5.00000
Name: happiness, dtype: float64
Data preprocessing
data.isna().sum().sort_values(ascending=False)[:25]
edu_other 10950
invest_other 10911
property_other 10867
join_party 9831
s_work_type 7437
s_work_status 7437
work_status 6932
work_yr 6932
work_manage 6931
work_type 6931
edu_yr 2754
marital_now 2445
s_political 2365
s_hukou 2365
s_income 2365
s_birth 2365
s_edu 2365
s_work_exper 2365
edu_status 1569
minor_child 1447
marital_1st 1128
social_friend 1096
social_neighbor 1096
hukou_loc 4
family_income 1
dtype: int64
First, the recurring negative values in the data need to be handled. Since the only negative values present are -1, -2, -3 and -8, each of them is counted separately.
#make features: +5 columns
#the csv contains negative values (-1, -2, -3, -8); treat them as markers of problematic answers, but do not drop them
def getres1(row):
    return len([x for x in row.values if type(x)==int and x<0])
def getres2(row):
    return len([x for x in row.values if type(x)==int and x==-8])
def getres3(row):
    return len([x for x in row.values if type(x)==int and x==-1])
def getres4(row):
    return len([x for x in row.values if type(x)==int and x==-2])
def getres5(row):
    return len([x for x in row.values if type(x)==int and x==-3])
#count per row
data['neg1'] = data[data.columns].apply(lambda row: getres1(row), axis=1)
data.loc[data['neg1']>20, 'neg1'] = 20  #smoothing: cap the count at 20
data['neg2'] = data[data.columns].apply(lambda row: getres2(row), axis=1)
data['neg3'] = data[data.columns].apply(lambda row: getres3(row), axis=1)
data['neg4'] = data[data.columns].apply(lambda row: getres4(row), axis=1)
data['neg5'] = data[data.columns].apply(lambda row: getres5(row), axis=1)
data['neg1'][:10]  #raw count of negatives per row; values above 20 are clipped to 20
0 5
1 0
2 3
3 2
4 2
5 1
6 1
7 5
8 0
9 1
Name: neg1, dtype: int64
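The row-wise apply above is slow on ten thousand rows. A vectorized sketch of the same counts, restricted to numeric columns and run before the neg* columns are added (results may differ slightly from the type(x)==int filter where float columns carry negative values):

num = data.select_dtypes(include=[np.number])  #numeric survey columns only
neg1_fast = (num < 0).sum(axis=1).clip(upper=20)  #negatives per row, capped at 20
neg2_fast = (num == -8).sum(axis=1)
neg3_fast = (num == -1).sum(axis=1)
neg4_fast = (num == -2).sum(axis=1)
neg5_fast = (num == -3).sum(axis=1)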
Fill in the missing values with fillna(value), where value is chosen case by case: most missing fields are treated as zero, hukou_loc is set to 1 (its minimum), and family_income is set to 66365, the mean family income over all households.
#fill missing values: 25 columns contain NaNs; 4 will be dropped later and 21 are filled here (the dropping is not done at this point)
#the columns below all have defaults, filled according to the situation
data['work_status'] = data['work_status'].fillna(0)
data['work_yr'] = data['work_yr'].fillna(0)
data['work_manage'] = data['work_manage'].fillna(0)
data['work_type'] = data['work_type'].fillna(0)
data['edu_yr'] = data['edu_yr'].fillna(0)
data['edu_status'] = data['edu_status'].fillna(0)
data['s_work_type'] = data['s_work_type'].fillna(0)
data['s_work_status'] = data['s_work_status'].fillna(0)
data['s_political'] = data['s_political'].fillna(0)
data['s_hukou'] = data['s_hukou'].fillna(0)
data['s_income'] = data['s_income'].fillna(0)
data['s_birth'] = data['s_birth'].fillna(0)
data['s_edu'] = data['s_edu'].fillna(0)
data['s_work_exper'] = data['s_work_exper'].fillna(0)
data['minor_child'] = data['minor_child'].fillna(0)
data['marital_now'] = data['marital_now'].fillna(0)
data['marital_1st'] = data['marital_1st'].fillna(0)
data['social_neighbor'] = data['social_neighbor'].fillna(0)
data['social_friend'] = data['social_friend'].fillna(0)
data['hukou_loc'] = data['hukou_loc'].fillna(1)  #minimum is 1, i.e. registered residence
data['family_income'] = data['family_income'].fillna(66365)  #mean after removing problem values
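The twenty-one assignments above can be condensed into a single call; a minimal equivalent sketch using a column-to-value mapping:

fill_zero = ['work_status', 'work_yr', 'work_manage', 'work_type', 'edu_yr', 'edu_status',
             's_work_type', 's_work_status', 's_political', 's_hukou', 's_income', 's_birth',
             's_edu', 's_work_exper', 'minor_child', 'marital_now', 'marital_1st',
             'social_neighbor', 'social_friend']
data = data.fillna({**{c: 0 for c in fill_zero}, 'hukou_loc': 1, 'family_income': 66365})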
Beyond that, some information comes in special formats and needs separate handling, notably the time-related fields. Two things happen here. First, the "continuous" age is bucketed into layers, six intervals in this case. Second, the actual age is computed: the raw data only records the birth year and the survey time, from which each respondent's age at survey time is derived.
#144 + 1 = 145 columns
#continue processing the special columns (see happiness_index.xlsx for the questionnaire codebook)
data['survey_time'] = pd.to_datetime(data['survey_time'], format='%Y-%m-%d', errors='coerce')  #errors='coerce' guards against inconsistent time formats
data['survey_time'] = data['survey_time'].dt.year  #keep only the year, to compute age
data['age'] = data['survey_time'] - data['birth']
# print(data['age'], data['survey_time'], data['birth'])
#age buckets: 145 + 1 = 146 columns
bins = [0, 17, 26, 34, 50, 63, 100]
data['age_bin'] = pd.cut(data['age'], bins, labels=[0, 1, 2, 3, 4, 5])
data.shape
(10956, 146)
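As a quick sanity check on the bucketing (a sketch; output omitted):

print(data['age_bin'].value_counts().sort_index())  #respondents per age bucket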
Since family income is continuous, the mode is no longer suitable there, so the mean is used to fill its missing values. A third approach leans on everyday common sense: a negative "religion" value, for instance, is taken to mean "not religious", and "frequency of religious activity" is set to 1, i.e. never attends. This subjective imputation is the approach I use most in this step; I filled the defaults just as I would fill in the questionnaire myself.
#'religion'
data.loc[data['religion']<0, 'religion'] = 1  #1 = not religious
data.loc[data['religion_freq']<0, 'religion_freq'] = 1  #1 = never attended
#'education'
data.loc[data['edu']<0, 'edu'] = 4  #junior high school
data.loc[data['edu_status']<0, 'edu_status'] = 0
data.loc[data['edu_yr']<0, 'edu_yr'] = 0
#'personal income'
data.loc[data['income']<0, 'income'] = 0  #treated as no income
#'political affiliation'
data.loc[data['political']<0, 'political'] = 1  #treated as ordinary citizen
#weight
data.loc[(data['weight_jin']<=80)&(data['height_cm']>=160), 'weight_jin'] = data['weight_jin']*2
data.loc[data['weight_jin']<=60, 'weight_jin'] = data['weight_jin']*2  #my own heuristic: no adult weighs 60 jin (30 kg)
#height
data.loc[data['height_cm']<150, 'height_cm'] = 150  #realistic floor for adults
#'health'
data.loc[data['health']<0, 'health'] = 4  #treated as fairly healthy
data.loc[data['health_problem']<0, 'health_problem'] = 4
#'depression'
data.loc[data['depression']<0, 'depression'] = 4  #most people: rarely
#'media'
data.loc[data['media_1']<0, 'media_1'] = 1  #all set to "never"
data.loc[data['media_2']<0, 'media_2'] = 1
data.loc[data['media_3']<0, 'media_3'] = 1
data.loc[data['media_4']<0, 'media_4'] = 1
data.loc[data['media_5']<0, 'media_5'] = 1
data.loc[data['media_6']<0, 'media_6'] = 1
#'leisure activities' -- question A30
data.loc[data['leisure_1']<0, 'leisure_1'] = 1  #my own judgment: watches TV every day
data.loc[data['leisure_2']<0, 'leisure_2'] = 5  #never goes to the movies
data.loc[data['leisure_3']<0, 'leisure_3'] = 3  #cultural activities (shows, exhibitions) a few times a month
The mode (mode() in the code) is used to correct the remaining anomalous values; since these features describe leisure activities, mode imputation is the reasonable choice here.
data.loc[data['leisure_4']<0, 'leisure_4'] = data['leisure_4'].mode()[0]  #mode; [0] extracts the scalar from the Series that mode() returns
data.loc[data['leisure_5']<0, 'leisure_5'] = data['leisure_5'].mode()[0]
data.loc[data['leisure_6']<0, 'leisure_6'] = data['leisure_6'].mode()[0]
data.loc[data['leisure_7']<0, 'leisure_7'] = data['leisure_7'].mode()[0]
data.loc[data['leisure_8']<0, 'leisure_8'] = data['leisure_8'].mode()[0]
data.loc[data['leisure_9']<0, 'leisure_9'] = data['leisure_9'].mode()[0]
data.loc[data['leisure_10']<0, 'leisure_10'] = data['leisure_10'].mode()[0]
data.loc[data['leisure_11']<0, 'leisure_11'] = data['leisure_11'].mode()[0]
data.loc[data['leisure_12']<0, 'leisure_12'] = data['leisure_12'].mode()[0]
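The nine assignments above follow one pattern; a compact equivalent sketch:

for i in range(4, 13):
    col = 'leisure_' + str(i)
    data.loc[data[col] < 0, col] = data[col].mode()[0]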
data.loc[data['socialize']<0, 'socialize'] = 2  #rarely
data.loc[data['relax']<0, 'relax'] = 4  #often
data.loc[data['learn']<0, 'learn'] = 1  #never
#'social contact'
data.loc[data['social_neighbor']<0, 'social_neighbor'] = 0
data.loc[data['social_friend']<0, 'social_friend'] = 0
data.loc[data['socia_outing']<0, 'socia_outing'] = 1
data.loc[data['neighbor_familiarity']<0, 'neighbor_familiarity'] = 4
#'social equity'
data.loc[data['equity']<0, 'equity'] = 4
#'social class'
data.loc[data['class_10_before']<0, 'class_10_before'] = 3
data.loc[data['class']<0, 'class'] = 5
data.loc[data['class_10_after']<0, 'class_10_after'] = 5
data.loc[data['class_14']<0, 'class_14'] = 2
#'work'
data.loc[data['work_status']<0, 'work_status'] = 0
data.loc[data['work_yr']<0, 'work_yr'] = 0
data.loc[data['work_manage']<0, 'work_manage'] = 0
data.loc[data['work_type']<0, 'work_type'] = 0
#'social insurance'
data.loc[data['insur_1']<0, 'insur_1'] = 1
data.loc[data['insur_2']<0, 'insur_2'] = 1
data.loc[data['insur_3']<0, 'insur_3'] = 1
data.loc[data['insur_4']<0, 'insur_4'] = 1
Mean imputation (mean() in the code): family income is a continuous value, so the mode no longer applies, and the mean is used directly to fill in the missing entries.
#family
family_income_mean = data['family_income'].mean()
data.loc[data['family_income']<0, 'family_income'] = family_income_mean
data.loc[data['family_m']<0, 'family_m'] = 2
data.loc[data['family_status']<0, 'family_status'] = 3
data.loc[data['house']<0, 'house'] = 1
data.loc[data['car']<0, 'car'] = 0
data.loc[data['car']==2, 'car'] = 0  #2 means "no car"; remap to 0
data.loc[data['son']<0, 'son'] = 1
data.loc[data['daughter']<0, 'daughter'] = 0
data.loc[data['minor_child']<0, 'minor_child'] = 0
#'marriage'
data.loc[data['marital_1st']<0, 'marital_1st'] = 0
data.loc[data['marital_now']<0, 'marital_now'] = 0
#'spouse'
data.loc[data['s_birth']<0, 's_birth'] = 0
data.loc[data['s_edu']<0, 's_edu'] = 0
data.loc[data['s_political']<0, 's_political'] = 0
data.loc[data['s_hukou']<0, 's_hukou'] = 0
data.loc[data['s_income']<0, 's_income'] = 0
data.loc[data['s_work_type']<0, 's_work_type'] = 0
data.loc[data['s_work_status']<0, 's_work_status'] = 0
data.loc[data['s_work_exper']<0, 's_work_exper'] = 0
#'parents'
data.loc[data['f_birth']<0, 'f_birth'] = 1945
data.loc[data['f_edu']<0, 'f_edu'] = 1
data.loc[data['f_political']<0, 'f_political'] = 1
data.loc[data['f_work_14']<0, 'f_work_14'] = 2
data.loc[data['m_birth']<0, 'm_birth'] = 1940
data.loc[data['m_edu']<0, 'm_edu'] = 1
data.loc[data['m_political']<0, 'm_political'] = 1
data.loc[data['m_work_14']<0, 'm_work_14'] = 2
#socio-economic status compared with peers
data.loc[data['status_peer']<0, 'status_peer'] = 2
#socio-economic status compared with three years ago
data.loc[data['status_3_before']<0, 'status_3_before'] = 2
#'view'
data.loc[data['view']<0, 'view'] = 4
#expected annual income
data.loc[data['inc_ability']<=0, 'inc_ability'] = 2
inc_exp_mean = data['inc_exp'].mean()
data.loc[data['inc_exp']<=0, 'inc_exp'] = inc_exp_mean  #mean imputation
#remaining features: fill with the mode
for i in range(1, 9+1):
    col = 'public_service_' + str(i)
    data.loc[data[col]<0, col] = data[col].dropna().mode()[0]  #[0] extracts the scalar (older pandas needed .values)
for i in range(1, 13+1):
    col = 'trust_' + str(i)
    data.loc[data[col]<0, col] = data[col].dropna().mode()[0]
Data augmentation
In this step we analyze the relationships between features and augment the data accordingly. After some thought, I added the following features: age at first marriage, age at the current marriage, whether remarried, spouse's age, age gap with the spouse, various income ratios (income relative to the spouse, expected income in ten years relative to current income, and so on), income-to-floor-area ratios (again including the expected-income variants), social class features (class in ten years, class at age 14, etc.), plus a leisure index, a satisfaction index and a trust index. Beyond that, I normalized within each province, city and county, e.g. the mean income within a province, and each individual's standing on every indicator relative to others in the same province, city or county. Comparisons with same-age peers are handled the same way: income, health and so on relative to people of the same age.
data['public_service_'+str(i)].dropna().mode()  #leftover check: the mode of the last public_service column
0 80
dtype: int64
#age at first marriage: feature 147
data['marital_1stbir'] = data['marital_1st'] - data['birth']
#age at current marriage: 148
data['marital_nowtbir'] = data['marital_now'] - data['birth']
#remarried or not: 149
data['mar'] = data['marital_nowtbir'] - data['marital_1stbir']
#spouse's age: 150
data['marital_sbir'] = data['marital_now'] - data['s_birth']
#age gap with spouse: 151
data['age_'] = data['marital_nowtbir'] - data['marital_sbir']
#income ratios: 151 + 7 = 158
data['income/s_income'] = data['income']/(data['s_income']+1)
data['income+s_income'] = data['income']+(data['s_income']+1)
data['income/family_income'] = data['income']/(data['family_income']+1)
data['all_income/family_income'] = (data['income']+data['s_income'])/(data['family_income']+1)
data['income/inc_exp'] = data['income']/(data['inc_exp']+1)
data['family_income/m'] = data['family_income']/(data['family_m']+0.01)
data['income/m'] = data['income']/(data['family_m']+0.01)
#income / floor-area ratios: 158 + 4 = 162
data['income/floor_area'] = data['income']/(data['floor_area']+0.01)
data['all_income/floor_area'] = (data['income']+data['s_income'])/(data['floor_area']+0.01)
data['family_income/floor_area'] = data['family_income']/(data['floor_area']+0.01)
data['floor_area/m'] = data['floor_area']/(data['family_m']+0.01)
#class: 162 + 3 = 165
data['class_10_diff'] = (data['class_10_after'] - data['class'])
data['class_diff'] = data['class'] - data['class_10_before']
data['class_14_diff'] = data['class'] - data['class_14']
#leisure index: 166
leisure_fea_lis = ['leisure_'+str(i) for i in range(1,13)]
data['leisure_sum'] = data[leisure_fea_lis].sum(axis=1)  #skewed
#satisfaction index: 167
public_service_fea_lis = ['public_service_'+str(i) for i in range(1,10)]
data['public_service_sum'] = data[public_service_fea_lis].sum(axis=1)  #skewed
#trust index: 168
trust_fea_lis = ['trust_'+str(i) for i in range(1,14)]
data['trust_sum'] = data[trust_fea_lis].sum(axis=1)  #skewed
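The #skewed notes flag that these three sums have skewed distributions; a quick check (a sketch, output omitted):

print(data[['leisure_sum', 'public_service_sum', 'trust_sum']].skew())  #sample skewness per column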
#province mean 168+13=181
data['province_income_mean'] = data.groupby(['province'])['income'].transform('mean').values
data['province_family_income_mean'] = data.groupby(['province'])['family_income'].transform('mean').values
data['province_equity_mean'] = data.groupby(['province'])['equity'].transform('mean').values
data['province_depression_mean'] = data.groupby(['province'])['depression'].transform('mean').values
data['province_floor_area_mean'] = data.groupby(['province'])['floor_area'].transform('mean').values
data['province_health_mean'] = data.groupby(['province'])['health'].transform('mean').values
data['province_class_10_diff_mean'] = data.groupby(['province'])['class_10_diff'].transform('mean').values
data['province_class_mean'] = data.groupby(['province'])['class'].transform('mean').values
data['province_health_problem_mean'] = data.groupby(['province'])['health_problem'].transform('mean').values
data['province_family_status_mean'] = data.groupby(['province'])['family_status'].transform('mean').values
data['province_leisure_sum_mean'] = data.groupby(['province'])['leisure_sum'].transform('mean').values
data['province_public_service_sum_mean'] = data.groupby(['province'])['public_service_sum'].transform('mean').values
data['province_trust_sum_mean'] = data.groupby(['province'])['trust_sum'].transform('mean').values
#city mean 181+13=194
data['city_income_mean'] = data.groupby(['city'])['income'].transform('mean').values
data['city_family_income_mean'] = data.groupby(['city'])['family_income'].transform('mean').values
data['city_equity_mean'] = data.groupby(['city'])['equity'].transform('mean').values
data['city_depression_mean'] = data.groupby(['city'])['depression'].transform('mean').values
data['city_floor_area_mean'] = data.groupby(['city'])['floor_area'].transform('mean').values
data['city_health_mean'] = data.groupby(['city'])['health'].transform('mean').values
data['city_class_10_diff_mean'] = data.groupby(['city'])['class_10_diff'].transform('mean').values
data['city_class_mean'] = data.groupby(['city'])['class'].transform('mean').values
data['city_health_problem_mean'] = data.groupby(['city'])['health_problem'].transform('mean').values
data['city_family_status_mean'] = data.groupby(['city'])['family_status'].transform('mean').values
data['city_leisure_sum_mean'] = data.groupby(['city'])['leisure_sum'].transform('mean').values
data['city_public_service_sum_mean'] = data.groupby(['city'])['public_service_sum'].transform('mean').values
data['city_trust_sum_mean'] = data.groupby(['city'])['trust_sum'].transform('mean').values
#county mean 194 + 13 = 207
data['county_income_mean'] = data.groupby(['county'])['income'].transform('mean').values
data['county_family_income_mean'] = data.groupby(['county'])['family_income'].transform('mean').values
data['county_equity_mean'] = data.groupby(['county'])['equity'].transform('mean').values
data['county_depression_mean'] = data.groupby(['county'])['depression'].transform('mean').values
data['county_floor_area_mean'] = data.groupby(['county'])['floor_area'].transform('mean').values
data['county_health_mean'] = data.groupby(['county'])['health'].transform('mean').values
data['county_class_10_diff_mean'] = data.groupby(['county'])['class_10_diff'].transform('mean').values
data['county_class_mean'] = data.groupby(['county'])['class'].transform('mean').values
data['county_health_problem_mean'] = data.groupby(['county'])['health_problem'].transform('mean').values
data['county_family_status_mean'] = data.groupby(['county'])['family_status'].transform('mean').values
data['county_leisure_sum_mean'] = data.groupby(['county'])['leisure_sum'].transform('mean').values
data['county_public_service_sum_mean'] = data.groupby(['county'])['public_service_sum'].transform('mean').values
data['county_trust_sum_mean'] = data.groupby(['county'])['trust_sum'].transform('mean').values
#ratios vs. the same province: 207 + 13 = 220
data['income/province'] = data['income']/(data['province_income_mean'])
data['family_income/province'] = data['family_income']/(data['province_family_income_mean'])
data['equity/province'] = data['equity']/(data['province_equity_mean'])
data['depression/province'] = data['depression']/(data['province_depression_mean'])
data['floor_area/province'] = data['floor_area']/(data['province_floor_area_mean'])
data['health/province'] = data['health']/(data['province_health_mean'])
data['class_10_diff/province'] = data['class_10_diff']/(data['province_class_10_diff_mean'])
data['class/province'] = data['class']/(data['province_class_mean'])
data['health_problem/province'] = data['health_problem']/(data['province_health_problem_mean'])
data['family_status/province'] = data['family_status']/(data['province_family_status_mean'])
data['leisure_sum/province'] = data['leisure_sum']/(data['province_leisure_sum_mean'])
data['public_service_sum/province'] = data['public_service_sum']/(data['province_public_service_sum_mean'])
data['trust_sum/province'] = data['trust_sum']/(data['province_trust_sum_mean']+1)
#ratios vs. the same city: 220 + 13 = 233
data['income/city'] = data['income']/(data['city_income_mean'])
data['family_income/city'] = data['family_income']/(data['city_family_income_mean'])
data['equity/city'] = data['equity']/(data['city_equity_mean'])
data['depression/city'] = data['depression']/(data['city_depression_mean'])
data['floor_area/city'] = data['floor_area']/(data['city_floor_area_mean'])
data['health/city'] = data['health']/(data['city_health_mean'])
data['class_10_diff/city'] = data['class_10_diff']/(data['city_class_10_diff_mean'])
data['class/city'] = data['class']/(data['city_class_mean'])
data['health_problem/city'] = data['health_problem']/(data['city_health_problem_mean'])
data['family_status/city'] = data['family_status']/(data['city_family_status_mean'])
data['leisure_sum/city'] = data['leisure_sum']/(data['city_leisure_sum_mean'])
data['public_service_sum/city'] = data['public_service_sum']/(data['city_public_service_sum_mean'])
data['trust_sum/city'] = data['trust_sum']/(data['city_trust_sum_mean'])
#ratios vs. the same county: 233 + 13 = 246
data['income/county'] = data['income']/(data['county_income_mean'])
data['family_income/county'] = data['family_income']/(data['county_family_income_mean'])
data['equity/county'] = data['equity']/(data['county_equity_mean'])
data['depression/county'] = data['depression']/(data['county_depression_mean'])
data['floor_area/county'] = data['floor_area']/(data['county_floor_area_mean'])
data['health/county'] = data['health']/(data['county_health_mean'])
data['class_10_diff/county'] = data['class_10_diff']/(data['county_class_10_diff_mean'])
data['class/county'] = data['class']/(data['county_class_mean'])
data['health_problem/county'] = data['health_problem']/(data['county_health_problem_mean'])
data['family_status/county'] = data['family_status']/(data['county_family_status_mean'])
data['leisure_sum/county'] = data['leisure_sum']/(data['county_leisure_sum_mean'])
data['public_service_sum/county'] = data['public_service_sum']/(data['county_public_service_sum_mean'])
data['trust_sum/county'] = data['trust_sum']/(data['county_trust_sum_mean'])
#age mean 246+ 13 =259
data['age_income_mean'] = data.groupby(['age'])['income'].transform('mean').values
data['age_family_income_mean'] = data.groupby(['age'])['family_income'].transform('mean').values
data['age_equity_mean'] = data.groupby(['age'])['equity'].transform('mean').values
data['age_depression_mean'] = data.groupby(['age'])['depression'].transform('mean').values
data['age_floor_area_mean'] = data.groupby(['age'])['floor_area'].transform('mean').values
data['age_health_mean'] = data.groupby(['age'])['health'].transform('mean').values
data['age_class_10_diff_mean'] = data.groupby(['age'])['class_10_diff'].transform('mean').values
data['age_class_mean'] = data.groupby(['age'])['class'].transform('mean').values
data['age_health_problem_mean'] = data.groupby(['age'])['health_problem'].transform('mean').values
data['age_family_status_mean'] = data.groupby(['age'])['family_status'].transform('mean').values
data['age_leisure_sum_mean'] = data.groupby(['age'])['leisure_sum'].transform('mean').values
data['age_public_service_sum_mean'] = data.groupby(['age'])['public_service_sum'].transform('mean').values
data['age_trust_sum_mean'] = data.groupby(['age'])['trust_sum'].transform('mean').values
#ratios vs. same-age peers: 259 + 13 = 272
data['income/age'] = data['income']/(data['age_income_mean'])
data['family_income/age'] = data['family_income']/(data['age_family_income_mean'])
data['equity/age'] = data['equity']/(data['age_equity_mean'])
data['depression/age'] = data['depression']/(data['age_depression_mean'])
data['floor_area/age'] = data['floor_area']/(data['age_floor_area_mean'])
data['health/age'] = data['health']/(data['age_health_mean'])
data['class_10_diff/age'] = data['class_10_diff']/(data['age_class_10_diff_mean'])
data['class/age'] = data['class']/(data['age_class_mean'])
data['health_problem/age'] = data['health_problem']/(data['age_health_problem_mean'])
data['family_status/age'] = data['family_status']/(data['age_family_status_mean'])
data['leisure_sum/age'] = data['leisure_sum']/(data['age_leisure_sum_mean'])
data['public_service_sum/age'] = data['public_service_sum']/(data['age_public_service_sum_mean'])
data['trust_sum/age'] = data['trust_sum']/(data['age_trust_sum_mean'])
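The four groupby blocks above repeat the same thirteen statistics for province, city, county and age. A compact sketch that generates both the group means and the ratios in a loop (equivalent up to the +1 the original adds to the trust_sum/province denominator):

stats = ['income', 'family_income', 'equity', 'depression', 'floor_area', 'health',
         'class_10_diff', 'class', 'health_problem', 'family_status',
         'leisure_sum', 'public_service_sum', 'trust_sum']
for key in ['province', 'city', 'county', 'age']:
    for s in stats:
        mean_col = key + '_' + s + '_mean'
        data[mean_col] = data.groupby(key)[s].transform('mean')  #group mean feature
        data[s + '/' + key] = data[s] / data[mean_col]  #individual relative to the group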
After the operations above, the feature set has grown from the initial 139 columns to 272 dimensions. Next come feature engineering, model training, and model fusion.
print('shape',data.shape)
data.head()
shape (10956, 272)
[data.head() output omitted: the first five rows across all 272 columns, including the new neg*, age, ratio, and group-mean features; too wide to reproduce here.]
We should also drop features with very few valid samples, i.e. those dominated by negative values or missing entries. Here I delete 9 features in total, including the free-text education field edu_other, leaving a final set of 263 features.
#272 - 9 = 263
#drop very sparse features and identifiers already used above
del_list = ['id', 'survey_time', 'edu_other', 'invest_other', 'property_other', 'join_party', 'province', 'city', 'county']
use_feature = [col for col in data.columns if col not in del_list]
data.fillna(0, inplace=True)  #fill whatever is still missing with 0
train_shape = train.shape[0]  #number of training rows
features = data[use_feature].columns  #feature names after dropping
X_train_263 = data[:train_shape][use_feature].values
y_train = target
X_test_263 = data[train_shape:][use_feature].values
X_train_263.shape  #the final 263 features
(7988, 263)
The 49 most important features are selected here as a second feature set alongside the 263-dimensional one.
imp_fea_49 = ['equity','depression','health','class','family_status','health_problem','class_10_after',
'equity/province','equity/city','equity/county',
'depression/province','depression/city','depression/county',
'health/province','health/city','health/county',
'class/province','class/city','class/county',
'family_status/province','family_status/city','family_status/county',
'family_income/province','family_income/city','family_income/county',
'floor_area/province','floor_area/city','floor_area/county',
'leisure_sum/province','leisure_sum/city','leisure_sum/county',
'public_service_sum/province','public_service_sum/city','public_service_sum/county',
'trust_sum/province','trust_sum/city','trust_sum/county',
'income/m','public_service_sum','class_diff','status_3_before','age_income_mean','age_floor_area_mean',
'weight_jin','height_cm',
'health/age','depression/age','equity/age','leisure_sum/age'
]
train_shape = train.shape[0]
X_train_49 = data[:train_shape][imp_fea_49].values
X_test_49 = data[train_shape:][imp_fea_49].values
X_train_49.shape  #the 49 most important features
(7988, 49)
The discrete variables that need one-hot encoding are selected and encoded, then combined into a third feature set of 383 dimensions.
cat_fea = ['survey_type','gender','nationality','edu_status','political','hukou','hukou_loc','work_exper','work_status','work_type',
'work_manage','marital','s_political','s_hukou','s_work_exper','s_work_status','s_work_type','f_political','f_work_14',
'm_political','m_work_14']
noc_fea = [col for col in use_feature if col not in cat_fea]  #non-categorical features
onehot_data = data[cat_fea].values
enc = preprocessing.OneHotEncoder(categories='auto')
oh_data = enc.fit_transform(onehot_data).toarray()
oh_data.shape  #one-hot encoded
X_train_oh = oh_data[:train_shape, :]
X_test_oh = oh_data[train_shape:, :]
X_train_oh.shape  #the training portion
X_train_383 = np.column_stack([data[:train_shape][noc_fea].values, X_train_oh])  #non-categorical features first, then the encoded categoricals
X_test_383 = np.column_stack([data[train_shape:][noc_fea].values, X_test_oh])
X_train_383.shape
(7988, 383)
With this we have built three feature sets (training matrices). The first is the 49 most important features extracted above, covering health, social class, income relative to same-age peers, and so on. The second is the augmented 263-dimensional set (effectively the base features). The third is the one-hot encoded set. One-hot encoding is used because some features are categorical codes: gender, for instance, is coded 1 for male and 2 for female, and nationality runs from 1 to 56; treating such codes as ordered numbers hurts a model's robustness, so each category is expanded into its own 0/1 indicator column.
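A minimal illustration of what OneHotEncoder produces (toy data, not from the survey):

toy = np.array([[1], [2], [1]])  #toy gender codes: male, female, male
print(preprocessing.OneHotEncoder(categories='auto').fit_transform(toy).toarray())
# [[1. 0.]
#  [0. 1.]
#  [1. 0.]]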
Modeling on the feature sets
LightGBM (263 features)
##### lgb_263 #
#LightGBM gradient-boosted trees
lgb_263_param = {
    'num_leaves': 7,
    'min_data_in_leaf': 20,  #minimum number of records a leaf may hold
    'objective': 'regression',
    'max_depth': -1,
    'learning_rate': 0.003,
    "boosting": "gbdt",  #gradient boosted decision trees
    "feature_fraction": 0.18,  #0.18 means 18% of the features are randomly selected to build each tree
    "bagging_freq": 1,
    "bagging_fraction": 0.55,  #fraction of the data used per iteration
    "bagging_seed": 14,
    "metric": 'mse',
    "lambda_l1": 0.1005,
    "lambda_l2": 0.1996,
    "verbosity": -1}
train_X, test_X, train_y, test_y = train_test_split(X_train_263, y_train, test_size=0.3, random_state=5)
trn_data = lgb.Dataset(train_X, train_y)
val_data = lgb.Dataset(test_X, test_y)
num_round = 10000
lgb_263 = lgb.train(lgb_263_param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=500, early_stopping_rounds=800)
Training until validation scores don't improve for 800 rounds
[500] training's l2: 0.495527 valid_1's l2: 0.528734
[1000] training's l2: 0.446779 valid_1's l2: 0.49778
[1500] training's l2: 0.420544 valid_1's l2: 0.485043
[2000] training's l2: 0.401514 valid_1's l2: 0.478223
[2500] training's l2: 0.386322 valid_1's l2: 0.474543
[3000] training's l2: 0.373553 valid_1's l2: 0.472387
[3500] training's l2: 0.362076 valid_1's l2: 0.471185
[4000] training's l2: 0.351497 valid_1's l2: 0.46987
[4500] training's l2: 0.341777 valid_1's l2: 0.469206
[5000] training's l2: 0.332783 valid_1's l2: 0.468999
[5500] training's l2: 0.324267 valid_1's l2: 0.468622
[6000] training's l2: 0.316143 valid_1's l2: 0.468594
[6500] training's l2: 0.308414 valid_1's l2: 0.468692
[7000] training's l2: 0.301177 valid_1's l2: 0.468907
Early stopping, best iteration is:
[6282] training's l2: 0.311767 valid_1's l2: 0.468444
Next, the trained LightGBM model is used to rank and visualize feature importance. From the result, "the most important feature is health/age, health relative to same-age peers, which matches our intuition". The quoted sentence is from the original write-up; my feature selection here does not use feature crosses, so the result differs slightly.
df = pd.DataFrame(data[use_feature].columns.tolist(), columns=['feature'])
df['importance'] = list(lgb_263.feature_importance())
df = df.sort_values(by='importance', ascending=False)
plt.figure(figsize=(10, 6))
sns.barplot(x="importance", y="feature", data=df.head(25))
plt.title('Features importance (averaged/folds)')
plt.tight_layout()
## 5-fold CV is still needed: the out-of-fold predictions are used later
##### lgb_263 #
#LightGBM gradient-boosted trees
lgb_263_param = {
    'num_leaves': 7,
    'min_data_in_leaf': 20,  #minimum number of records a leaf may hold
    'objective': 'regression',
    'max_depth': -1,
    'learning_rate': 0.003,
    "boosting": "gbdt",  #gradient boosted decision trees
    "feature_fraction": 0.18,  #0.18 means 18% of the features are randomly selected to build each tree
    "bagging_freq": 1,
    "bagging_fraction": 0.55,  #fraction of the data used per iteration
    "bagging_seed": 14,
    "metric": 'mse',
    "lambda_l1": 0.1005,
    "lambda_l2": 0.1996,
    "verbosity": -1}
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4)  #5-fold cross-validation split
oof_lgb_263 = np.zeros(len(X_train_263))
predictions_lgb_263 = np.zeros(len(X_test_263))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_263, y_train)):
    print("fold n°{}".format(fold_ + 1))
    trn_data = lgb.Dataset(X_train_263[trn_idx], y_train[trn_idx])
    val_data = lgb.Dataset(X_train_263[val_idx], y_train[val_idx])  #train : val = 4 : 1
    num_round = 10000
    lgb_263 = lgb.train(lgb_263_param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=500, early_stopping_rounds=800)
    oof_lgb_263[val_idx] = lgb_263.predict(X_train_263[val_idx], num_iteration=lgb_263.best_iteration)
    predictions_lgb_263 += lgb_263.predict(X_test_263, num_iteration=lgb_263.best_iteration) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_lgb_263, target)))
fold n°1
Training until validation scores don't improve for 800 rounds
[500] training's l2: 0.500105 valid_1's l2: 0.533762
[1000] training's l2: 0.451898 valid_1's l2: 0.501204
[1500] training's l2: 0.425907 valid_1's l2: 0.48827
[2000] training's l2: 0.407901 valid_1's l2: 0.482193
[2500] training's l2: 0.393494 valid_1's l2: 0.47853
[3000] training's l2: 0.381196 valid_1's l2: 0.476353
[3500] training's l2: 0.370417 valid_1's l2: 0.475057
[4000] training's l2: 0.360612 valid_1's l2: 0.47378
[4500] training's l2: 0.351611 valid_1's l2: 0.472922
[5000] training's l2: 0.343226 valid_1's l2: 0.472396
[5500] training's l2: 0.33543 valid_1's l2: 0.472008
[6000] training's l2: 0.327837 valid_1's l2: 0.471958
[6500] training's l2: 0.320536 valid_1's l2: 0.471644
[7000] training's l2: 0.313701 valid_1's l2: 0.471573
[7500] training's l2: 0.307048 valid_1's l2: 0.471296
[8000] training's l2: 0.30071 valid_1's l2: 0.471248
[8500] training's l2: 0.294537 valid_1's l2: 0.471629
Early stopping, best iteration is:
[7884] training's l2: 0.302149 valid_1's l2: 0.471091
fold n°2
Training until validation scores don't improve for 800 rounds
[500] training's l2: 0.504839 valid_1's l2: 0.514284
[1000] training's l2: 0.455739 valid_1's l2: 0.479913
[1500] training's l2: 0.429686 valid_1's l2: 0.466668
[2000] training's l2: 0.4119 valid_1's l2: 0.459783
[2500] training's l2: 0.397772 valid_1's l2: 0.455913
[3000] training's l2: 0.385641 valid_1's l2: 0.453108
[3500] training's l2: 0.374976 valid_1's l2: 0.451613
[4000] training's l2: 0.365204 valid_1's l2: 0.450372
[4500] training's l2: 0.356057 valid_1's l2: 0.44945
[5000] training's l2: 0.347606 valid_1's l2: 0.448612
[5500] training's l2: 0.339646 valid_1's l2: 0.448144
[6000] training's l2: 0.331975 valid_1's l2: 0.4479
[6500] training's l2: 0.324745 valid_1's l2: 0.448042
Early stopping, best iteration is:
[6067] training's l2: 0.330983 valid_1's l2: 0.44778
fold n°3
Training until validation scores don't improve for 800 rounds
[500] training's l2: 0.503936 valid_1's l2: 0.518085
[1000] training's l2: 0.455865 valid_1's l2: 0.481595
[1500] training's l2: 0.430442 valid_1's l2: 0.465355
[2000] training's l2: 0.412943 valid_1's l2: 0.45614
[2500] training's l2: 0.398789 valid_1's l2: 0.450578
[3000] training's l2: 0.386766 valid_1's l2: 0.44719
[3500] training's l2: 0.375948 valid_1's l2: 0.445124
[4000] training's l2: 0.366131 valid_1's l2: 0.443269
[4500] training's l2: 0.357128 valid_1's l2: 0.442377
[5000] training's l2: 0.348574 valid_1's l2: 0.441793
[5500] training's l2: 0.340426 valid_1's l2: 0.44133
[6000] training's l2: 0.332757 valid_1's l2: 0.44066
[6500] training's l2: 0.325354 valid_1's l2: 0.440181
[7000] training's l2: 0.318332 valid_1's l2: 0.439854
[7500] training's l2: 0.311695 valid_1's l2: 0.439868
[8000] training's l2: 0.305354 valid_1's l2: 0.439696
[8500] training's l2: 0.299172 valid_1's l2: 0.43965
[9000] training's l2: 0.293286 valid_1's l2: 0.439621
Early stopping, best iteration is:
[8696] training's l2: 0.296822 valid_1's l2: 0.43951
fold n°4
Training until validation scores don't improve for 800 rounds
[500] training's l2: 0.505016 valid_1's l2: 0.512576
[1000] training's l2: 0.45639 valid_1's l2: 0.477842
[1500] training's l2: 0.42992 valid_1's l2: 0.46591
[2000] training's l2: 0.411808 valid_1's l2: 0.460025
[2500] training's l2: 0.397571 valid_1's l2: 0.456871
[3000] training's l2: 0.385532 valid_1's l2: 0.455048
[3500] training's l2: 0.374799 valid_1's l2: 0.453953
[4000] training's l2: 0.364972 valid_1's l2: 0.453128
[4500] training's l2: 0.355899 valid_1's l2: 0.452261
[5000] training's l2: 0.347329 valid_1's l2: 0.451682
[5500] training's l2: 0.339332 valid_1's l2: 0.451483
[6000] training's l2: 0.331779 valid_1's l2: 0.451406
[6500] training's l2: 0.324391 valid_1's l2: 0.451211
[7000] training's l2: 0.317509 valid_1's l2: 0.450953
[7500] training's l2: 0.310877 valid_1's l2: 0.450869
[8000] training's l2: 0.304364 valid_1's l2: 0.450884
[8500] training's l2: 0.298221 valid_1's l2: 0.450923
[9000] training's l2: 0.292226 valid_1's l2: 0.450984
Early stopping, best iteration is:
[8376] training's l2: 0.299734 valid_1's l2: 0.450741
fold n°5
Training until validation scores don't improve for 800 rounds
[500] training's l2: 0.503602 valid_1's l2: 0.520357
[1000] training's l2: 0.455497 valid_1's l2: 0.485568
[1500] training's l2: 0.429776 valid_1's l2: 0.472075
[2000] training's l2: 0.41173 valid_1's l2: 0.465712
[2500] training's l2: 0.397231 valid_1's l2: 0.461858
[3000] training's l2: 0.384902 valid_1's l2: 0.459726
[3500] training's l2: 0.373874 valid_1's l2: 0.458437
[4000] training's l2: 0.36392 valid_1's l2: 0.457671
[4500] training's l2: 0.354662 valid_1's l2: 0.457373
[5000] training's l2: 0.34601 valid_1's l2: 0.45723
[5500] training's l2: 0.337775 valid_1's l2: 0.456842
[6000] training's l2: 0.330031 valid_1's l2: 0.456806
[6500] training's l2: 0.322637 valid_1's l2: 0.456948
Early stopping, best iteration is:
[5983] training's l2: 0.330287 valid_1's l2: 0.456736
CV score: 0.45317145
XGBoost - xgb_263
xgb_263_params = {'eta': 0.02, # learning rate
              'max_depth': 6,
              'min_child_weight': 3, # minimum sum of instance weights needed in a child node
              'gamma': 0, # minimum loss reduction required to make a further split
              'subsample': 0.7, # fraction of rows sampled for each tree
              'colsample_bytree': 0.3, # fraction of columns (features) sampled for each tree
              'lambda': 2,
              'objective': 'reg:linear',
              'eval_metric': 'rmse',
              'silent': True,
              'nthread': -1}
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
oof_xgb_263 = np.zeros(len(X_train_263))
predictions_xgb_263 = np.zeros(len(X_test_263))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_263, y_train)):
    print("fold n°{}".format(fold_ + 1))
    trn_data = xgb.DMatrix(X_train_263[trn_idx], y_train[trn_idx])
    val_data = xgb.DMatrix(X_train_263[val_idx], y_train[val_idx])
    watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
    xgb_263 = xgb.train(dtrain=trn_data, num_boost_round=3000, evals=watchlist, early_stopping_rounds=600, verbose_eval=500, params=xgb_263_params)
    oof_xgb_263[val_idx] = xgb_263.predict(xgb.DMatrix(X_train_263[val_idx]), ntree_limit=xgb_263.best_ntree_limit)
    predictions_xgb_263 += xgb_263.predict(xgb.DMatrix(X_test_263), ntree_limit=xgb_263.best_ntree_limit) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_xgb_263, target)))
fold n°1
[16:02:16] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/objective/regression_obj.cu:171: reg:linear is now deprecated in favor of reg:squarederror.
[16:02:16] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573:
Parameters: { "silent" } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[0] train-rmse:3.40425 valid_data-rmse:3.38315
[500] train-rmse:0.40233 valid_data-rmse:0.70451
[1000] train-rmse:0.26484 valid_data-rmse:0.70783
[1124] train-rmse:0.23926 valid_data-rmse:0.70855
fold n°2
[16:02:22] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/objective/regression_obj.cu:171: reg:linear is now deprecated in favor of reg:squarederror.
[16:02:22] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573:
Parameters: { "silent" } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[0] train-rmse:3.39812 valid_data-rmse:3.40796
[500] train-rmse:0.40533 valid_data-rmse:0.69418
[1000] train-rmse:0.27276 valid_data-rmse:0.69525
[1234] train-rmse:0.22634 valid_data-rmse:0.69617
fold n°3
[16:02:30] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/objective/regression_obj.cu:171: reg:linear is now deprecated in favor of reg:squarederror.
[16:02:30] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573:
Parameters: { "silent" } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[0] train-rmse:3.40186 valid_data-rmse:3.39314
[500] train-rmse:0.40914 valid_data-rmse:0.66119
[1000] train-rmse:0.27399 valid_data-rmse:0.66309
[1076] train-rmse:0.25858 valid_data-rmse:0.66340
fold n°4
[16:02:36] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/objective/regression_obj.cu:171: reg:linear is now deprecated in favor of reg:squarederror.
[16:02:36] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573:
Parameters: { "silent" } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[0] train-rmse:3.40238 valid_data-rmse:3.39016
[500] train-rmse:0.41132 valid_data-rmse:0.66302
[1000] train-rmse:0.27112 valid_data-rmse:0.66421
[1196] train-rmse:0.23171 valid_data-rmse:0.66502
fold n°5
[16:02:43] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/objective/regression_obj.cu:171: reg:linear is now deprecated in favor of reg:squarederror.
[16:02:43] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573:
Parameters: { "silent" } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[0] train-rmse:3.39341 valid_data-rmse:3.42638
[500] train-rmse:0.41459 valid_data-rmse:0.65110
[1000] train-rmse:0.27624 valid_data-rmse:0.65028
[1434] train-rmse:0.19504 valid_data-rmse:0.65228
CV score: 0.45491694
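The two warnings repeated in the logs above come from newer XGBoost builds: 'reg:linear' has been renamed and the 'silent' key is no longer used. A minimal warning-free version of the same configuration is sketched below; the name xgb_263_params_v2 is mine and the values are unchanged.
# Sketch: the xgb_263 configuration expressed for recent XGBoost versions
xgb_263_params_v2 = {'eta': 0.02,
                     'max_depth': 6,
                     'min_child_weight': 3,
                     'gamma': 0,
                     'subsample': 0.7,
                     'colsample_bytree': 0.3,
                     'lambda': 2,
                     'objective': 'reg:squarederror', # replaces the deprecated 'reg:linear'
                     'eval_metric': 'rmse',
                     'verbosity': 0, # replaces the ignored 'silent' flag
                     'nthread': -1}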
RandomForestRegressor - rfr_263
# RandomForestRegressor
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
oof_rfr_263 = np.zeros(len(X_train_263))
predictions_rfr_263 = np.zeros(len(X_test_263))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_263, y_train)):
    print("fold n°{}".format(fold_ + 1))
    tr_x = X_train_263[trn_idx]
    tr_y = y_train[trn_idx]
    rfr_263 = rfr(n_estimators=1600, max_depth=9, min_samples_leaf=9, min_weight_fraction_leaf=0.0,
                  max_features=0.25, verbose=1, n_jobs=-1)
    # verbose=0: no log output
    # verbose=1: progress messages
    # verbose=2: one log line per iteration
    rfr_263.fit(tr_x, tr_y)
    oof_rfr_263[val_idx] = rfr_263.predict(X_train_263[val_idx])
    predictions_rfr_263 += rfr_263.predict(X_test_263) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_rfr_263, target)))
fold n°1
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 0.1s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 1.0s
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 2.5s
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 4.5s
[Parallel(n_jobs=-1)]: Done 1218 tasks | elapsed: 7.3s
[Parallel(n_jobs=-1)]: Done 1600 out of 1600 | elapsed: 9.6s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1218 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1600 out of 1600 | elapsed: 0.1s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1218 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1600 out of 1600 | elapsed: 0.2s finished
fold n°2
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 0.1s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 1.0s
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 2.5s
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 4.6s
[Parallel(n_jobs=-1)]: Done 1218 tasks | elapsed: 7.4s
[Parallel(n_jobs=-1)]: Done 1600 out of 1600 | elapsed: 9.7s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1218 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1600 out of 1600 | elapsed: 0.1s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1218 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1600 out of 1600 | elapsed: 0.2s finished
fold n°3
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 0.1s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 1.0s
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 2.5s
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 4.6s
[Parallel(n_jobs=-1)]: Done 1218 tasks | elapsed: 7.4s
[Parallel(n_jobs=-1)]: Done 1600 out of 1600 | elapsed: 9.7s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1218 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1600 out of 1600 | elapsed: 0.1s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1218 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1600 out of 1600 | elapsed: 0.2s finished
fold n°4
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 0.1s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 1.0s
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 2.6s
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 4.6s
[Parallel(n_jobs=-1)]: Done 1218 tasks | elapsed: 7.5s
[Parallel(n_jobs=-1)]: Done 1600 out of 1600 | elapsed: 9.8s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1218 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1600 out of 1600 | elapsed: 0.1s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1218 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1600 out of 1600 | elapsed: 0.2s finished
fold n°5
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 0.1s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 1.0s
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 2.5s
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 4.7s
[Parallel(n_jobs=-1)]: Done 1218 tasks | elapsed: 7.6s
[Parallel(n_jobs=-1)]: Done 1600 out of 1600 | elapsed: 10.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1218 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1600 out of 1600 | elapsed: 0.1s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1218 tasks | elapsed: 0.1s
[Parallel(n_jobs=16)]: Done 1600 out of 1600 | elapsed: 0.2s finished
CV score: 0.47865876
GradientBoostingRegressor (GBDT) - gbr_263
# GradientBoostingRegressor
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2018)
oof_gbr_263 = np.zeros(train_shape)
predictions_gbr_263 = np.zeros(len(X_test_263))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_263, y_train)):
    print("fold n°{}".format(fold_ + 1))
    tr_x = X_train_263[trn_idx]
    tr_y = y_train[trn_idx]
    gbr_263 = gbr(n_estimators=400, learning_rate=0.01, subsample=0.65, max_depth=7, min_samples_leaf=20,
                  max_features=0.22, verbose=1)
    gbr_263.fit(tr_x, tr_y)
    oof_gbr_263[val_idx] = gbr_263.predict(X_train_263[val_idx])
    predictions_gbr_263 += gbr_263.predict(X_test_263) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_gbr_263, target)))
fold n°1
Iter Train Loss OOB Improve Remaining Time
1 0.6742 0.0035 20.36s
2 0.6835 0.0029 20.90s
3 0.6623 0.0030 20.38s
4 0.6654 0.0031 20.05s
5 0.6421 0.0031 19.87s
6 0.6612 0.0030 19.67s
7 0.6414 0.0026 19.51s
8 0.6213 0.0032 19.29s
9 0.6220 0.0030 19.23s
10 0.6189 0.0029 19.09s
20 0.5857 0.0023 18.26s
30 0.5654 0.0018 17.81s
40 0.5378 0.0015 17.25s
50 0.5114 0.0011 16.72s
60 0.4956 0.0010 16.20s
70 0.4735 0.0009 15.71s
80 0.4616 0.0008 15.22s
90 0.4451 0.0005 14.73s
100 0.4381 0.0007 14.23s
200 0.3495 0.0001 9.48s
300 0.3024 0.0000 4.75s
400 0.2594 -0.0000 0.00s
fold n°2
Iter Train Loss OOB Improve Remaining Time
1 0.6601 0.0035 18.70s
2 0.6489 0.0034 18.91s
3 0.6520 0.0031 19.39s
4 0.6490 0.0032 19.26s
5 0.6657 0.0032 19.01s
6 0.6445 0.0031 18.86s
7 0.6557 0.0029 18.76s
8 0.6252 0.0031 18.73s
9 0.6250 0.0028 18.65s
10 0.6231 0.0029 18.52s
20 0.5874 0.0026 18.04s
30 0.5683 0.0018 17.55s
40 0.5331 0.0016 17.15s
50 0.5154 0.0013 16.74s
60 0.4974 0.0011 16.30s
70 0.4598 0.0009 15.79s
80 0.4679 0.0007 15.32s
90 0.4426 0.0006 14.82s
100 0.4364 0.0006 14.32s
200 0.3447 0.0001 9.48s
300 0.3114 0.0000 4.76s
400 0.2659 -0.0000 0.00s
fold n°3
Iter Train Loss OOB Improve Remaining Time
1 0.6560 0.0032 17.97s
2 0.6492 0.0032 18.64s
3 0.6368 0.0033 19.22s
4 0.6577 0.0030 19.13s
5 0.6475 0.0033 18.90s
6 0.6436 0.0029 18.87s
7 0.6304 0.0029 18.80s
8 0.6354 0.0028 18.81s
9 0.6259 0.0028 18.77s
10 0.6152 0.0031 18.72s
20 0.5884 0.0022 18.02s
30 0.5548 0.0020 17.50s
40 0.5313 0.0016 17.01s
50 0.5069 0.0013 16.52s
60 0.4869 0.0012 16.01s
70 0.4763 0.0010 15.53s
80 0.4546 0.0009 15.05s
90 0.4415 0.0008 14.57s
100 0.4285 0.0005 14.10s
200 0.3374 0.0001 9.35s
300 0.2855 0.0000 4.67s
400 0.2605 -0.0001 0.00s
fold n°4
Iter Train Loss OOB Improve Remaining Time
1 0.6962 0.0030 19.62s
2 0.6669 0.0035 19.11s
3 0.6505 0.0032 19.12s
4 0.6702 0.0032 19.47s
5 0.6453 0.0033 19.32s
6 0.6574 0.0030 19.14s
7 0.6394 0.0030 19.15s
8 0.6387 0.0032 19.16s
9 0.6342 0.0028 19.08s
10 0.6505 0.0027 18.96s
20 0.5772 0.0023 18.04s
30 0.5693 0.0018 17.43s
40 0.5452 0.0016 16.90s
50 0.5043 0.0013 16.44s
60 0.4812 0.0013 15.93s
70 0.4724 0.0009 15.45s
80 0.4583 0.0008 14.99s
90 0.4345 0.0007 14.53s
100 0.4122 0.0005 14.05s
200 0.3433 0.0001 9.34s
300 0.2942 0.0000 4.67s
400 0.2716 0.0000 0.00s
fold n°5
Iter Train Loss OOB Improve Remaining Time
1 0.6717 0.0033 20.55s
2 0.6576 0.0034 20.20s
3 0.6518 0.0034 20.06s
4 0.6507 0.0028 19.65s
5 0.6388 0.0031 19.60s
6 0.6495 0.0026 19.45s
7 0.6428 0.0027 19.46s
8 0.6456 0.0031 19.36s
9 0.6367 0.0027 19.19s
10 0.6242 0.0028 19.04s
20 0.6137 0.0024 18.29s
30 0.5654 0.0020 17.79s
40 0.5373 0.0017 17.31s
50 0.5158 0.0013 16.80s
60 0.4878 0.0011 16.32s
70 0.4563 0.0011 15.80s
80 0.4510 0.0009 15.33s
90 0.4413 0.0007 14.82s
100 0.4267 0.0006 14.31s
200 0.3464 0.0001 9.56s
300 0.2972 -0.0000 4.83s
400 0.2736 -0.0000 0.00s
CV score: 0.45758664
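One detail worth flagging: gbr_263 uses StratifiedKFold on a regression target. This only works because happiness takes a small set of integer levels, so each level can act as a stratum. A quick sanity check, assuming the variables defined above, might look like:
# Sanity check (sketch): stratified splitting requires a discrete target
print(np.unique(y_train)) # expect a handful of integer happiness levels
print(pd.Series(y_train).value_counts(normalize=True)) # distribution across the strata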
ExtraTreesRegressor (extremely randomized trees) - etr_263
# ExtraTreesRegressor
folds = KFold(n_splits=5, shuffle=True, random_state=13)
oof_etr_263 = np.zeros(train_shape)
predictions_etr_263 = np.zeros(len(X_test_263))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_263, y_train)):
    print("fold n°{}".format(fold_ + 1))
    tr_x = X_train_263[trn_idx]
    tr_y = y_train[trn_idx]
    etr_263 = etr(n_estimators=1000, max_depth=8, min_samples_leaf=12, min_weight_fraction_leaf=0.0,
                  max_features=0.4, verbose=1, n_jobs=-1)
    etr_263.fit(tr_x, tr_y)
    oof_etr_263[val_idx] = etr_263.predict(X_train_263[val_idx])
    predictions_etr_263 += etr_263.predict(X_test_263) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_etr_263, target)))
fold n°1
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 0.1s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 0.9s
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 2.3s
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 4.3s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 5.6s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed: 0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed: 0.1s finished
fold n°2
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 0.1s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 1.0s
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 2.4s
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 4.4s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 5.7s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed: 0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed: 0.1s finished
fold n°3
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 0.1s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 0.9s
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 2.5s
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 4.5s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 5.9s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed: 0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed: 0.1s finished
fold n°4
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 0.1s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 0.9s
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 2.3s
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 4.3s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 5.8s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed: 0.1s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed: 0.1s finished
fold n°5
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done 18 tasks | elapsed: 0.1s
[Parallel(n_jobs=-1)]: Done 168 tasks | elapsed: 1.1s
[Parallel(n_jobs=-1)]: Done 418 tasks | elapsed: 2.6s
[Parallel(n_jobs=-1)]: Done 768 tasks | elapsed: 4.6s
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 5.9s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed: 0.0s finished
[Parallel(n_jobs=16)]: Using backend ThreadingBackend with 16 concurrent workers.
[Parallel(n_jobs=16)]: Done 18 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 168 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 418 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 768 tasks | elapsed: 0.0s
[Parallel(n_jobs=16)]: Done 1000 out of 1000 | elapsed: 0.1s finished
CV score: 0.48579059
At this point we have out-of-fold and test predictions from the five models above (lgb, xgb, gbr, rfr, etr) on the 263-feature set, together with their architectures and parameters. We now stack them: the five prediction columns become the inputs to a Kernel Ridge Regression meta-model, trained with 5-fold cross-validation repeated twice.
train_stack2 = np.vstack([oof_lgb_263, oof_xgb_263, oof_gbr_263, oof_rfr_263, oof_etr_263]).transpose()
# transpose() turns the stacked (n_models, n_samples) array into (n_samples, n_models),
# i.e. one column of out-of-fold predictions per base model
test_stack2 = np.vstack([predictions_lgb_263, predictions_xgb_263, predictions_gbr_263, predictions_rfr_263, predictions_etr_263]).transpose()
# Cross-validation: 5 folds, repeated twice
folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=7)
oof_stack2 = np.zeros(train_stack2.shape[0])
predictions_lr2 = np.zeros(test_stack2.shape[0])
for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack2, target)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack2[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack2[val_idx], target.iloc[val_idx].values
    # Kernel Ridge Regression as the meta-model
    lr2 = kr()
    lr2.fit(trn_data, trn_y)
    oof_stack2[val_idx] = lr2.predict(val_data)
    predictions_lr2 += lr2.predict(test_stack2) / 10  # 10 = 5 folds x 2 repeats
mean_squared_error(target.values, oof_stack2)
fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
0.44930510336227575
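To see what stacking bought us, the snippet below simply lines up the per-model CV scores reported earlier in this section against the stacked score (the numbers are copied from the output above):
# Convenience snippet: compare the 263-feature CV scores from this section
cv_scores_263 = {'lgb_263': 0.45317145,
                 'xgb_263': 0.45491694,
                 'gbr_263': 0.45758664,
                 'rfr_263': 0.47865876,
                 'etr_263': 0.48579059,
                 'stack2 (KernelRidge)': 0.44930510}
for name, mse in sorted(cv_scores_263.items(), key=lambda kv: kv[1]):
    print('{:<22s} {:.8f}'.format(name, mse))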
Apply the same procedure to the 49-feature data
LightGBM - lgb_49
##### lgb_49
lgb_49_param = {
'num_leaves': 9,
'min_data_in_leaf': 23,
'objective':'regression',
'max_depth': -1,
'learning_rate': 0.002,
"boosting": "gbdt",
"feature_fraction": 0.45,
"bagging_freq": 1,
"bagging_fraction": 0.65,
"bagging_seed": 15,
"metric": 'mse',
"lambda_l2": 0.2,
"verbosity": -1}
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=9)
oof_lgb_49 = np.zeros(len(X_train_49))
predictions_lgb_49 = np.zeros(len(X_test_49))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_49, y_train)):
    print("fold n°{}".format(fold_ + 1))
    trn_data = lgb.Dataset(X_train_49[trn_idx], y_train[trn_idx])
    val_data = lgb.Dataset(X_train_49[val_idx], y_train[val_idx])
    num_round = 12000
    lgb_49 = lgb.train(lgb_49_param, trn_data, num_round, valid_sets=[trn_data, val_data], verbose_eval=1000, early_stopping_rounds=1000)
    oof_lgb_49[val_idx] = lgb_49.predict(X_train_49[val_idx], num_iteration=lgb_49.best_iteration)
    predictions_lgb_49 += lgb_49.predict(X_test_49, num_iteration=lgb_49.best_iteration) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_lgb_49, target)))
fold n°1
Training until validation scores don't improve for 1000 rounds
[1000] training's l2: 0.472114 valid_1's l2: 0.494341
[2000] training's l2: 0.432433 valid_1's l2: 0.474187
[3000] training's l2: 0.410225 valid_1's l2: 0.468771
[4000] training's l2: 0.39252 valid_1's l2: 0.46653
[5000] training's l2: 0.37695 valid_1's l2: 0.465936
[6000] training's l2: 0.362991 valid_1's l2: 0.466085
Early stopping, best iteration is:
[5373] training's l2: 0.371564 valid_1's l2: 0.465725
fold n°2
Training until validation scores don't improve for 1000 rounds
[1000] training's l2: 0.471663 valid_1's l2: 0.494653
[2000] training's l2: 0.431201 valid_1's l2: 0.475828
[3000] training's l2: 0.408572 valid_1's l2: 0.471512
[4000] training's l2: 0.390545 valid_1's l2: 0.470385
[5000] training's l2: 0.375087 valid_1's l2: 0.470372
Early stopping, best iteration is:
[4456] training's l2: 0.383242 valid_1's l2: 0.470227
fold n°3
Training until validation scores don't improve for 1000 rounds
[1000] training's l2: 0.474605 valid_1's l2: 0.488829
[2000] training's l2: 0.43408 valid_1's l2: 0.465845
[3000] training's l2: 0.411324 valid_1's l2: 0.461469
[4000] training's l2: 0.393581 valid_1's l2: 0.461059
Early stopping, best iteration is:
[3492] training's l2: 0.402097 valid_1's l2: 0.46063
fold n°4
Training until validation scores don't improve for 1000 rounds
[1000] training's l2: 0.467855 valid_1's l2: 0.509236
[2000] training's l2: 0.428353 valid_1's l2: 0.492249
[3000] training's l2: 0.406439 valid_1's l2: 0.487728
[4000] training's l2: 0.389146 valid_1's l2: 0.484684
[5000] training's l2: 0.374305 valid_1's l2: 0.482782
[6000] training's l2: 0.361051 valid_1's l2: 0.48114
[7000] training's l2: 0.34888 valid_1's l2: 0.480224
[8000] training's l2: 0.337402 valid_1's l2: 0.479305
[9000] training's l2: 0.326659 valid_1's l2: 0.478466
[10000] training's l2: 0.31654 valid_1's l2: 0.478088
[11000] training's l2: 0.307041 valid_1's l2: 0.477755
[12000] training's l2: 0.297939 valid_1's l2: 0.477363
Did not meet early stopping. Best iteration is:
[12000] training's l2: 0.297939 valid_1's l2: 0.477363
fold n°5
Training until validation scores don't improve for 1000 rounds
[1000] training's l2: 0.46925 valid_1's l2: 0.504587
[2000] training's l2: 0.429297 valid_1's l2: 0.487974
[3000] training's l2: 0.407038 valid_1's l2: 0.484025
[4000] training's l2: 0.389374 valid_1's l2: 0.482396
[5000] training's l2: 0.374063 valid_1's l2: 0.481538
[6000] training's l2: 0.360357 valid_1's l2: 0.48135
Early stopping, best iteration is:
[5811] training's l2: 0.362852 valid_1's l2: 0.481231
CV score: 0.47103334
XGBoost - xgb_49
##### xgb_49
xgb_49_params = {'eta': 0.02,
'max_depth': 5,
'min_child_weight':3,
'gamma':0,
'subsample': 0.7,
'colsample_bytree': 0.35,
'lambda':2,
'objective': 'reg:linear',
'eval_metric': 'rmse',
'silent': True,
'nthread': -1}
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
oof_xgb_49 = np.zeros(len(X_train_49))
predictions_xgb_49 = np.zeros(len(X_test_49))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_49, y_train)):
    print("fold n°{}".format(fold_ + 1))
    trn_data = xgb.DMatrix(X_train_49[trn_idx], y_train[trn_idx])
    val_data = xgb.DMatrix(X_train_49[val_idx], y_train[val_idx])
    watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
    xgb_49 = xgb.train(dtrain=trn_data, num_boost_round=3000, evals=watchlist, early_stopping_rounds=600, verbose_eval=500, params=xgb_49_params)
    oof_xgb_49[val_idx] = xgb_49.predict(xgb.DMatrix(X_train_49[val_idx]), ntree_limit=xgb_49.best_ntree_limit)
    predictions_xgb_49 += xgb_49.predict(xgb.DMatrix(X_test_49), ntree_limit=xgb_49.best_ntree_limit) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_xgb_49, target)))
fold n°1
[16:19:20] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/objective/regression_obj.cu:171: reg:linear is now deprecated in favor of reg:squarederror.
[16:19:20] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573:
Parameters: { "silent" } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[0] train-rmse:3.40423 valid_data-rmse:3.38324
[500] train-rmse:0.52685 valid_data-rmse:0.71752
[1000] train-rmse:0.43482 valid_data-rmse:0.72012
[1178] train-rmse:0.40613 valid_data-rmse:0.72151
fold n°2
[16:19:23] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/objective/regression_obj.cu:171: reg:linear is now deprecated in favor of reg:squarederror.
[16:19:23] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573:
Parameters: { "silent" } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[0] train-rmse:3.39824 valid_data-rmse:3.40770
[500] train-rmse:0.52936 valid_data-rmse:0.70724
[1000] train-rmse:0.43861 valid_data-rmse:0.71032
[1177] train-rmse:0.41100 valid_data-rmse:0.71104
fold n°3
[16:19:26] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/objective/regression_obj.cu:171: reg:linear is now deprecated in favor of reg:squarederror.
[16:19:26] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573:
Parameters: { "silent" } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[0] train-rmse:3.40192 valid_data-rmse:3.39293
[500] train-rmse:0.53754 valid_data-rmse:0.66622
[1000] train-rmse:0.44423 valid_data-rmse:0.66997
[1044] train-rmse:0.43758 valid_data-rmse:0.67012
fold n°4
[16:19:29] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/objective/regression_obj.cu:171: reg:linear is now deprecated in favor of reg:squarederror.
[16:19:29] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573:
Parameters: { "silent" } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[0] train-rmse:3.40241 valid_data-rmse:3.38993
[500] train-rmse:0.53187 valid_data-rmse:0.68099
[1000] train-rmse:0.43976 valid_data-rmse:0.68315
[1062] train-rmse:0.42959 valid_data-rmse:0.68353
fold n°5
[16:19:32] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/objective/regression_obj.cu:171: reg:linear is now deprecated in favor of reg:squarederror.
[16:19:32] WARNING: C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/learner.cc:573:
Parameters: { "silent" } might not be used.
This may not be accurate due to some parameters are only used in language bindings but
passed down to XGBoost core. Or some parameters are not used but slip through this
verification. Please open an issue if you find above cases.
[0] train-rmse:3.39341 valid_data-rmse:3.42627
[500] train-rmse:0.53662 valid_data-rmse:0.66396
[1000] train-rmse:0.44475 valid_data-rmse:0.66454
[1153] train-rmse:0.42076 valid_data-rmse:0.66554
CV score: 0.47182704
GradientBoostingRegressor (GBDT) - gbr_49
# GradientBoostingRegressor
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2018)
oof_gbr_49 = np.zeros(train_shape)
predictions_gbr_49 = np.zeros(len(X_test_49))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_49, y_train)):
    print("fold n°{}".format(fold_ + 1))
    tr_x = X_train_49[trn_idx]
    tr_y = y_train[trn_idx]
    gbr_49 = gbr(n_estimators=600, learning_rate=0.01, subsample=0.65, max_depth=6, min_samples_leaf=20,
                 max_features=0.35, verbose=1)
    gbr_49.fit(tr_x, tr_y)
    oof_gbr_49[val_idx] = gbr_49.predict(X_train_49[val_idx])
    predictions_gbr_49 += gbr_49.predict(X_test_49) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_gbr_49, target)))
fold n°1
Iter Train Loss OOB Improve Remaining Time
1 0.6835 0.0031 11.99s
2 0.6553 0.0033 11.37s
3 0.6443 0.0029 11.75s
4 0.6608 0.0031 11.62s
5 0.6371 0.0033 11.67s
6 0.6441 0.0033 11.59s
7 0.6474 0.0029 11.70s
8 0.6159 0.0031 11.70s
9 0.6203 0.0027 11.69s
10 0.6468 0.0025 11.69s
20 0.6090 0.0023 11.31s
30 0.5858 0.0015 11.08s
40 0.5589 0.0015 10.90s
50 0.5280 0.0013 10.66s
60 0.5216 0.0011 10.44s
70 0.4894 0.0010 10.23s
80 0.4765 0.0007 10.03s
90 0.4760 0.0005 9.82s
100 0.4446 0.0006 9.62s
200 0.4025 0.0001 7.65s
300 0.3765 -0.0000 5.74s
400 0.3547 -0.0000 3.84s
500 0.3325 -0.0001 1.92s
600 0.3069 -0.0001 0.00s
fold n°2
Iter Train Loss OOB Improve Remaining Time
1 0.6757 0.0031 12.57s
2 0.6649 0.0033 12.26s
3 0.6457 0.0033 11.84s
4 0.6419 0.0032 11.85s
5 0.6584 0.0028 11.76s
6 0.6459 0.0031 11.83s
7 0.6548 0.0028 11.91s
8 0.6608 0.0027 11.88s
9 0.6245 0.0031 11.86s
10 0.6298 0.0028 11.77s
20 0.6049 0.0025 11.40s
30 0.5686 0.0020 11.16s
40 0.5642 0.0015 11.04s
50 0.5418 0.0013 10.80s
60 0.5213 0.0012 10.57s
70 0.5004 0.0009 10.32s
80 0.4770 0.0007 10.11s
90 0.4710 0.0006 9.92s
100 0.4434 0.0005 9.72s
200 0.4057 0.0001 7.76s
300 0.3678 0.0000 5.79s
400 0.3588 0.0000 3.87s
500 0.3302 -0.0001 1.93s
600 0.3020 -0.0001 0.00s
fold n°3
Iter Train Loss OOB Improve Remaining Time
1 0.6798 0.0030 11.38s
2 0.6737 0.0034 11.36s
3 0.6779 0.0028 11.54s
4 0.6306 0.0033 11.33s
5 0.6520 0.0031 11.43s
6 0.6499 0.0031 11.39s
7 0.6621 0.0028 11.44s
8 0.6497 0.0027 11.47s
9 0.6350 0.0031 11.49s
10 0.6232 0.0029 11.51s
20 0.5975 0.0024 11.23s
30 0.5694 0.0021 11.04s
40 0.5471 0.0016 10.84s
50 0.5253 0.0013 10.66s
60 0.5141 0.0012 10.45s
70 0.4925 0.0009 10.24s
80 0.4836 0.0007 10.03s
90 0.4556 0.0005 9.85s
100 0.4552 0.0004 9.65s
200 0.3957 -0.0000 7.68s
300 0.3631 -0.0000 5.74s
400 0.3340 0.0000 3.84s
500 0.3278 -0.0001 1.93s
600 0.3104 -0.0000 0.00s
fold n°4
Iter Train Loss OOB Improve Remaining Time
1 0.6743 0.0031 12.58s
2 0.6413 0.0036 12.26s
3 0.6702 0.0034 12.14s
4 0.6694 0.0032 12.00s
5 0.6509 0.0032 12.08s
6 0.6573 0.0033 11.93s
7 0.6402 0.0031 12.03s
8 0.6412 0.0030 11.99s
9 0.6351 0.0029 12.02s
10 0.6208 0.0031 11.98s
20 0.5897 0.0024 11.46s
30 0.5781 0.0021 11.20s
40 0.5572 0.0017 10.99s
50 0.5265 0.0014 10.75s
60 0.5238 0.0010 10.58s
70 0.4994 0.0010 10.39s
80 0.4896 0.0009 10.18s
90 0.4868 0.0006 9.97s
100 0.4639 0.0007 9.80s
200 0.3917 0.0000 7.78s
300 0.3726 -0.0000 5.82s
400 0.3448 -0.0001 3.88s
500 0.3199 -0.0001 1.94s
600 0.3066 -0.0000 0.00s
fold n°5
Iter Train Loss OOB Improve Remaining Time
1 0.6654 0.0032 11.38s
2 0.6548 0.0032 11.45s
3 0.6568 0.0031 11.54s
4 0.6561 0.0028 11.48s
5 0.6612 0.0032 11.55s
6 0.6512 0.0031 11.54s
7 0.6521 0.0030 11.57s
8 0.6546 0.0026 11.51s
9 0.6487 0.0026 11.59s
10 0.6375 0.0029 11.66s
20 0.5960 0.0022 11.33s
30 0.5722 0.0020 11.10s
40 0.5527 0.0017 10.88s
50 0.5345 0.0013 10.69s
60 0.5075 0.0012 10.46s
70 0.4937 0.0009 10.24s
80 0.4862 0.0007 10.03s
90 0.4704 0.0006 9.85s
100 0.4542 0.0006 9.65s
200 0.4056 -0.0000 7.69s
300 0.3705 -0.0000 5.76s
400 0.3419 0.0000 3.84s
500 0.3259 -0.0000 1.92s
600 0.3182 -0.0001 0.00s
CV score: 0.47226045
At this point we have the three models' predictions on the 49-feature set (lgb, xgb, gbr), together with their architectures and parameters. As before, their out-of-fold predictions are stacked with a Kernel Ridge Regression meta-model, trained with 5-fold cross-validation repeated twice.
train_stack3 = np.vstack([oof_lgb_49, oof_xgb_49, oof_gbr_49]).transpose()
test_stack3 = np.vstack([predictions_lgb_49, predictions_xgb_49, predictions_gbr_49]).transpose()
# Cross-validation: 5 folds, repeated twice
folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=7)
oof_stack3 = np.zeros(train_stack3.shape[0])
predictions_lr3 = np.zeros(test_stack3.shape[0])
for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack3, target)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack3[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack3[val_idx], target.iloc[val_idx].values
    # Kernel Ridge Regression as the meta-model
    lr3 = kr()
    lr3.fit(trn_data, trn_y)
    oof_stack3[val_idx] = lr3.predict(val_data)
    predictions_lr3 += lr3.predict(test_stack3) / 10  # 10 = 5 folds x 2 repeats
mean_squared_error(target.values, oof_stack3)
fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
0.46902141359403127
The 383-feature data
Kernel Ridge Regression - kr_383
folds = KFold(n_splits=5, shuffle=True, random_state=13)
oof_kr_383 = np.zeros(train_shape)
predictions_kr_383 = np.zeros(len(X_test_383))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_383, y_train)):
    print("fold n°{}".format(fold_ + 1))
    tr_x = X_train_383[trn_idx]
    tr_y = y_train[trn_idx]
    # Kernel Ridge Regression (sklearn KernelRidge, default linear kernel)
    kr_383 = kr()
    kr_383.fit(tr_x, tr_y)
    oof_kr_383[val_idx] = kr_383.predict(X_train_383[val_idx])
    predictions_kr_383 += kr_383.predict(X_test_383) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_kr_383, target)))
fold n°1
fold n°2
fold n°3
fold n°4
fold n°5
CV score: 0.52246300
Plain Ridge regression - ridge_383
folds = KFold(n_splits=5, shuffle=True, random_state=13)
oof_ridge_383 = np.zeros(train_shape)
predictions_ridge_383 = np.zeros(len(X_test_383))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_383, y_train)):
    print("fold n°{}".format(fold_ + 1))
    tr_x = X_train_383[trn_idx]
    tr_y = y_train[trn_idx]
    # Ridge regression with a strong L2 penalty
    ridge_383 = Ridge(alpha=1200)
    ridge_383.fit(tr_x, tr_y)
    oof_ridge_383[val_idx] = ridge_383.predict(X_train_383[val_idx])
    predictions_ridge_383 += ridge_383.predict(X_test_383) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_ridge_383, target)))
fold n°1
fold n°2
fold n°3
fold n°4
fold n°5
CV score: 0.48758654
ElasticNet - en_383
folds = KFold(n_splits=5, shuffle=True, random_state=13)
oof_en_383 = np.zeros(train_shape)
predictions_en_383 = np.zeros(len(X_test_383))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_383, y_train)):
    print("fold n°{}".format(fold_ + 1))
    tr_x = X_train_383[trn_idx]
    tr_y = y_train[trn_idx]
    # ElasticNet (mixed L1/L2 penalty)
    en_383 = en(alpha=1.0, l1_ratio=0.06)
    en_383.fit(tr_x, tr_y)
    oof_en_383[val_idx] = en_383.predict(X_train_383[val_idx])
    predictions_en_383 += en_383.predict(X_test_383) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_en_383, target)))
fold n°1
fold n°2
fold n°3
fold n°4
fold n°5
CV score: 0.53650465
BayesianRidge - br_383
folds = KFold(n_splits=5, shuffle=True, random_state=13)
oof_br_383 = np.zeros(train_shape)
predictions_br_383 = np.zeros(len(X_test_383))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_383, y_train)):
    print("fold n°{}".format(fold_ + 1))
    tr_x = X_train_383[trn_idx]
    tr_y = y_train[trn_idx]
    # BayesianRidge
    br_383 = br()
    br_383.fit(tr_x, tr_y)
    oof_br_383[val_idx] = br_383.predict(X_train_383[val_idx])
    predictions_br_383 += br_383.predict(X_test_383) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_br_383, target)))
fold n°1
fold n°2
fold n°3
fold n°4
fold n°5
CV score: 0.48801112
At this point we have the four linear models' predictions on the 383-feature set (br, kr, en, ridge), together with their architectures and parameters. Their out-of-fold predictions are stacked with plain LinearRegression as the meta-model, again using 5-fold cross-validation repeated twice.
train_stack1 = np.vstack([oof_br_383, oof_kr_383, oof_en_383, oof_ridge_383]).transpose()
test_stack1 = np.vstack([predictions_br_383, predictions_kr_383, predictions_en_383, predictions_ridge_383]).transpose()
folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=7)
oof_stack1 = np.zeros(train_stack1.shape[0])
predictions_lr1 = np.zeros(test_stack1.shape[0])
for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack1, target)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack1[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack1[val_idx], target.iloc[val_idx].values
    # Plain LinearRegression as the meta-model
    lr1 = lr()
    lr1.fit(trn_data, trn_y)
    oof_stack1[val_idx] = lr1.predict(val_data)
    predictions_lr1 += lr1.predict(test_stack1) / 10  # 10 = 5 folds x 2 repeats
mean_squared_error(target.values, oof_stack1)
fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
0.48933964532334556
Build the same linear models on the 49-feature data
KernelRidge - kr_49
folds = KFold(n_splits=5, shuffle=True, random_state=13)
oof_kr_49 = np.zeros(train_shape)
predictions_kr_49 = np.zeros(len(X_test_49))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_49, y_train)):
    print("fold n°{}".format(fold_ + 1))
    tr_x = X_train_49[trn_idx]
    tr_y = y_train[trn_idx]
    kr_49 = kr()
    kr_49.fit(tr_x, tr_y)
    oof_kr_49[val_idx] = kr_49.predict(X_train_49[val_idx])
    predictions_kr_49 += kr_49.predict(X_test_49) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_kr_49, target)))
fold n°1
fold n°2
fold n°3
fold n°4
fold n°5
CV score: 0.50450139
Ridge - ridge_49
folds = KFold(n_splits=5, shuffle=True, random_state=13)
oof_ridge_49 = np.zeros(train_shape)
predictions_ridge_49 = np.zeros(len(X_test_49))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_49, y_train)):
    print("fold n°{}".format(fold_ + 1))
    tr_x = X_train_49[trn_idx]
    tr_y = y_train[trn_idx]
    ridge_49 = Ridge(alpha=6)
    ridge_49.fit(tr_x, tr_y)
    oof_ridge_49[val_idx] = ridge_49.predict(X_train_49[val_idx])
    predictions_ridge_49 += ridge_49.predict(X_test_49) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_ridge_49, target)))
fold n°1
fold n°2
fold n°3
fold n°4
fold n°5
CV score: 0.49587680
BayesianRidge - br_49
folds = KFold(n_splits=5, shuffle=True, random_state=13)
oof_br_49 = np.zeros(train_shape)
predictions_br_49 = np.zeros(len(X_test_49))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_49, y_train)):
    print("fold n°{}".format(fold_ + 1))
    tr_x = X_train_49[trn_idx]
    tr_y = y_train[trn_idx]
    br_49 = br()
    br_49.fit(tr_x, tr_y)
    oof_br_49[val_idx] = br_49.predict(X_train_49[val_idx])
    predictions_br_49 += br_49.predict(X_test_49) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_br_49, target)))
fold n°1
fold n°2
fold n°3
fold n°4
fold n°5
CV score: 0.49684965
ElasticNet - en_49
folds = KFold(n_splits=5, shuffle=True, random_state=13)
oof_en_49 = np.zeros(train_shape)
predictions_en_49 = np.zeros(len(X_test_49))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_49, y_train)):
    print("fold n°{}".format(fold_ + 1))
    tr_x = X_train_49[trn_idx]
    tr_y = y_train[trn_idx]
    en_49 = en(alpha=1.0, l1_ratio=0.05)
    en_49.fit(tr_x, tr_y)
    oof_en_49[val_idx] = en_49.predict(X_train_49[val_idx])
    predictions_en_49 += en_49.predict(X_test_49) / folds.n_splits
print("CV score: {:<8.8f}".format(mean_squared_error(oof_en_49, target)))
fold n°1
fold n°2
fold n°3
fold n°4
fold n°5
CV score: 0.53979354
We now also have the four linear models' predictions on the 49-feature set; their out-of-fold predictions are stacked with plain LinearRegression in the same way, using 5-fold cross-validation repeated twice.
train_stack4 = np.vstack([oof_br_49, oof_kr_49, oof_en_49, oof_ridge_49]).transpose()
test_stack4 = np.vstack([predictions_br_49, predictions_kr_49, predictions_en_49, predictions_ridge_49]).transpose()
folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=7)
oof_stack4 = np.zeros(train_stack4.shape[0])
predictions_lr4 = np.zeros(test_stack4.shape[0])
for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack4, target)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack4[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack4[val_idx], target.iloc[val_idx].values
    # LinearRegression as the meta-model
    lr4 = lr()
    lr4.fit(trn_data, trn_y)
    oof_stack4[val_idx] = lr4.predict(val_data)
    predictions_lr4 += lr4.predict(test_stack4) / 10  # 10 = 5 folds x 2 repeats
mean_squared_error(target.values, oof_stack4)
fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
0.4962282660242021
Model fusion
Here we take a weighted sum of the four stacked models' predictions to obtain a final result; of course, hand-picking the weights like this is rather crude.
# Weighted blend of the four stacked OOF predictions, for comparison with the stacked result below
mean_squared_error(target.values, 0.7*(0.6*oof_stack2 + 0.4*oof_stack3)+0.3*(0.55*oof_stack1+0.45*oof_stack4))
0.45414739861837033
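The weights above (0.7/0.3 across the two groups, 0.6/0.4 and 0.55/0.45 inside them) are hand-picked. A hedged alternative, not part of the original notebook, is to fit the blend weights on the out-of-fold predictions with a constrained optimizer; the sketch below assumes scipy is available.
# Sketch: learn blend weights over the four stacked OOF predictions
from scipy.optimize import minimize
oof_mat = np.vstack([oof_stack1, oof_stack2, oof_stack3, oof_stack4]).transpose()
def blend_mse(w):
    return mean_squared_error(target.values, oof_mat.dot(w))
res = minimize(blend_mse,
               x0=np.full(4, 0.25), # start from equal weights
               bounds=[(0, 1)] * 4, # keep each weight in [0, 1]
               constraints={'type': 'eq', 'fun': lambda w: w.sum() - 1}) # weights sum to 1
print(res.x, res.fun)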
A better approach is to train one more stacking layer on top of the four stacked models; here we simply use plain LinearRegression as the final meta-learner.
train_stack5 = np.vstack([oof_stack1, oof_stack2, oof_stack3, oof_stack4]).transpose()
test_stack5 = np.vstack([predictions_lr1, predictions_lr2, predictions_lr3, predictions_lr4]).transpose()
folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=7)
oof_stack5 = np.zeros(train_stack5.shape[0])
predictions_lr5 = np.zeros(test_stack5.shape[0])
for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack5, target)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack5[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack5[val_idx], target.iloc[val_idx].values
    # LinearRegression as the final meta-learner
    lr5 = lr()
    lr5.fit(trn_data, trn_y)
    oof_stack5[val_idx] = lr5.predict(val_data)
    predictions_lr5 += lr5.predict(test_stack5) / 10  # 10 = 5 folds x 2 repeats
mean_squared_error(target.values, oof_stack5)
fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
0.4493605489329375
This final score improves on most of the MSEs obtained above.
Saving the results
First read the sample-submission file to get the id column.
submit_example = pd.read_csv('submit_example.csv',sep=',',encoding='latin-1')
submit_example['happiness'] = predictions_lr5
submit_example.happiness.describe()
count 2968.00000
mean 3.88106
std 0.46003
min 1.62602
25% 3.66777
50% 3.95277
75% 4.18658
max 5.00527
Name: happiness, dtype: float64
Now we save the results. The predicted values are continuous in roughly the 1-5 range, while the ground truth is integer-valued, so as a final refinement we snap predictions lying very close to an integer before writing them to a csv file.
submit_example.loc[submit_example['happiness']>4.96,'happiness']= 5
submit_example.loc[submit_example['happiness']<=1.04,'happiness']= 1
submit_example.loc[(submit_example['happiness']>1.96)&(submit_example['happiness']<2.04),'happiness']= 2
submit_example.to_csv("submission.csv", index=False)
submit_example.happiness.describe()
count 2968.00000
mean 3.88107
std 0.46005
min 1.62602
25% 3.66777
50% 3.95277
75% 4.18658
max 5.00000
Name: happiness, dtype: float64
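The three rules above only snap values near 1, 2 and 5. A hedged generalization, keeping the same 0.04 tolerance and applied before writing the csv, would snap any prediction within 0.04 of an integer:
# Sketch: snap every prediction within 0.04 of the nearest integer
h = submit_example['happiness'].values
nearest = np.round(h)
submit_example['happiness'] = np.where(np.abs(h - nearest) <= 0.04, nearest, h)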
The above focuses mainly on reproducing the code; further feature engineering and model tuning are left for further exploration.