1. Overview of Key Learning Points
1.1 Introduction
Entry page for the Alibaba Tianchi happiness prediction competition ("挖掘幸福感"): https://tianchi.aliyun.com/competition/entrance/231702/introduction
1.2 Learning Goals
Build an end-to-end baseline for the competition: explore the data, do simple feature engineering, train several gradient-boosting models, and fuse them by stacking.
1.3 Code Workflow
1. Data exploration
2. Feature engineering
3. Modeling and prediction
4. Model tuning and fusion
2. Learning Content
1. Data Exploration
With 140 survey attributes, this baseline only does a minimal exploration before moving on to processing.
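A quick first look at the raw training file can confirm the two facts the feature engineering below relies on: the target is a 1-5 scale with -8 as an invalid-answer code, and many feature columns use negative codes for non-answers. This is a minimal sketch reusing the same input path and GBK encoding as the code below; the column count in the comment follows the author's note of 140 attributes.

import pandas as pd

train_data = pd.read_csv('./input/happiness_train_complete.csv', encoding='gbk')
print(train_data.shape)  # expect roughly 140 columns

# Target distribution: happiness is a 1-5 scale, with -8 marking an invalid answer.
print(train_data['happiness'].value_counts())

# How many cells per column use a negative code for a non-answer?
numeric = train_data.select_dtypes(include='number')
print((numeric < 0).sum().sort_values(ascending=False).head(20))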
2. Feature Engineering
import numpy as np
import pandas as pd

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# The "complete" data files are GBK-encoded.
train_data = pd.read_csv('./input/happiness_train_complete.csv', encoding='gbk')
test_data = pd.read_csv('./input/happiness_test_complete.csv', encoding='gbk')

# -8 in the label means an invalid answer; map it to 3, the midpoint of the
# 1-5 happiness scale, so those rows can still be used for training.
y_train = train_data['happiness']
y_train = y_train.apply(lambda x: 3 if x == -8 else x)

# Concatenate train and test so both get identical preprocessing.
train = train_data.drop(columns=['happiness'])
data = pd.concat([train, test_data], axis=0)

# Drop the id, the survey timestamp, and the free-text "other" columns.
drop_cols = ['survey_time', 'edu_other', 'property_other', 'invest_other', 'id']
data.drop(drop_cols, axis=1, inplace=True)

# Remaining negative codes all mean some flavor of non-answer:
# turn them into NaN, then impute each column with its rounded median.
def replace_with_nan(x):
    if x < 0:
        return float('NaN')
    else:
        return x

data = data.applymap(replace_with_nan)
data = data.fillna(round(data.median(), 0))

# Split back into train/test; .copy() avoids a SettingWithCopyWarning
# when the label column is reattached.
train = data[:train.shape[0]].copy()
train['happiness'] = y_train
test = data[train.shape[0]:]

train.to_csv('./output/train_data.csv', index=False)
test.to_csv('./output/test_data.csv', index=False)
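Two deliberately crude baseline choices are worth flagging. Mapping the -8 label code to the scale midpoint keeps those rows in the training set instead of discarding them, and median imputation of all negative feature codes is the simplest treatment that lets every remaining column be fed to tree models without per-feature work. Both are natural first targets for later refinement.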
3. Modeling and Prediction
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings('ignore')

train_data = pd.read_csv('./output/train_data.csv')
test_data = pd.read_csv('./output/test_data.csv')
X_train = train_data.drop(['happiness'], axis=1).values
y_train = train_data['happiness'].values
X_test = test_data.values
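All three training functions below follow the same cross-validation template: a shuffled 10-fold split, out-of-fold (OOF) predictions collected fold by fold so that every training sample is predicted by a model that never saw it, and test-set predictions averaged over the folds. The OOF vectors give an honest CV score and later serve as the input features for stacking.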
def train_by_cv_xgb(X_train, y_train, X_test, n_splits):
    import xgboost as xgb
    xgb_params = {
        'booster': 'gbtree',
        'eta': 0.005,
        'max_depth': 7,
        'subsample': 0.7,
        'colsample_bytree': 0.8,
        # Squared-error objective ('reg:linear' is deprecated in recent XGBoost).
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'nthread': 8
    }
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=2021)
    oof_xgb = np.zeros(X_train.shape[0])
    predictions_xgb = np.zeros(X_test.shape[0])
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        trn_data = xgb.DMatrix(X_train[trn_idx], y_train[trn_idx])
        val_data = xgb.DMatrix(X_train[val_idx], y_train[val_idx])
        watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
        clf = xgb.train(dtrain=trn_data, num_boost_round=20000, evals=watchlist,
                        early_stopping_rounds=200, verbose_eval=100, params=xgb_params)
        # iteration_range replaces the deprecated ntree_limit (XGBoost >= 1.4).
        best = (0, clf.best_iteration + 1)
        oof_xgb[val_idx] = clf.predict(xgb.DMatrix(X_train[val_idx]), iteration_range=best)
        predictions_xgb += clf.predict(xgb.DMatrix(X_test), iteration_range=best) / folds.n_splits
    print("XGB CV score: {:<8.8f}".format(mean_squared_error(y_train, oof_xgb)))
    return oof_xgb, predictions_xgb
def train_by_cv_lgb(X_train, y_train, X_test, n_splits):
    import lightgbm as lgb
    param = {
        'boosting_type': 'gbdt',
        'num_leaves': 31,
        # min_child_samples is an alias of min_data_in_leaf, so only one is set.
        'min_data_in_leaf': 20,
        'objective': 'regression',
        'max_depth': 7,
        'learning_rate': 0.005,
        'feature_fraction': 0.8,
        'bagging_freq': 1,
        'bagging_fraction': 0.8,
        'bagging_seed': 11,
        'metric': 'mse',
        'lambda_l1': 0.1,
        'verbosity': -1
    }
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=2021)
    oof_lgb = np.zeros(X_train.shape[0])
    predictions_lgb = np.zeros(X_test.shape[0])
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
        val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])
        num_round = 20000
        # LightGBM >= 4.0 takes logging/early stopping as callbacks rather than
        # the old verbose_eval / early_stopping_rounds keyword arguments.
        clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data],
                        callbacks=[lgb.log_evaluation(200), lgb.early_stopping(100)])
        oof_lgb[val_idx] = clf.predict(X_train[val_idx], num_iteration=clf.best_iteration)
        predictions_lgb += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits
    print("LGB CV score: {:<8.8f}".format(mean_squared_error(y_train, oof_lgb)))
    return oof_lgb, predictions_lgb
def train_by_cv_cb(X_train, y_train, X_test, n_splits):
    from catboost import CatBoostRegressor
    folds = KFold(n_splits=n_splits, shuffle=True, random_state=2021)
    oof_cb = np.zeros(X_train.shape[0])
    predictions_cb = np.zeros(X_test.shape[0])
    cb_params = {
        'n_estimators': 20000,
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'learning_rate': 0.005,
        'depth': 7,
        'use_best_model': True,
        'subsample': 0.6,
        'bootstrap_type': 'Bernoulli',
        'reg_lambda': 3
    }
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        trn_X, trn_y = X_train[trn_idx], y_train[trn_idx]
        val_X, val_y = X_train[val_idx], y_train[val_idx]
        model_cb = CatBoostRegressor(**cb_params)
        model_cb.fit(trn_X, trn_y, eval_set=[(val_X, val_y)], verbose=100, early_stopping_rounds=50)
        oof_cb[val_idx] = model_cb.predict(val_X, ntree_end=model_cb.best_iteration_)
        predictions_cb += model_cb.predict(X_test, ntree_end=model_cb.best_iteration_) / folds.n_splits
    print("CB CV score: {:<8.8f}".format(mean_squared_error(y_train, oof_cb)))
    return oof_cb, predictions_cb
oof_lgb, predictions_lgb = train_by_cv_lgb(X_train, y_train, X_test, 10)
oof_xgb, predictions_xgb = train_by_cv_xgb(X_train, y_train, X_test, 10)
oof_cb, predictions_cb = train_by_cv_cb(X_train, y_train, X_test, 10)
4. Model Tuning and Fusion
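Fusion here is a two-level stack: the three OOF vectors form a 3-column meta-feature matrix, a BayesianRidge model is trained on it under repeated K-fold cross-validation, and its test-set predictions are averaged over all n_splits × n_repeats fits. A regularized linear model is a sensible meta-learner here, since it only has to weight three strongly correlated inputs.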
def train_by_cv_xgb_lgb_cb_stack(oof_lgb, oof_xgb, oof_cb,
                                 predictions_lgb, predictions_xgb, predictions_cb,
                                 y_train, n_splits, n_repeats):
    from sklearn import linear_model
    from sklearn.model_selection import RepeatedKFold
    # Each base model's OOF predictions become one column of the meta-features.
    train_stack = np.vstack([oof_lgb, oof_xgb, oof_cb]).transpose()
    test_stack = np.vstack([predictions_lgb, predictions_xgb, predictions_cb]).transpose()
    folds_stack = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=2021)
    oof_stack = np.zeros(train_stack.shape[0])
    predictions = np.zeros(test_stack.shape[0])
    for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, y_train)):
        print("fold {}".format(fold_ + 1))
        trn_data, trn_y = train_stack[trn_idx], y_train[trn_idx]
        clf_3 = linear_model.BayesianRidge()
        clf_3.fit(trn_data, trn_y)
        oof_stack[val_idx] = clf_3.predict(train_stack[val_idx])
        # Average the meta-model's test predictions over all n_splits * n_repeats fits.
        predictions += clf_3.predict(test_stack) / (n_splits * n_repeats)
    print("CV score: {:<8.8f}".format(mean_squared_error(y_train, oof_stack)))
    return oof_stack, predictions

oof_stack, predictions = train_by_cv_xgb_lgb_cb_stack(
    oof_lgb, oof_xgb, oof_cb,
    predictions_lgb, predictions_xgb, predictions_cb,
    y_train, n_splits=10, n_repeats=2)
submit_data = pd.read_csv('./input/happiness_submit.csv')
submit_data['happiness'] = predictions
submit_data.to_csv('./result/res.csv', index=False)
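The stacked outputs are continuous values rather than integer labels; since the competition scores submissions by mean squared error, they can be submitted as-is. One small optional refinement, my own suggestion rather than part of the original pipeline, is to clamp predictions to the valid 1-5 range before writing the file:

submit_data['happiness'] = np.clip(predictions, 1, 5)
submit_data.to_csv('./result/res.csv', index=False)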
3. Problems Encountered and Solutions
1. Don't panic when handed a new dataset. I panicked the moment I saw the 140 attributes and had no idea how to analyze them; once I calmed down, I decided to ship a baseline first and keep the processing as simple as possible.
2. I am still very much a beginner: I have been doing machine learning for less than half a year and don't know data analysis well, so deeper feature work is something I still need to learn. I would appreciate anyone pointing out problems in my approach.
4. Summary
1. On first contact the data was overwhelming, so I only did a simple exploratory analysis and the simplest possible processing.
2. Quickly building a baseline and then fusing the models worked reasonably well.
3. The next step is to analyze each feature individually, construct new features, and keep exploring through trial and error.