Project Introduction
I. Purpose: Happiness is an old and profound topic, something humanity has pursued for generations. The factors tied to happiness number in the thousands and vary from person to person: matters as large as national welfare or as small as a roadside baked sweet potato can all affect it. Among these tangled factors, can we find what they share and glimpse the essence of happiness? In this project we tune several models to improve prediction accuracy on that question.
II. Development environment: Python 3.6
III. Code
Competition address: https://tianchi.aliyun.com/competition/entrance/231702/rankingList
1. Import libraries
import pandas as pd
import numpy as np
from datetime import datetime
from scipy import sparse
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold, RepeatedKFold
from sklearn.preprocessing import OneHotEncoder
import lightgbm as lgb   # three boosting algorithms are used: lightgbm, xgboost and catboost
import xgboost as xgb
from catboost import Pool, CatBoostRegressor

# show all columns and all rows when printing DataFrames
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
2. Import data
# NOTE: both paths are truncated in the source; point them at the local train/test CSV files
train = pd.read_csv(r"C:\英雄时刻\Python\天池,赛...")
test = pd.read_csv(r"C:\英雄时刻\Python\天池,赛...")
data = pd.concat([train, test], axis=0, ignore_index=True)
data.info(verbose=True, null_counts=True)
y_train_ = train["happiness"]
y_train_.value_counts()
# -8 marks an invalid answer; map it to the middle class 3, then shift the labels from 1-5 to 0-4
y_train_ = y_train_.map(lambda x: 3 if x == -8 else x)
y_train_ = y_train_.map(lambda x: x - 1)
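A small sanity check, added here as an illustration (assuming the raw labels are 1-5 plus the -8 code), confirms the remapped labels now lie in 0-4:
# added check: all remapped labels should fall in 0..4
assert y_train_.between(0, 4).all()
print(y_train_.value_counts().sort_index())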
3. Process the time features
data['survey_time'] = pd.to_datetime(data['survey_time'],format='%Y-%m-%d %H:%M:%S')
data["weekday"]=data["survey_time"].dt.weekday
data["year"]=data["survey_time"].dt.year
data["quarter"]=data["survey_time"].dt.quarter
data["hour"]=data["survey_time"].dt.hour
data["month"]=data["survey_time"].dt.month
# bucket the hour of day into time-of-day segments
def hour_cut(x):
    if 0 <= x < 6:
        return 0
    elif 6 <= x < 8:
        return 1
    elif 8 <= x < 12:
        return 2
    elif 12 <= x < 14:
        return 3
    elif 14 <= x < 18:
        return 4
    elif 18 <= x < 21:
        return 5
    elif 21 <= x < 24:
        return 6

data["hour_cut"] = data["hour"].map(hour_cut)
data["survey_age"]=data["year"]-data["birth"]
data["happiness"]=data["happiness"].map(lambda x:x-1)
4. Handle missing values and fill data
# drop three columns: edu_other (mostly missing), the label happiness, and the raw survey_time (already expanded above)
data = data.drop(["edu_other"], axis=1)
data = data.drop(["happiness"], axis=1)
data = data.drop(["survey_time"], axis=1)
# join_party: treat missing as "not a member" (0), otherwise 1
data["join_party"] = data["join_party"].map(lambda x: 0 if pd.isnull(x) else 1)
# decade of birth
def birth_split(x):
    if 1920 <= x <= 1930:
        return 0
    elif 1930 < x <= 1940:
        return 1
    elif 1940 < x <= 1950:
        return 2
    elif 1950 < x <= 1960:
        return 3
    elif 1960 < x <= 1970:
        return 4
    elif 1970 < x <= 1980:
        return 5
    elif 1980 < x <= 1990:
        return 6
    elif 1990 < x <= 2000:
        return 7

data["birth_s"] = data["birth"].map(birth_split)
# income buckets (boundaries made gap-free so edge values such as 1200 and 24000 are always assigned)
def income_cut(x):
    if x < 0:
        return 0
    elif x < 1200:
        return 1
    elif x <= 10000:
        return 2
    elif x < 24000:
        return 3
    elif x < 40000:
        return 4
    else:
        return 5

data["income_cut"] = data["income"].map(income_cut)
# fill missing values with default or sentinel codes
data["edu_status"] = data["edu_status"].fillna(5)
data["edu_yr"] = data["edu_yr"].fillna(-2)
data["property_other"] = data["property_other"].map(lambda x: 0 if pd.isnull(x) else 1)
data["hukou_loc"] = data["hukou_loc"].fillna(1)
data["social_neighbor"] = data["social_neighbor"].fillna(8)
data["social_friend"] = data["social_friend"].fillna(8)
data["work_status"] = data["work_status"].fillna(0)
data["work_yr"] = data["work_yr"].fillna(0)
data["work_type"] = data["work_type"].fillna(0)
data["work_manage"] = data["work_manage"].fillna(0)
data["family_income"] = data["family_income"].fillna(-2)
data["invest_other"] = data["invest_other"].map(lambda x: 0 if pd.isnull(x) else 1)
# fill missing spouse-, marriage- and child-related fields with 0
data["minor_child"]=data["minor_child"].fillna(0)
data["marital_1st"]=data["marital_1st"].fillna(0)
data["s_birth"]=data["s_birth"].fillna(0)
data["marital_now"]=data["marital_now"].fillna(0)
data["s_edu"]=data["s_edu"].fillna(0)
data["s_political"]=data["s_political"].fillna(0)
data["s_hukou"]=data["s_hukou"].fillna(0)
data["s_income"]=data["s_income"].fillna(0)
data["s_work_exper"]=data["s_work_exper"].fillna(0)
data["s_work_status"]=data["s_work_status"].fillna(0)
data["s_work_type"]=data["s_work_type"].fillna(0)
data['income_cut'].fillna(0,inplace=True)
data=data.drop(["id"], axis=1)
X_train_ = data[:train.shape[0]]
X_test_ = data[train.shape[0]:]
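A quick added shape check (illustrative, not in the original) confirms the split lines up with the raw files:
# added sanity check on the train/test split
assert X_train_.shape[0] == train.shape[0]
assert X_test_.shape[0] == test.shape[0]
print(X_train_.shape, X_test_.shape)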
5. Modeling
X_train = np.array(X_train_)   # note: it is best to convert the data to np.array format
y_train = np.array(y_train_)
X_test = np.array(X_test_)
gg = data.iloc[1].index.tolist()   # list of feature names, kept for inspection
# custom evaluation function for xgboost: reports MSE on the evaluation set
def myFeval(preds, xgbtrain):
    label = xgbtrain.get_label()
    score = mean_squared_error(label, preds)
    return 'myFeval', score
5.1 Modeling with xgboost
xgb_params = {'booster': 'gbtree', 'eta': 0.005, 'max_depth': 5, 'subsample': 0.7,
              'colsample_bytree': 0.8, 'objective': 'reg:linear', 'eval_metric': 'rmse',
              'silent': True, 'nthread': 8}
folds = KFold(n_splits=5, shuffle=True, random_state=2018)
oof_xgb = np.zeros(len(train))
predictions_xgb = np.zeros(len(test))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_ + 1))
    trn_data = xgb.DMatrix(X_train[trn_idx], y_train[trn_idx])
    val_data = xgb.DMatrix(X_train[val_idx], y_train[val_idx])
    watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
    clf = xgb.train(dtrain=trn_data, num_boost_round=20000, evals=watchlist,
                    early_stopping_rounds=200, verbose_eval=100, params=xgb_params, feval=myFeval)
    oof_xgb[val_idx] = clf.predict(xgb.DMatrix(X_train[val_idx]), ntree_limit=clf.best_ntree_limit)
    # accumulate the fold's test predictions (+=, so the five folds are averaged)
    predictions_xgb += clf.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit) / folds.n_splits

print("CV score: {:<8.8f}".format(mean_squared_error(oof_xgb, y_train_)))
5.2 Modeling with lightgbm
param = {'boosting_type': 'gbdt',
         'num_leaves': 20,
         'min_data_in_leaf': 20,
         'objective': 'regression',
         'max_depth': 6,
         'learning_rate': 0.01,
         'min_child_samples': 30,
         'feature_fraction': 0.8,
         'bagging_freq': 1,
         'bagging_fraction': 0.8,
         'bagging_seed': 11,
         'metric': 'mse',
         'lambda_l1': 0.1,
         'verbosity': -1}
folds = KFold(n_splits=5, shuffle=True, random_state=2018)
oof_lgb = np.zeros(len(X_train))
predictions_lgb = np.zeros(len(X_test))
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_ + 1))
    trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
    val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])
    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data],
                    verbose_eval=200, early_stopping_rounds=100)
    oof_lgb[val_idx] = clf.predict(X_train[val_idx], num_iteration=clf.best_iteration)
    predictions_lgb += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.8f}".format(mean_squared_error(oof_lgb, y_train_)))
5.3 Modeling with catboost
kfolder = KFold(n_splits=5, shuffle=True, random_state=2019)
oof_cb = np.zeros(len(X_train))
predictions_cb = np.zeros(len(X_test))
cb_params = {
    'n_estimators': 100000,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'learning_rate': 0.05,
    'depth': 5,
    'use_best_model': True,
    'subsample': 0.6,
    'bootstrap_type': 'Bernoulli',
    'reg_lambda': 3
}
for fold_, (train_index, vali_index) in enumerate(kfolder.split(X_train, y_train)):
    print("fold n°{}".format(fold_ + 1))
    k_x_train, k_y_train = X_train[train_index], y_train[train_index]
    k_x_vali, k_y_vali = X_train[vali_index], y_train[vali_index]
    model_cb = CatBoostRegressor(**cb_params)
    # train the model with early stopping on the validation fold
    model_cb.fit(k_x_train, k_y_train, eval_set=[(k_x_vali, k_y_vali)],
                 verbose=100, early_stopping_rounds=50)
    oof_cb[vali_index] = model_cb.predict(k_x_vali, ntree_end=model_cb.best_iteration_)
    predictions_cb += model_cb.predict(X_test, ntree_end=model_cb.best_iteration_) / kfolder.n_splits

print("CV score: {:<8.8f}".format(mean_squared_error(oof_cb, y_train_)))
5.4 Improving accuracy by combining the models
VI. Summary
1. Model selection goes a long way toward a reasonably good score, but past a certain point further model tuning and feature tweaking yield little improvement. What can still move the score substantially is feature creation: using a deep understanding of the underlying problem to find the variables that genuinely drive the target. The top score in this competition has already reached 0.00000, which clearly cannot be achieved by changing the model alone.
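To make the point about feature creation concrete, here is a small hedged sketch; the specific features (personal share of family income, age at first marriage, income gap versus same-education peers) are hypothetical illustrations, not features from the original solution, and they assume the raw column semantics of the competition data (income, family_income, marital_1st, birth, edu).
# hypothetical created features, for illustration only
data["income_family_ratio"] = data["income"] / (data["family_income"] + 1e-6)   # personal share of family income
data["marital_age_1st"] = data["marital_1st"] - data["birth"]   # age at first marriage, assuming marital_1st is a year
data["edu_income_gap"] = data["income"] - data.groupby("edu")["income"].transform("mean")   # income vs. same-education peers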