Happiness is an ancient and profound topic, something humanity has chased for generations. The factors tied to it number in the thousands and vary from person to person: things as large as national welfare or as small as a roast sweet potato from a street stall can all move the needle. Among these tangled factors, can we find common threads and catch a glimpse of what happiness really comes down to?
This post walks through predicting happiness scores with XGBoost.
Environment: Python 3.7, Jupyter Notebook, Anaconda 3.
import numpy as np
import pandas as pd
Import pandas and numpy.
test=pd.read_csv('happiness_train_complete.csv',encoding='gbk')
test.describe()
Load the data and look at its summary statistics. (Note that, somewhat confusingly, the training set is stored in a variable named test.)
You can see there are 8,000 rows, with 135 feature variables and 1 target variable.
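Before the encoding step below, a quick check (not part of the original walkthrough) shows which columns are still stored as strings and therefore need converting:
# Columns still stored as strings; everything else is already numeric.
print(test.select_dtypes(include='object').columns.tolist())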
class_mapping = {label:idx for idx,label in enumerate(set(test['edu_other']))}
test['edu_other'] = test['edu_other'].map(class_mapping)
class_mapping = {label:idx for idx,label in enumerate(set(test['property_other']))}
test['property_other'] = test['property_other'].map(class_mapping)
class_mapping = {label:idx for idx,label in enumerate(set(test['invest_other']))}
test['invest_other'] = test['invest_other'].map(class_mapping)
Since the raw data contains string-valued columns, use map to replace each distinct string with an integer index.
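One caveat: iterating over a Python set is not deterministic across runs (string hashing is randomized), so the mappings above can change from session to session. A sketch of a reproducible alternative using pd.factorize, which could replace the three mapping blocks above:
# Deterministic alternative: integer codes assigned in order of first appearance.
for col in ['edu_other', 'property_other', 'invest_other']:
    test[col] = pd.factorize(test[col])[0]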
test['survey_time']=pd.to_datetime(test['survey_time'])
test['survey_time']= test['survey_time'].dt.year
test.head()
In the raw data survey_time is in year/month/day form; reduce it to just the year.
test['age']=test['survey_time']-test['birth']
test['edu_age']=test['survey_time']-test['edu_yr']
test.loc[test['edu_age']>100, 'edu_age'] = np.nan
test['edu_age'].head()
test['marital_1st_age']=test['survey_time']-test['marital_1st']
test.loc[test['marital_1st_age']>100, 'marital_1st_age'] = np.nan
test['marital_1st_age'].head()
test['marital_now_age']=test['survey_time']-test['marital_now']
test.loc[test['marital_now_age']>100, 'marital_now_age'] = np.nan
test['marital_now_age'].head()
test['f_age']=test['survey_time']-test['f_birth']
test.loc[test['f_age']>100, 'f_age'] = np.nan
test['f_age'].head()
test['m_age']=test['survey_time']-test['m_birth']
test.loc[test['m_age']>100, 'm_age'] = np.nan
test['m_age'].head()
The raw data only provides birth years, so we compute ages as survey_time minus birth. Because these are questionnaire responses, people who declined to answer are coded as -1 or -2, and the subtraction then produces an absurdly large value: for example, 2015 − (−1) = 2016. Any computed age above 100 is therefore replaced with NaN.
test=test.drop('edu_yr',axis=1)
test=test.drop('marital_1st',axis=1)
test=test.drop('marital_now',axis=1)
test=test.drop('f_birth',axis=1)
test=test.drop('m_birth',axis=1)
test=test.drop('birth',axis=1)
Drop the raw year columns, since the ages have now been computed from them.
X=test.drop(['happiness','survey_time'],axis=1)
y=test['happiness']
Set up X and y. Both columns have to be dropped in a single call: two separate test.drop(...) statements would each start again from test, so the second would silently undo the first.
from sklearn.preprocessing import StandardScaler
for column in X.columns:
X[column] = StandardScaler().fit_transform(X[column].values.reshape(-1, 1))
Standardize the feature columns. (Tree-based models like XGBoost don't strictly need feature scaling, but it does no harm here.)
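The per-column loop above can also be collapsed into a single call; an equivalent one-liner (recent scikit-learn versions of StandardScaler ignore NaNs when fitting and pass them through unchanged):
# One-call equivalent of the loop: scale every column at once.
X.loc[:, X.columns] = StandardScaler().fit_transform(X)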
import xgboost as xgb
import pickle
import sys
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from scipy.sparse import csr_matrix, hstack
from sklearn.model_selection import KFold, train_test_split
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
Import XGBoost, GridSearchCV, and the other usual machine-learning packages.
dtrain = xgb.DMatrix(X, y)
xgb_params = {
    'seed': 0,                  # fix the random seed for reproducibility
    'eta': 0.1,                 # learning rate
    'colsample_bytree': 0.5,    # fraction of features sampled per tree
    'silent': 0,                # 0 = print training messages
    'subsample': 0.5,           # fraction of rows sampled per tree
    'objective': 'reg:linear',  # squared-error regression
    'max_depth': 5,             # maximum tree depth
    'min_child_weight': 3       # minimum sum of instance weight in a child
}
%%time
def xg_eval_mae(yhat, dtrain):
    # Custom eval metric: labels and predictions are both exponentiated,
    # so the reported MAE is on an exp scale rather than in raw happiness units.
    Y = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(Y), np.exp(yhat))
bst_cv1 = xgb.cv(xgb_params, dtrain, num_boost_round=50, nfold=3, seed=0,
                 feval=xg_eval_mae, maximize=False, early_stopping_rounds=10)
print('CV score:', bst_cv1.iloc[-1,:]['test-mae-mean'])
Fit X and y with XGBoost's built-in cross-validation; the CV score comes out as follows:
CV score: 27.177993999999998
ax = plt.figure().gca()
bst_cv1[['train-mae-mean', 'test-mae-mean']].plot(ax=ax)
Plot the training and test curves to check for overfitting.
You can see that once the number of trees passes 20 or so, a small amount of overfitting appears.
%%time
bst_cv2 = xgb.cv(xgb_params, dtrain, num_boost_round=100,
nfold=3, seed=0, feval=xg_eval_mae, maximize=False,
early_stopping_rounds=10)
print ('CV score:', bst_cv2.iloc[-1,:]['test-mae-mean'])
Run the cross-validation again with the number of boosting rounds raised to 100.
CV score: 27.40789266666667
fig, (ax1, ax2) = plt.subplots(1,2)
fig.set_size_inches(16,4)
ax1.set_title('100 rounds of training')
ax1.set_xlabel('Rounds')
ax1.set_ylabel('Loss')
ax1.grid(True)
ax1.plot(bst_cv2[['train-mae-mean', 'test-mae-mean']])
ax1.legend(['Training Loss', 'Test Loss'])
ax2.set_title('60 last rounds of training')
ax2.set_xlabel('Rounds')
ax2.set_ylabel('Loss')
ax2.grid(True)
ax2.plot(bst_cv2.iloc[40:][['train-mae-mean', 'test-mae-mean']])
ax2.legend(['Training Loss', 'Test Loss'])
Plot the overfitting behaviour over the full run and over the last 60 rounds.
class XGBoostRegressor(object):
    """A thin sklearn-style wrapper around xgb.train, so it can be used with GridSearchCV."""
    def __init__(self, **kwargs):
        self.params = kwargs
        # Default to 50 boosting rounds if none is supplied.
        self.num_boost_round = self.params.get('num_boost_round', 50)
        self.params.update({'silent': 1, 'objective': 'reg:linear', 'seed': 0})

    def fit(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, y_train)
        self.bst = xgb.train(params=self.params, dtrain=dtrain,
                             num_boost_round=self.num_boost_round,
                             feval=xg_eval_mae, maximize=False)
        return self

    def predict(self, x_pred):
        dpred = xgb.DMatrix(x_pred)
        return self.bst.predict(dpred)

    def kfold(self, x_train, y_train, nfold=5):
        dtrain = xgb.DMatrix(x_train, y_train)
        cv_rounds = xgb.cv(params=self.params, dtrain=dtrain,
                           num_boost_round=self.num_boost_round, nfold=nfold,
                           feval=xg_eval_mae, maximize=False, early_stopping_rounds=10)
        # Return the scores of the final round.
        return cv_rounds.iloc[-1,:]

    def plot_feature_importances(self):
        feat_imp = pd.Series(self.bst.get_fscore()).sort_values(ascending=False)
        feat_imp.plot(title='Feature Importances')
        plt.ylabel('Feature Importance Score')

    # get_params/set_params make the wrapper compatible with sklearn's GridSearchCV.
    def get_params(self, deep=True):
        return self.params

    def set_params(self, **params):
        self.params.update(params)
        return self
Declare the XGBoostRegressor wrapper class: an sklearn-compatible shell around xgb.train, so that it can be plugged into GridSearchCV.
def mae_score(y_true, y_pred):
    return mean_absolute_error(np.exp(y_true), np.exp(y_pred))
mae_scorer = make_scorer(mae_score, greater_is_better=False)
Define a scoring function. Since greater_is_better=False, make_scorer negates the MAE, which is why the grid-search scores below come out negative.
bst = XGBoostRegressor(eta=0.1, colsample_bytree=0.5, subsample=0.5,
max_depth=5, min_child_weight=3, num_boost_round=50)
bst.kfold(X, y, nfold=5)
Run 5-fold cross-validation with the parameters above and look at the scores:
train-mae-mean 24.681992
train-mae-std 0.214132
train-rmse-mean 0.661918
train-rmse-std 0.010127
test-mae-mean 27.068168
test-mae-std 0.746192
test-rmse-mean 0.807352
test-rmse-std 0.058963
Name: 49, dtype: float64
xgb_param_grid = {'max_depth': list(range(4,9)), 'min_child_weight': list((1,3,6))}
grid = GridSearchCV(XGBoostRegressor(eta=0.1, num_boost_round=50, colsample_bytree=0.5, subsample=0.5),
param_grid=xgb_param_grid, cv=5, scoring=mae_scorer)
grid.fit(X, y)
Grid-search over max_depth and min_child_weight and fit the data.
grid.best_params_, grid.best_score_
best_params_ returns the best parameter combination, and best_score_ its score (negative because of the scorer's sign flip):
({'max_depth': 4, 'min_child_weight': 6}, -27.189371616740114)
def convert_grid_scores(cv_results):
    # Unpack GridSearchCV results into one column per parameter plus the mean CV score.
    _params = [list(p.values()) for p in cv_results['params']]
    _params_mae = cv_results['mean_test_score']
    grid_res = np.column_stack((_params, _params_mae))
    return [grid_res[:, i] for i in range(grid_res.shape[1])]
scores = grid.cv_results_['mean_test_score']
A helper that unpacks the grid-search results into parameter columns plus the mean cross-validated score for each setting.
xgb_param_grid = {'gamma':[ 0.1 * i for i in range(0,5)]}
grid = GridSearchCV(XGBoostRegressor(eta=0.1, num_boost_round=50, max_depth=4, min_child_weight=6,
colsample_bytree=0.5, subsample=0.5),
param_grid=xgb_param_grid, cv=5, scoring=mae_scorer)
grid.fit(X, y)
grid.best_params_, grid.best_score_
Tune gamma; the best value and its score:
({'gamma': 0.2}, -27.17756873995668)
xgb_param_grid = {'subsample':[ 0.1 * i for i in range(6,9)],
'colsample_bytree':[ 0.1 * i for i in range(6,9)]}
grid = GridSearchCV(XGBoostRegressor(eta=0.1, gamma=0.2, num_boost_round=50, max_depth=4, min_child_weight=6),
param_grid=xgb_param_grid, cv=5, scoring=mae_scorer)
grid.fit(X, y)
grid.best_params_, grid.best_score_
Tune subsample and colsample_bytree.
({'colsample_bytree': 0.7000000000000001, 'subsample': 0.6000000000000001},
-26.995863945755023)
xgb_param_grid = {'eta':[0.5,0.4,0.3,0.2,0.1,0.075,0.05,0.04,0.03]}
grid = GridSearchCV(XGBoostRegressor(num_boost_round=50, gamma=0.2, max_depth=4, min_child_weight=6,
colsample_bytree=0.7, subsample=0.6),
param_grid=xgb_param_grid, cv=5, scoring=mae_scorer)
grid.fit(X,y)
grid.best_params_, grid.best_score_
Tune the learning rate.
Result: ({'eta': 0.1}, -26.995863945755023)
xgb_param_grid = {'eta':[x for x in np.linspace(0,0.2,20)]}
grid = GridSearchCV(XGBoostRegressor(num_boost_round=100, gamma=0.2, max_depth=4, min_child_weight=6,
colsample_bytree=0.7, subsample=0.6),
param_grid=xgb_param_grid, cv=5, scoring=mae_scorer)
grid.fit(X, y)
grid.best_params_, grid.best_score_
Tune the learning rate again, this time raising the number of boosting rounds to 100.
Result: ({'eta': 0.05263157894736842}, -26.892865169616442)
xgb_param_grid = {'eta':[x for x in np.linspace(0,0.2,20)]}
grid = GridSearchCV(XGBoostRegressor(num_boost_round=200, gamma=0.2, max_depth=4, min_child_weight=6,
colsample_bytree=0.7, subsample=0.6),
param_grid=xgb_param_grid, cv=5, scoring=mae_scorer)
grid.fit(X, y)
grid.best_params_, grid.best_score_
Raising it further to 200 rounds, overfitting sets in:
({'eta': 0.031578947368421054}, -26.912105302160462)
The score gets worse instead of better.
bst = XGBoostRegressor(num_boost_round=100, eta=0.1, gamma=0.2, max_depth=4, min_child_weight=1,
colsample_bytree=0.7, subsample=0.6)
cv = bst.kfold(X,y, nfold=5)
Cross-validate the 100-round configuration.
cv
train-mae-mean 24.903263
train-mae-std 0.220076
train-rmse-mean 0.653812
train-rmse-std 0.009592
test-mae-mean 26.819689
test-mae-std 0.666679
test-rmse-mean 0.799807
test-rmse-std 0.060250
Name: 199, dtype: float64
That wraps up the model-training part.
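Incidentally, pickle is imported at the top but never used. If you want to keep the tuned model around for later reuse, a minimal sketch (the filename is arbitrary):
# Fit on the full training data, then persist the wrapper to disk and reload it.
bst.fit(X, y)
with open('xgb_happiness.pkl', 'wb') as f:
    pickle.dump(bst, f)
with open('xgb_happiness.pkl', 'rb') as f:
    bst = pickle.load(f)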
data=pd.read_csv('happiness_test_complete.csv',encoding='gbk')
Read in the test-set data and apply the same preprocessing to it (with one caveat about the encodings, noted below).
class_mapping = {label:idx for idx,label in enumerate(set(data['edu_other']))}
data['edu_other'] = data['edu_other'].map(class_mapping)
class_mapping = {label:idx for idx,label in enumerate(set(data['property_other']))}
data['property_other'] = data['property_other'].map(class_mapping)
class_mapping = {label:idx for idx,label in enumerate(set(data['invest_other']))}
data['invest_other'] = data['invest_other'].map(class_mapping)
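Note that these mappings are rebuilt from the test file's own category sets, so a given string may be assigned a different integer than it was in the training data. A hypothetical sketch of the more consistent approach, assuming the training mappings had been kept around in a dict (train_mappings is not defined anywhere in this walkthrough):
# Hypothetical: reuse the encodings fitted on the training set so codes line up.
# train_mappings = {'edu_other': {...}, 'property_other': {...}, 'invest_other': {...}}
# for col, mapping in train_mappings.items():
#     data[col] = data[col].map(mapping)   # unseen categories become NaN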
data['survey_time']=pd.to_datetime(data['survey_time'])
data['survey_time']= data['survey_time'].dt.year
data.head()
data['age']=data['survey_time']-data['birth']
data['edu_age']=data['survey_time']-data['edu_yr']
data.loc[data['edu_age']>100, 'edu_age'] = np.nan
data['marital_1st_age']=data['survey_time']-data['marital_1st']
data.loc[data['marital_1st_age']>100, 'marital_1st_age'] = np.nan
data['marital_now_age']=data['survey_time']-data['marital_now']
data.loc[data['marital_now_age']>100, 'marital_now_age'] = np.nan
data['f_age']=data['survey_time']-data['f_birth']
data.loc[data['f_age']>100, 'f_age'] = np.nan
data['m_age']=data['survey_time']-data['m_birth']
data.loc[data['m_age']>100, 'm_age'] = np.nan
data['m_age'].head()
data['edu_age'].head()
data['marital_1st_age'].head()
data['marital_now_age'].head()
data['f_age'].head()
data.head()
data=data.drop('edu_yr',axis=1)
data=data.drop('marital_1st',axis=1)
data=data.drop('marital_now',axis=1)
data=data.drop('f_birth',axis=1)
data=data.drop('m_birth',axis=1)
data=data.drop('survey_time',axis=1)
data=data.drop('birth',axis=1)
Same as for the training set.
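One step the training side had that is missing here: the training features were standardized, while data would go into the model unscaled. A minimal sketch mirroring the earlier loop (strictly, the scalers fitted on the training columns should be reused here; refitting on the test set is an approximation):
# Standardize the test features the same way the training features were.
# (Ideally reuse the scalers fitted on X; refitting on the test set is an approximation.)
for column in data.columns:
    data[column] = StandardScaler().fit_transform(data[column].values.reshape(-1, 1))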
best_model=grid.best_estimator_
pred_y=best_model.predict(data)
Take the best estimator from the grid search and predict on the test set. (Note that grid at this point is the last, 200-round search, even though the 100-round setting scored slightly better.)
data1=pd.read_csv('happiness_test_complete.csv',encoding='gbk')
df=pd.DataFrame({'id': data1['id'].values, 'happiness': pred_y})
df.to_csv('happiness_submit10.csv', index=False)
Pair each id with its predicted happiness value and save them as a CSV submission file.
That's the whole pipeline for predicting happiness scores with XGBoost. As you can see, this is the tenth submission file I've saved, which gives some idea of how many rounds of tuning went into it. All in all the result is decent; combining XGBoost with a neural network would probably do even better.