Happiness is an ancient and profound topic, something humanity has chased for generations. The factors tied to it number in the thousands and vary from person to person: things as large as national welfare or as small as a roast sweet potato from a street stall can all move the needle. Among these tangled factors, can we find common threads and catch a glimpse of what happiness really comes down to?
This post walks through predicting happiness scores with XGBoost.
Environment: Python 3.7, Jupyter Notebook, Anaconda 3.
import numpy as np
import pandas as pd
Import pandas and numpy.
test=pd.read_csv('happiness_train_complete.csv',encoding='gbk')
test.describe()
Load the data and look at its summary statistics. (Note that, somewhat confusingly, the training set is stored in a variable named test.)
You can see there are 8,000 rows, with 135 feature variables and 1 target variable.
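Before the encoding step below, a quick check (not part of the original walkthrough) shows which columns are still stored as strings and therefore need converting:
# Columns still stored as strings; everything else is already numeric.
print(test.select_dtypes(include='object').columns.tolist())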
class_mapping = {label:idx for idx,label in enumerate(set(test['edu_other']))}
test['edu_other'] = test['edu_other'].map(class_mapping)
class_mapping = {label:idx for idx,label in enumerate(set(test['property_other']))}
test['property_other'] = test['property_other'].map(class_mapping)
class_mapping = {label:idx for idx,label in enumerate(set(test['invest_other']))}
test['invest_other'] = test['invest_other'].map(class_mapping)
Since the raw data contains string-valued columns, use map to replace each distinct string with an integer index.
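One caveat: iterating over a Python set is not deterministic across runs (string hashing is randomized), so the mappings above can change from session to session. A sketch of a reproducible alternative using pd.factorize, which could replace the three mapping blocks above:
# Deterministic alternative: integer codes assigned in order of first appearance.
for col in ['edu_other', 'property_other', 'invest_other']:
    test[col] = pd.factorize(test[col])[0]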
test['survey_time']=pd.to_datetime(test['survey_time'])
test['survey_time']= test['survey_time'].dt.year
test.head()
In the raw data survey_time is in year/month/day form; reduce it to just the year.
test['age']=test['survey_time']-test['birth']
test['edu_age']=test['survey_time']-test['edu_yr']
test.loc[test['edu_age']>100, 'edu_age'] = np.nan
test['edu_age'].head()
test['marital_1st_age']=test['survey_time']-test['marital_1st']
test.loc[test['marital_1st_age']>100, 'marital_1st_age'] = np.nan
test['marital_1st_age'].head()
test['marital_now_age']=test['survey_time']-test['marital_now']
test.loc[test['marital_now_age']>100, 'marital_now_age'] = np.nan
test['marital_now_age'].head()
test['f_age']=test['survey_time']-test['f_birth']
test.loc[test['f_age']>100, 'f_age'] = np.nan
test['f_age'].head()
test['m_age']=test['survey_time']-test['m_birth']
test.loc[test['m_age']>100, 'm_age'] = np.nan
test['m_age'].head()
The raw data only provides birth years, so we compute ages as survey_time minus birth. Because these are questionnaire responses, people who declined to answer are coded as -1 or -2, and the subtraction then produces an absurdly large value: for example, 2015 − (−1) = 2016. Any computed age above 100 is therefore replaced with NaN.
test=test.drop('edu_yr',axis=1)
test=test.drop('marital_1st',axis=1)
test=test.drop('marital_now',axis=1)
test=test.drop('f_birth',axis=1)
test=test.drop('m_birth',axis=1)
test=test.drop('birth',axis=1)
Drop the raw year columns, since the ages have now been computed from them.
X=test.drop(['happiness','survey_time'],axis=1)
y=test['happiness']
Set up X and y. Both columns have to be dropped in a single call: two separate test.drop(...) statements would each start again from test, so the second would silently undo the first.
from sklearn.preprocessing import StandardScaler
for column in X.columns:
X[column] = StandardScaler().fit_transform(X[column].values.reshape(-1, 1))
Standardize the feature columns. (Tree-based models like XGBoost don't strictly need feature scaling, but it does no harm here.)
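The per-column loop above can also be collapsed into a single call; an equivalent one-liner (recent scikit-learn versions of StandardScaler ignore NaNs when fitting and pass them through unchanged):
# One-call equivalent of the loop: scale every column at once.
X.loc[:, X.columns] = StandardScaler().fit_transform(X)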
import xgboost as xgb
import pickle
import sys
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from scipy.sparse import csr_matrix, hstack
from sklearn.model_selection import KFold, train_test_split
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
Import XGBoost, GridSearchCV, and the other usual machine-learning packages.
dtrain = xgb.DMatrix(X, y)
xgb_params = {
    'seed': 0,                  # fix the random seed for reproducibility
    'eta': 0.1,                 # learning rate
    'colsample_bytree': 0.5,    # fraction of features sampled per tree
    'silent': 0,                # 0 = print training messages
    'subsample': 0.5,           # fraction of rows sampled per tree
    'objective': 'reg:linear',  # squared-error regression
    'max_depth': 5,             # maximum tree depth
    'min_child_weight': 3       # minimum sum of instance weight in a child
}
%%time
def xg_eval_mae(yhat, dtrain):
    # Custom eval metric: labels and predictions are both exponentiated,
    # so the reported MAE is on an exp scale rather than in raw happiness units.
    Y = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(Y), np.exp(yhat))
bst_cv1 = xgb.cv(xgb_params, dtrain, num_boost_round=50, nfold=3, seed=0,
                 feval=xg_eval_mae, maximize=False, early_stopping_rounds=10)
print('CV score:', bst_cv1.iloc[-1,:]['test-mae-mean'])
Fit X and y with XGBoost's built-in cross-validation; the CV score comes out as follows:
CV score: 27.177993999999998
ax = plt.figure().gca()
bst_cv1[['train-mae-mean', 'test-mae-mean']].plot(ax=ax)
Plot the training and test curves to check for overfitting.
You can see that once the number of trees passes 20 or so, a small amount of overfitting appears.
%%time
bst_cv2 = xgb.cv(xgb_params, dtrain, num_boost_round=100,
nfold=3, seed=0, feval=xg_eval_mae, maximize=False,
early_stopping_rounds=10)
print ('CV score:', bst_cv2.iloc[-1,:]['test-mae-mean'])
Run the cross-validation again with the number of boosting rounds raised to 100.
CV score: 27.40789266666667
fig, (ax1, ax2) = plt.subplots(1,2)
fig.set_size_inches(16,4)
ax1.set_title('100 rounds of training')
ax1.set_xlabel('Rounds')
ax1.set_ylabel('Loss')
ax1.grid(True)
ax1.plot(bst_cv2[['train-mae-mean', 'test-mae-mean']])
ax1.legend(['Training Loss', 'Test Loss'])
ax2.set_title('60 last rounds of training')
ax2.set_xlabel('Rounds')
ax2.set_ylabel('Loss')
ax2.grid(True)
ax2.plot(bst_cv2.iloc[40:][['train-mae-mean', 'test-mae-mean']])
ax2.legend(['Training Loss', 'Test Loss'])
Plot the overfitting behaviour over the full run and over the last 60 rounds.
class XGBoostRegressor(object):
    """A thin sklearn-style wrapper around xgb.train, so it can be used with GridSearchCV."""
    def __init__(self, **kwargs):
        self.params = kwargs
        # Default to 50 boosting rounds if none is supplied.
        self.num_boost_round = self.params.get('num_boost_round', 50)
        self.params.update({'silent': 1, 'objective': 'reg:linear', 'seed': 0})

    def fit(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, y_train)
        self.bst = xgb.train(params=self.params, dtrain=dtrain,
                             num_boost_round=self.num_boost_round,
                             feval=xg_eval_mae, maximize=False)
        return self

    def predict(self, x_pred):
        dpred = xgb.DMatrix(x_pred)
        return self.bst.predict(dpred)

    def kfold(self, x_train, y_train, nfold=5):
        dtrain = xgb.DMatrix(x_train, y_train)
        cv_rounds = xgb.cv(params=self.params, dtrain=dtrain,
                           num_boost_round=self.num_boost_round, nfold=nfold,
                           feval=xg_eval_mae, maximize=False, early_stopping_rounds=10)
        # Return the scores of the final round.
        return cv_rounds.iloc[-1,:]

    def plot_feature_importances(self):
        feat_imp = pd.Series(self.bst.get_fscore()).sort_values(ascending=False)
        feat_imp.plot(title='Feature Importances')
        plt.ylabel('Feature Importance Score')

    # get_params/set_params make the wrapper compatible with sklearn's GridSearchCV.
    def get_params(self, deep=True):
        return self.params

    def set_params(self, **params):
        self.params.update(params)
        return self
Declare the XGBoostRegressor wrapper class: an sklearn-compatible shell around xgb.train, so that it can be plugged into GridSearchCV.
def mae_score(y_true, y_pred):
    return mean_absolute_error(np.exp(y_true), np.exp(y_pred))
mae_scorer = make_scorer(mae_score, greater_is_better=False)
Define a scoring function. Since greater_is_better=False, make_scorer negates the MAE, which is why the grid-search scores below come out negative.
bst = XGBoostRegressor(eta=0.1, colsample_bytree=0.5, subsample=0.5,
max_depth=5, min_child_weight=3, num_boost_round=50)
bst.kfold(X, y, nfold=5)
Run 5-fold cross-validation with the parameters above and look at the scores:
train-mae-mean 24.681992
train-mae-std 0.214132
train-rmse-mean 0.661918
train-rmse-std 0.010127
test-mae-mean 27.068168
test-mae-std 0.746192
test-rmse-mean 0.807352
test-rmse-std 0.058963
Name: 49, dtype: float64
xgb_param_grid = {'max_depth': list(range(4,9)), 'min_child_weight': list((1,3,6))}
grid = GridSearchCV(XGBoostRegressor(eta=0.1, num_boost_round=50, colsample_bytree=0.5, subsample=0.5),
param_grid=xgb_param_grid, cv=5, scoring=mae_scorer)
grid.fit(X, y)
Grid-search over max_depth and min_child_weight and fit the data.
grid.best_params_, grid.best_score_
best_params_ returns the best parameter combination, and best_score_ its score (negative because of the scorer's sign flip):
({'max_depth': 4, 'min_child_weight': 6}, -27.189371616740114)
def convert_grid_scores(cv_results):
    # Unpack GridSearchCV results into one column per parameter plus the mean CV score.
    _params = [list(p.values()) for p in cv_results['params']]
    _params_mae = cv_results['mean_test_score']
    grid_res = np.column_stack((_params, _params_mae))
    return [grid_res[:, i] for i in range(grid_res.shape[1])]
scores = grid.cv_results_['mean_test_score']
A helper that unpacks the grid-search results into parameter columns plus the mean cross-validated score for each setting.
xgb_param_grid = {'gamma':[ 0.1 * i for i in range(0,5)]}
grid = GridSearchCV(XGBoostRegressor(eta=0.1, num_boost_round=50, max_depth=4, min_child_weight=6,
colsample_bytree=0.5, subsample=0.5),
param_grid=xgb_param_grid, cv=5, scoring=mae_scorer)
grid.fit(X, y)
grid.best_params_, grid.best_score_
Tune gamma; the best value and its score:
({'gamma': 0.2}, -27.17756873995668)
xgb_param_grid = {'subsample':[ 0.1 * i for i in range(6,9)],
'colsample_bytree':[ 0.1 * i for i in range(6,9)]}
grid = GridSearchCV(XGBoostRegressor(eta=0.1, gamma=0.2, num_boost_round=50, max_depth=4, min_child_weight=6),
param_grid=xgb_param_grid, cv=5, scoring=mae_scorer)
grid.fit(X, y)
grid.best_params_, grid.best_score_
Tune subsample and colsample_bytree.
({'colsample_bytree': 0.7000000000000001, 'subsample': 0.6000000000000001},
-26.995863945755023)
xgb_param_grid = {'eta':[0.5,0.4,0.3,0.2,0.1,0.075,0.05,0.04,0.03]}
grid = GridSearchCV(XGBoostRegressor(num_boost_round=50, gamma=0.2, max_depth=4, min_child_weight=6,
colsample_bytree=0.7, subsample=0.6),
param_grid=xgb_param_grid, cv=5, scoring=mae_scorer)
grid.fit(X,y)
grid.best_params_, grid.best_score_
Tune the learning rate.
Result: ({'eta': 0.1}, -26.995863945755023)
xgb_param_grid = {'eta':[x for x in np.linspace(0,0.2,20)]}
grid = GridSearchCV(XGBoostRegressor(num_boost_round=100, gamma=0.2, max_depth=4, min_child_weight=6,
colsample_bytree=0.7, subsample=0.6),
param_grid=xgb_param_grid, cv=5, scoring=mae_scorer)
grid.fit(X, y)
grid.best_params_, grid.best_score_
Tune the learning rate again, this time raising the number of boosting rounds to 100.
Result: ({'eta': 0.05263157894736842}, -26.892865169616442)
xgb_param_grid = {'eta':[x for x in np.linspace(0,0.2,20)]}
grid = GridSearchCV(XGBoostRegressor(num_boost_round=200, gamma=0.2, max_depth=4, min_child_weight=6,
colsample_bytree=0.7, subsample=0.6),
param_grid=xgb_param_grid, cv=5, scoring=mae_scorer)
grid.fit(X, y)
grid.best_params_, grid.best_score_
Raising it further to 200 rounds, overfitting sets in:
({'eta': 0.031578947368421054}, -26.912105302160462)
The score gets worse instead of better.
bst = XGBoostRegressor(num_boost_round=100, eta=0.1, gamma=0.2, max_depth=4, min_child_weight=1,
colsample_bytree=0.7, subsample=0.6)
cv = bst.kfold(X,y, nfold=5)
Cross-validate the 100-round configuration.
cv
train-mae-mean 24.903263
train-mae-std 0.220076
train-rmse-mean 0.653812
train-rmse-std 0.009592
test-mae-mean 26.819689
test-mae-std 0.666679
test-rmse-mean 0.799807
test-rmse-std 0.060250
Name: 199, dtype: float64
That wraps up the model-training part.
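Incidentally, pickle is imported at the top but never used. If you want to keep the tuned model around for later reuse, a minimal sketch (the filename is arbitrary):
# Fit on the full training data, then persist the wrapper to disk and reload it.
bst.fit(X, y)
with open('xgb_happiness.pkl', 'wb') as f:
    pickle.dump(bst, f)
with open('xgb_happiness.pkl', 'rb') as f:
    bst = pickle.load(f)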
data=pd.read_csv('happiness_test_complete.csv',encoding='gbk')
Read in the test-set data and apply the same preprocessing to it (with one caveat about the encodings, noted below).
class_mapping = {label:idx for idx,label in enumerate(set(data['edu_other']))}
data['edu_other'] = data['edu_other'].map(class_mapping)
class_mapping = {label:idx for idx,label in enumerate(set(data['property_other']))}
data['property_other'] = data['property_other'].map(class_mapping)
class_mapping = {label:idx for idx,label in enumerate(set(data['invest_other']))}
data['invest_other'] = data['invest_other'].map(class_mapping)
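Note that these mappings are rebuilt from the test file's own category sets, so a given string may be assigned a different integer than it was in the training data. A hypothetical sketch of the more consistent approach, assuming the training mappings had been kept around in a dict (train_mappings is not defined anywhere in this walkthrough):
# Hypothetical: reuse the encodings fitted on the training set so codes line up.
# train_mappings = {'edu_other': {...}, 'property_other': {...}, 'invest_other': {...}}
# for col, mapping in train_mappings.items():
#     data[col] = data[col].map(mapping)   # unseen categories become NaN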
data['survey_time']=pd.to_datetime(data['survey_time'])
data['survey_time']= data['survey_time'].dt.year
data.head()
data['age']=data['survey_time']-data['birth']
data['edu_age']=data['survey_time']-data['edu_yr']
data.loc[data['edu_age']>100, 'edu_age'] = np.nan
data['marital_1st_age']=data['survey_time']-data['marital_1st']
data.loc[data['marital_1st_age']>100, 'marital_1st_age'] = np.nan
data['marital_now_age']=data['survey_time']-data['marital_now']
data.loc[data['marital_now_age']>100, 'marital_now_age'] = np.nan
data['f_age']=data['survey_time']-data['f_birth']
data.loc[data['f_age']>100, 'f_age'] = np.nan
data['m_age']=data['survey_time']-data['m_birth']
data.loc[data['m_age']>100, 'm_age'] = np.nan
data['m_age'].head()
data['edu_age'].head()
data['marital_1st_age'].head()
data['marital_now_age'].head()
data['f_age'].head()
data.head()
data=data.drop('edu_yr',axis=1)
data=data.drop('marital_1st',axis=1)
data=data.drop('marital_now',axis=1)
data=data.drop('f_birth',axis=1)
data=data.drop('m_birth',axis=1)
data=data.drop('survey_time',axis=1)
data=data.drop('birth',axis=1)
Same as for the training set.
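One step the training side had that is missing here: the training features were standardized, while data would go into the model unscaled. A minimal sketch mirroring the earlier loop (strictly, the scalers fitted on the training columns should be reused here; refitting on the test set is an approximation):
# Standardize the test features the same way the training features were.
# (Ideally reuse the scalers fitted on X; refitting on the test set is an approximation.)
for column in data.columns:
    data[column] = StandardScaler().fit_transform(data[column].values.reshape(-1, 1))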
best_model=grid.best_estimator_
pred_y=best_model.predict(data)
Take the best estimator from the grid search and predict on the test set. (Note that grid at this point is the last, 200-round search, even though the 100-round setting scored slightly better.)
data1=pd.read_csv('happiness_test_complete.csv',encoding='gbk')
df=pd.DataFrame({'id': data1['id'].values, 'happiness': pred_y})
df.to_csv('happiness_submit10.csv', index=False)
Pair each id with its predicted happiness value and save them as a CSV submission file.
That's the whole pipeline for predicting happiness scores with XGBoost. As you can see, this is the tenth submission file I've saved, which gives some idea of how many rounds of tuning went into it. All in all the result is decent; combining XGBoost with a neural network would probably do even better.