2021 National College Student Data Statistics and Analysis Competition - Problem B Analysis

A condensed version of the code for Problem B of the 2021 National College Student Data Statistics and Analysis Competition.

Module Imports

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import warnings 
import xgboost as xgb
from xgboost import XGBClassifier
from xgboost import plot_importance
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,roc_curve,roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score

warnings.filterwarnings('ignore')
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False

Data Loading

# Read the preprocessed data
df = pd.read_csv('修正数据1.csv')

Feature Importance

# Keep the 10 most important features, plus the target column 'result'
new_data_0 = df[['coupon',
 'distance_day',
 'coupon_visit',
 'study_num',
 'course_order_num',
 'login_diff_time',
 'chinese_subscribe_num',
 'learn_num',
 'platform_num',
 'first_order_price','result']]
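
The article does not show how this top-10 list was produced. One common approach, sketched below (not necessarily the authors' exact procedure), is to fit a baseline XGBClassifier on all features and rank them with plot_importance, assuming the preprocessed columns are all numeric:

# Sketch: fit a baseline model on every feature and plot the ranking
baseline = XGBClassifier(seed=7)
baseline.fit(df.drop(columns=['result']), df['result'])

plot_importance(baseline, max_num_features=10)  # top 10 features by importance
plt.show()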

Data Splitting


# Split the data into features and target
dataset = new_data_0
X = dataset.iloc[:,:10]
y = dataset.iloc[:,10]

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.33,random_state=7,stratify=y)

Model Setup

# Initial model (the comments record the starting values before tuning)
model = XGBClassifier(max_depth=6,            # initial value 4
                      learning_rate=0.1,
                      n_estimators=151,       # initial value 200
                      objective='binary:logistic',
                      booster='gbtree',
                      reg_lambda=0.3,         # initial value 1
                      reg_alpha=0.01,         # initial value 0
                      gamma=0.7,              # initial value 0
                      eval_metric='error',
                      min_child_weight=1,
                      subsample=0.8,          # initial value 0.5
                      colsample_bytree=0.5,   # initial value 1
                      seed=7)

Parameter Tuning


dtrain = xgb.DMatrix(X_train, label=y_train)
# Use cross-validation (xgb.cv) to find the best number of boosting
# rounds, i.e. the number of trees; the best round found was 352
cv_result = xgb.cv(model.get_xgb_params(),
                   dtrain,
                   num_boost_round=353,
                   nfold=6,
                   metrics='auc',
                   early_stopping_rounds=20,
                   verbose_eval=1,
                   show_stdv=True)

# Plot number of boosting rounds vs. cross-validated AUC
plt.figure(dpi=100, figsize=(5, 3))
plt.plot(cv_result['test-auc-mean'][:151])
plt.xlabel('n_estimators')
plt.ylabel('AUC')
plt.show()

[Figure: cross-validated AUC versus number of boosting rounds]
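
Rather than eyeballing the curve, the optimal round count can be read straight off the cv_result table returned by xgb.cv:

# idxmax() gives the 0-based index of the best mean test AUC,
# so the optimal number of trees is that index plus one
best_rounds = int(cv_result['test-auc-mean'].idxmax()) + 1
print('best n_estimators:', best_rounds)
print('best cv AUC: %.4f' % cv_result['test-auc-mean'].max())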

## max_depth, min_child_weight
param_grid = {'max_depth': [3, 4, 5, 6, 7, 8, 9],
              'min_child_weight': [1, 2, 3, 4]}

grid_search = GridSearchCV(model, param_grid, scoring='roc_auc', cv=5)

grid_search.fit(X_train, y_train)

print('best_params:', grid_search.best_params_)
print('best_score:', grid_search.best_score_)
## gamma
param_grid = {'gamma': [i/10.0 for i in range(0, 11)]}

grid_search = GridSearchCV(model, param_grid, scoring='roc_auc', cv=5)

grid_search.fit(X_train, y_train)

print('best_params:', grid_search.best_params_)
print('best_score:', grid_search.best_score_)
## subsample, colsample_bytree
param_grid = {'subsample': [i/10.0 for i in range(5, 10)],
              'colsample_bytree': [i/10.0 for i in range(5, 10)]
              }

grid_search = GridSearchCV(model, param_grid, scoring='roc_auc', cv=5)

grid_search.fit(X_train, y_train)

print('best_params:', grid_search.best_params_)
print('best_score:', grid_search.best_score_)
## reg_alpha, reg_lambda
param_grid = {'reg_alpha': [0, 0.005, 0.01, 0.05],
              'reg_lambda': [0, 0.3, 0.5, 1]
              }

grid_search = GridSearchCV(model, param_grid, scoring='roc_auc', cv=5)

grid_search.fit(X_train, y_train)

print('best_params:', grid_search.best_params_)
print('best_score:', grid_search.best_score_)
# Finally, tune the learning rate
param_grid = {'learning_rate': [0.01, 0.05, 0.07, 0.1, 0.12, 0.15]
              }

grid_search = GridSearchCV(model, param_grid, scoring='roc_auc', cv=5)

grid_search.fit(X_train, y_train)

print('best_params:', grid_search.best_params_)
print('best_score:', grid_search.best_score_)
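
One step the listing leaves implicit: for the stages above to build on each other, each search's winning values have to be written back into model before the next GridSearchCV runs. A one-line sketch of that step:

# Fold the best values from this stage back into the model
# so the next grid search starts from them
model.set_params(**grid_search.best_params_)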

Cross-Validation

# Train with the tuned (optimal) parameters
from sklearn.model_selection import KFold
splits = KFold(n_splits=5, shuffle=True, random_state=20211)

parameters = {'max_depth': 6,
              'learning_rate': 0.01,
              'objective': 'binary:logistic',
              'booster': 'gbtree',
              'n_jobs': 4,
              'reg_alpha': 0.01,
              'reg_lambda': 0.3,
              'gamma': 0.7,
              'eval_metric': 'error',
              'min_child_weight': 1,
              'subsample': 0.8,
              'colsample_bytree': 0.5,
              'seed': 7}

# Out-of-fold predictions for the training set, and test predictions
# averaged over the folds
predicted_train_xgb = np.zeros(len(X_train))
predicted_test_xgb = np.zeros(len(X_test))

for fold_, (trn_idx, val_idx) in enumerate(splits.split(X_train, y_train)):
    print("fold {}".format(fold_+1))
    trn_data = xgb.DMatrix(X_train.iloc[trn_idx], y_train.iloc[trn_idx])
    val_data = xgb.DMatrix(X_train.iloc[val_idx], y_train.iloc[val_idx])

    watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]

    clf = xgb.train(dtrain=trn_data,
                    num_boost_round=5000,
                    evals=watchlist,
                    early_stopping_rounds=200,
                    verbose_eval=100,
                    params=parameters,
                    )

    # Predict with the best iteration found by early stopping
    predicted_train_xgb[val_idx] = clf.predict(xgb.DMatrix(X_train.iloc[val_idx]), ntree_limit=clf.best_ntree_limit)
    predicted_test_xgb += clf.predict(xgb.DMatrix(X_test), ntree_limit=clf.best_ntree_limit) / splits.n_splits
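
The out-of-fold predictions collected above give an estimate of generalization that never touches the test set; a quick check with the already-imported roc_auc_score:

# AUC of the out-of-fold predictions on the training set
print('out-of-fold AUC: %.4f' % roc_auc_score(y_train, predicted_train_xgb))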
    

Model Evaluation

# Threshold the averaged test probabilities at 0.40,
# converting bool -> 0/1
pre_data = (predicted_test_xgb >= 0.40) + 0
# Print the evaluation report (y_true first, y_pred second)
print(classification_report(y_test, pre_data))
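
The imports also bring in confusion_matrix and roc_auc_score, which round out the evaluation; for example:

# Confusion matrix at the 0.40 threshold
print(confusion_matrix(y_test, pre_data))
# Threshold-free AUC computed on the raw averaged probabilities
print('test AUC: %.4f' % roc_auc_score(y_test, predicted_test_xgb))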

[Figure: classification report output]
Note: the actual competition entry also used feature engineering, model ensembling, and other techniques.

Summary

This competition is a fairly rare opportunity in the field of data analysis for college students. Since it was the first edition, there were not many participants and it had little name recognition, but working through it end to end still felt very rewarding.

First, data preprocessing, which is not shown here, is arguably the most important part. The values in each field have to be judged against the field's actual meaning, and the relationships between fields have to be considered as well. For example, if a survey records a person's years of driving experience, the survey date, and their date of birth, then age can obviously be computed from the survey date and date of birth, and age must be at least years of driving experience plus 18; any record violating that is anomalous. Such anomalies may not actually occur in the data, but they are still worth checking for.

Next is parameter tuning. The improvement from this part is not dramatic, but it makes the modeling more complete.

As for feature engineering, it is, in short, mining the existing fields (features): the mean or variance of some field can be added as a new feature, several features can be combined, and so on. The model itself is secondary, because model structures are what they are and everyone's models are much alike, but everyone's feature engineering can be different.
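
As a concrete illustration of the feature-engineering idea above, here is a small hypothetical sketch in pandas. It reuses real column names from the dataset, but the derived features themselves are invented for the example and were not part of the competition code:

# Hypothetical derived features: per-group mean/std of an existing column
df['price_mean_by_platform'] = df.groupby('platform_num')['first_order_price'].transform('mean')
df['price_std_by_platform'] = df.groupby('platform_num')['first_order_price'].transform('std')

# Hypothetical interaction feature combining two existing columns
df['study_per_login'] = df['study_num'] / (df['login_diff_time'] + 1)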
