训练集
经过上一个步骤之后,我们可以得到训练所需的特征数据集,这里一共有两个数据集,是按日期切分的,接下来利用训练集来训练模型。
XGBoost模型
这里讲解利用XGBoost模型来训练模型,首先需要在python中安装XGBoost,安装步骤如下:
- 1、anaconda search -t conda xgboost
- 2、conda install -c anaconda py-xgboost
第一个命令是搜索xgboost库,第二个命令是安装搜索到的py-xgboost库。安装好之后,开始训练模型。 - 1、引包
-
from sklearn.metrics import confusion_matrix, roc_auc_score import numpy as np import xgboost import pandas as pd import matplotlib.pyplot as plt import time import os import sys import json import operator
- 2、文件路径
-
# Keep a reference to the original stdout so it can be restored after
# training (train() reassigns sys.stdout back to this).
save_stdout = sys.stdout

# Root directory of the local workspace holding feature data and outputs.
fileDir = 'D:/workspace/gitWorkSpace/O2O-Coupon-Usage-Forecast-master/O2O-Coupon-Usage-Forecast-master/code/wepon/data1/'

# Feature CSVs (one per date-split dataset): train / validate / predict.
train_path = fileDir + 'data/dataset1.csv'
validate_path = fileDir + 'data/dataset2.csv'
predict_path = fileDir + 'data/dataset3.csv'

# Model output directory prefix and per-run artifact file-name suffixes
# (each run appends a timestamp: '<model_path>_<exec_time><suffix>').
model_path = fileDir + 'model/trainModel'
model_fmap_file = 'model.fmap'  # XGBoost feature-map file
model_file = '.model'
model_dump_file = 'model_dump.txt'
model_feature_importance_file = 'feature_importance.png'
model_feature_importance_csv = 'feature_importance.csv'
model_train_log = 'train.log'
model_params = '/param.json'
val_diff_file = 'val_diff.csv'

# Submission output directory prefix and file names.
submission_path = fileDir + 'submission'
submission_hist_file = 'hist.png'
submission_file = 'submission.csv'

# Raw field (column) names used by the competition data files.
user_label = 'User_id'
merchant_label = 'Merchant_id'
coupon_label = 'Coupon_id'
action_label = 'Action'
discount_label = 'Discount_rate'
distance_label = 'Distance'
date_received_label = 'Date_received'
date_consumed_label = 'Date'
probability_consumed_label = 'Probability'
# Column names for the date-split feature CSVs produced by the previous
# feature-engineering step; order must match the CSV column order exactly.
# The final column, "label", is the binary training target.
column = ["user_id", "discount_rate", "distance", "day_of_month", "days_distance",
          "discount_man", "discount_jian", "is_man_jian", "total_sales",
          "sales_use_coupon", "total_coupon", "merchant_min_distance",
          "merchant_max_distance", "merchant_mean_distance",
          "merchant_median_distance", "merchant_coupon_transfer_rate",
          "coupon_rate", "count_merchant", "user_min_distance",
          "user_max_distance", "user_mean_distance", "user_median_distance",
          "buy_use_coupon", "buy_total", "coupon_received",
          "avg_user_date_datereceived_gap", "min_user_date_datereceived_gap",
          "max_user_date_datereceived_gap", "buy_use_coupon_rate",
          "user_coupon_transfer_rate", "user_merchant_buy_total",
          "user_merchant_received", "user_merchant_buy_use_coupon",
          "user_merchant_any", "user_merchant_buy_common",
          "user_merchant_coupon_transfer_rate", "user_merchant_coupon_buy_rate",
          "user_merchant_rate", "user_merchant_common_buy_rate",
          "this_month_user_receive_same_coupon_count",
          "this_month_user_receive_all_coupon_count",
          "this_month_user_receive_same_coupon_lastone",
          "this_month_user_receive_same_coupon_firstone",
          "this_day_user_receive_all_coupon_count",
          "this_day_user_receive_same_coupon_count",
          "day_gap_before", "day_gap_after", "is_weekend",
          "weekday1", "weekday2", "weekday3", "weekday4", "weekday5",
          "weekday6", "weekday7", "label"]
- 3、函数方法
def calc_auc(df):
    """Compute the AUC for one coupon's group of predictions.

    Returns a one-row DataFrame [Coupon_id, auc]. AUC is undefined when
    the group contains a single class, so NaN is returned in that case
    (it is skipped later by the skipna mean).
    """
    coupon = df[coupon_label].iloc[0]
    y_true = df['label'].values
    if len(np.unique(y_true)) != 2:
        # Only one class present in this coupon's rows — AUC undefined.
        auc = np.nan
    else:
        y_pred = df[probability_consumed_label].values
        auc = roc_auc_score(np.array(y_true), np.array(y_pred))
    return pd.DataFrame({coupon_label: [coupon], 'auc': [auc]})


def check_average_auc(df):
    """Return the mean per-coupon AUC (the competition's evaluation metric)."""
    grouped = df.groupby(coupon_label, as_index=False).apply(lambda x: calc_auc(x))
    return grouped['auc'].mean(skipna=True)


def create_feature_map(features, fmap):
    """Write an XGBoost feature-map file: one '<index>\\t<name>\\tq' line per feature."""
    # 'with' guarantees the file is closed even if a write fails
    # (the original open()/close() pair leaked the handle on error).
    with open(fmap, 'w') as outfile:
        for i, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(i, feat))


def train(param, num_round=1000, early_stopping_rounds=20):
    """Train an XGBoost model on the date-split feature sets and dump artifacts.

    Reads the train/validate/predict feature CSVs, trains with early
    stopping, saves the model, its dump, parameters and feature
    importances under a timestamped directory, reports per-coupon average
    AUC on train/validate, and writes the submission CSV for the
    prediction set.

    :param param: dict of XGBoost booster parameters.
    :param num_round: maximum number of boosting rounds.
    :param early_stopping_rounds: stop if eval AUC does not improve for this many rounds.
    """
    # One timestamped output directory per run for model and submission files.
    exec_time = time.strftime("%Y%m%d%I%p%M", time.localtime())
    os.mkdir('{0}_{1}'.format(model_path, exec_time))
    os.mkdir('{0}_{1}'.format(submission_path, exec_time))

    train_params = param.copy()
    train_params['num_boost_round'] = num_round
    train_params['early_stopping_rounds'] = early_stopping_rounds
    # Persist the run's parameters as JSON. json.dump writes str, so the
    # file must be opened in text mode 'w' — the original used 'wb+',
    # which raises TypeError in Python 3 and had been commented out.
    with open('{0}_{1}{2}'.format(model_path, exec_time, model_params), 'w') as param_file:
        json.dump(train_params, param_file)

    print('get training data')
    # Load features, name the columns, then split off the 'label' target.
    train_features = pd.read_csv(train_path).astype(float)
    train_features.columns = column
    train_labels = train_features.label
    train_features = train_features.drop(['label'], axis=1)

    # Validation (cross-check) set, same layout as the training set.
    validate_features = pd.read_csv(validate_path).astype(float)
    validate_features.columns = column
    validate_labels = validate_features.label
    validate_features = validate_features.drop(['label'], axis=1)

    # Prediction (test) set has no label column.
    predict_features = pd.read_csv(predict_path).astype(float)

    # Feature map lets get_fscore() report importances by feature name.
    create_feature_map(train_features.columns.tolist(),
                       '{0}_{1}{2}'.format(model_path, exec_time, model_fmap_file))

    # DMatrix is XGBoost's internal data container (accepts numpy arrays).
    train_matrix = xgboost.DMatrix(train_features.values, label=train_labels.values,
                                   feature_names=train_features.columns)
    val_matrix = xgboost.DMatrix(validate_features.values, label=validate_labels.values,
                                 feature_names=validate_features.columns)
    predict_matrix = xgboost.DMatrix(predict_features.values,
                                     feature_names=predict_features.columns)
    watchlist = [(train_matrix, 'train'), (val_matrix, 'eval')]

    print('model training')
    model = xgboost.train(param, train_matrix, num_boost_round=num_round,
                          evals=watchlist, early_stopping_rounds=early_stopping_rounds)
    # Restore stdout in case training output was redirected to a log file.
    sys.stdout = save_stdout
    print('model.best_score: {0}, model.best_iteration: {1}, model.best_ntree_limit: {2}'.format(
        model.best_score, model.best_iteration, model.best_ntree_limit))

    print('output offline model data')
    model.save_model('{0}_{1}{2}'.format(model_path, exec_time, model_file))
    model.dump_model('{0}_{1}{2}'.format(model_path, exec_time, model_dump_file))

    # Feature importances: normalized fscore, saved as CSV and as a plot.
    importance = model.get_fscore(fmap='{0}_{1}{2}'.format(model_path, exec_time, model_fmap_file))
    importance = sorted(importance.items(), key=operator.itemgetter(1))
    df = pd.DataFrame(importance, columns=['feature', 'fscore'])
    df['fscore'] = df['fscore'] / df['fscore'].sum()
    df.to_csv('{0}_{1}{2}'.format(model_path, exec_time, model_feature_importance_csv), index=False)
    xgboost.plot_importance(model)
    plt.gcf().set_size_inches(20, 16)
    plt.gcf().set_tight_layout(True)
    plt.gcf().savefig('{0}_{1}{2}'.format(model_path, exec_time, model_feature_importance_file))
    plt.close()

    # Predict on train/validate with the best iteration's trees only.
    train_pred_labels = model.predict(train_matrix, ntree_limit=model.best_ntree_limit)
    val_pred_labels = model.predict(val_matrix, ntree_limit=model.best_ntree_limit)
    train_pred_frame = pd.Series(train_pred_labels, index=train_features.index)
    train_pred_frame.name = probability_consumed_label
    val_pred_frame = pd.Series(val_pred_labels, index=validate_features.index)
    val_pred_frame.name = probability_consumed_label

    # BUG FIX: this path was commented out in the original but is used
    # below (raw date-split data for per-coupon AUC and the submission),
    # which raised NameError at runtime. Define it explicitly.
    data_split_path = 'D:/workspace/gitWorkSpace/O2O-Coupon-Usage-Forecast-master/O2O-Coupon-Usage-Forecast-master/code/charles/data_split'

    train_true_frame = pd.read_csv(train_path)['label']
    val_true_frame = pd.read_csv(validate_path)['label']
    # Raw date-split data carrying the Coupon_id needed to group AUC by coupon.
    train_coupons = pd.read_csv(data_split_path + "/train_data/dataset.csv")
    val_coupons = pd.read_csv(data_split_path + "/validate_data/dataset.csv")
    train_check_matrix = train_coupons[[coupon_label]].join(train_true_frame).join(train_pred_frame)
    val_check_matrix = val_coupons[[coupon_label]].join(val_true_frame).join(val_pred_frame)
    print('Average auc of train matrix: ', check_average_auc(train_check_matrix))
    print('Average auc of validate matrix', check_average_auc(val_check_matrix))

    # Save validation predictions plus a 0.5-thresholded class column ('map'),
    # then report the confusion matrix on the validation set.
    val_coupons = val_coupons.join(val_pred_frame).join(
        val_pred_frame.map(lambda x: 0. if x < 0.5 else 1.).rename('map')).join(val_true_frame)
    val_coupons.to_csv('{0}_{1}{2}'.format(model_path, exec_time, val_diff_file), index=False)
    print(confusion_matrix(val_coupons['label'], val_coupons['map']))

    # Predict the test set and plot the probability histogram.
    labels = model.predict(predict_matrix, ntree_limit=model.best_ntree_limit)
    frame = pd.Series(labels, index=predict_features.index)
    frame.name = probability_consumed_label
    plt.figure()
    frame.hist(figsize=(10, 8))
    plt.title('results histogram')
    plt.xlabel('predict probability')
    plt.gcf().savefig('{0}_{1}{2}'.format(submission_path, exec_time, submission_hist_file))
    plt.close()

    # Submission: keys from the raw predict split joined with the
    # predicted consumption probability.
    submission = pd.read_csv(data_split_path + "/predict_data/dataset.csv")
    submission = submission[[user_label, coupon_label, date_received_label]].join(frame)
    submission.to_csv('{0}_{1}{2}'.format(submission_path, exec_time, submission_file), index=False)
- 4、主函数
-
if __name__ == '__main__':
    # Hand-tuned hyper-parameters. They were chosen after a grid search over
    # eta, subsample, colsample_bytree, max_depth, min_child_weight and gamma
    # (itertools.product over candidate values, one train() run per combo).
    init_param = {
        'max_depth': 8,
        'eta': 0.1,
        'silent': 1,
        'seed': 13,
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'scale_pos_weight': 2,
        'subsample': 0.8,
        'colsample_bytree': 0.7,
        'min_child_weight': 100,
        'max_delta_step': 20,
    }
    train(init_param, num_round=1000, early_stopping_rounds=50)
    print("结束")
用两份数据集训练模型,并分别用来预测另一个数据集,计算其准确率;训练完的模型会保存起来,最后我们用训练得到的模型来预测测试集数据。