Alibaba Cloud Tianchi: Beginner Competition - O2O Coupon Usage Prediction
Dataset download link: https://pan.baidu.com/s/13OtaUv6j4x8dD7cgD4sL5g
Extraction code: 7tze
Sklearn: Tianchi Beginner Competition - O2O Coupon Usage Prediction, part 1
Sklearn: Tianchi Beginner Competition - O2O Coupon Usage Prediction, part 2
Sklearn: Tianchi Beginner Competition - O2O Coupon Usage Prediction, part 3
New feature engineering: XGBoost tuning with xgb.cv
In [1]:
import pandas as pd
import numpy as np
from xgboost.sklearn import XGBClassifier
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
import datetime
import os
Load the data
Handle missing values
In [2]:
def get_processed_data():
    dataset1 = pd.read_csv('data_preprocessed_2/ProcessDataSet1.csv')
    dataset2 = pd.read_csv('data_preprocessed_2/ProcessDataSet2.csv')
    dataset3 = pd.read_csv('data_preprocessed_2/ProcessDataSet3.csv')

    dataset1.drop_duplicates(inplace=True)
    dataset2.drop_duplicates(inplace=True)
    dataset3.drop_duplicates(inplace=True)

    # Stack the two labeled datasets vertically; they share the same feature columns
    dataset12 = pd.concat([dataset1, dataset2], axis=0)

    # Fill missing values with 0
    dataset12.fillna(0, inplace=True)
    dataset3.fillna(0, inplace=True)
    return dataset12, dataset3
Model training
Preparing the training set
In [3]:
def train_xgb(dataset12, dataset3):
    predict_dataset = dataset3[['User_id', 'Coupon_id', 'Date_received']].copy()
    predict_dataset.Date_received = pd.to_datetime(predict_dataset.Date_received, format='%Y-%m-%d')
    predict_dataset.Date_received = predict_dataset.Date_received.dt.strftime('%Y%m%d')

    # Convert the data to DMatrix format
    dataset12_x = dataset12.drop(
        columns=['User_id', 'Merchant_id', 'Discount_rate', 'Date_received', 'discount_rate_x', 'discount_rate_y',
                 'Date', 'Coupon_id', 'label'], axis=1)
    dataset3_x = dataset3.drop(
        columns=['User_id', 'Merchant_id', 'Discount_rate', 'Date_received', 'discount_rate_x', 'discount_rate_y',
                 'Coupon_id'], axis=1)
    train_dmatrix = xgb.DMatrix(dataset12_x, label=dataset12.label)
    predict_dmatrix = xgb.DMatrix(dataset3_x)

    params = {'booster': 'gbtree',
              'objective': 'binary:logistic',
              'eval_metric': 'auc',
              'gamma': 0.1,
              'min_child_weight': 1.1,
              'max_depth': 5,
              'lambda': 10,
              'subsample': 0.7,
              'colsample_bytree': 0.7,
              'colsample_bylevel': 0.7,
              'eta': 0.01,
              'tree_method': 'gpu_hist',
              'seed': 0,
              'nthread': cpu_jobs,  # global defined in the driver cell below
              'predictor': 'cpu_predictor'
              }

    # Use xgb.cv to tune the num_boost_round parameter
    cvresult = xgb.cv(params, train_dmatrix, num_boost_round=10000, nfold=2, metrics='auc', seed=0, callbacks=[
        xgb.callback.print_evaluation(show_stdv=False),
        xgb.callback.early_stop(50)
    ])
    num_round_best = cvresult.shape[0] - 1
    print('Best round num: ', num_round_best)

    # Retrain the model with the tuned num_boost_round
    watchlist = [(train_dmatrix, 'train')]
    model = xgb.train(params, train_dmatrix, num_boost_round=num_round_best, evals=watchlist)
    model.save_model('train_dir_2/xgbmodel_cv_new')

    params['predictor'] = 'cpu_predictor'
    model_cv = xgb.Booster(params)
    model_cv.load_model('train_dir_2/xgbmodel_cv_new')

    # Predict the test set
    dataset3_predict = predict_dataset.copy()
    dataset3_predict['label'] = model_cv.predict(predict_dmatrix)

    # Normalize the predicted scores to [0, 1]
    dataset3_predict.label = MinMaxScaler(copy=True, feature_range=(0, 1)).fit_transform(
        dataset3_predict.label.values.reshape(-1, 1))
    dataset3_predict.sort_values(by=['Coupon_id', 'label'], inplace=True)
    dataset3_predict.to_csv("train_dir_2/xgb_cv_preds.csv", index=None, header=None)
    print(dataset3_predict.describe())

    # Compute AUC on dataset12
    # model = xgb.Booster()
    # model.load_model('train_dir_2/xgbmodel')
    temp = dataset12[['Coupon_id', 'label']].copy()
    temp['pred'] = model.predict(xgb.DMatrix(dataset12_x))
    temp.pred = MinMaxScaler(copy=True, feature_range=(0, 1)).fit_transform(temp['pred'].values.reshape(-1, 1))
    print(myauc(temp))
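Note: the callback helpers used above (xgb.callback.print_evaluation, xgb.callback.early_stop) belong to older XGBoost releases and were later removed. On a recent XGBoost, a sketch of the same cross-validated round search would pass keyword arguments instead (same params and train_dmatrix as in the function above):
# Minimal sketch on a newer XGBoost: early stopping and logging via keyword arguments
cvresult = xgb.cv(params, train_dmatrix,
                  num_boost_round=10000,
                  nfold=2,
                  metrics='auc',
                  seed=0,
                  early_stopping_rounds=50,  # stop when the test AUC stops improving
                  verbose_eval=True)         # print one evaluation line per round
num_round_best = cvresult.shape[0]           # the returned history is truncated at the best iteration
print('Best round num:', num_round_best)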
Performance evaluation function
In [4]:
# Performance evaluation: average AUC computed per Coupon_id (the competition metric)
def myauc(test):
    testgroup = test.groupby(['Coupon_id'])
    aucs = []
    for i in testgroup:
        tmpdf = i[1]
        # A coupon whose labels are all one class has no defined AUC, so skip it
        if len(tmpdf['label'].unique()) != 2:
            continue
        fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['pred'], pos_label=1)
        aucs.append(auc(fpr, tpr))
    return np.average(aucs)
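A quick way to see what myauc expects is to call it on a tiny hand-made frame; the column names Coupon_id, label and pred are the ones the function reads, and the numbers below are made up purely for illustration:
# Hypothetical toy input: coupon 1 has both classes, coupon 2 has only negatives
toy = pd.DataFrame({
    'Coupon_id': [1, 1, 1, 1, 2, 2],
    'label':     [0, 1, 0, 1, 0, 0],
    'pred':      [0.2, 0.9, 0.4, 0.7, 0.1, 0.3],
})
# Coupon 2 is skipped (single class); coupon 1 is ranked perfectly, so this prints 1.0
print(myauc(toy))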
Model training and saving - 0.7983
In [ ]:
# Load the data
dataset12, dataset3 = get_processed_data()
In [6]:
dataset12.head()
Out[6]:
 | User_id | Merchant_id | Coupon_id | Discount_rate | Distance | Date_received | Date | discount_rate_x | discount_rate_y | discount_rate | ... | on_u4 | on_u5 | on_u6 | on_u7 | on_u8 | on_u9 | on_u10 | on_u11 | on_u12 | on_u13
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1832624 | 3381 | 7610 | 200:20 | 0 | 2016-04-29 | 1970-01-01 | 200.0 | 20.0 | 0.900000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1 | 163606 | 1569 | 5054 | 200:30 | 10 | 2016-04-21 | 1970-01-01 | 200.0 | 30.0 | 0.850000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
2 | 1113008 | 1361 | 11166 | 20:1 | 0 | 2016-05-15 | 2016-05-21 | 20.0 | 1.0 | 0.950000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
3 | 4061024 | 3381 | 7610 | 200:20 | 10 | 2016-04-26 | 1970-01-01 | 200.0 | 20.0 | 0.900000 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
4 | 106443 | 450 | 3732 | 30:5 | 11 | 2016-04-29 | 1970-01-01 | 30.0 | 5.0 | 0.833333 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 124 columns
In [8]:
dataset12.shape
Out[8]:
(383386, 124)
In [9]:
dataset12.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 383386 entries, 0 to 252585
Columns: 124 entries, User_id to on_u13
dtypes: float64(96), int64(25), object(3)
memory usage: 375.6+ MB
In [11]:
dataset12.describe()
Out[11]:
 | User_id | Merchant_id | Coupon_id | Distance | discount_rate_x | discount_rate_y | discount_rate | label | weekday | day | ... | on_u4 | on_u5 | on_u6 | on_u7 | on_u8 | on_u9 | on_u10 | on_u11 | on_u12 | on_u13
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 3.833860e+05 | 383386.000000 | 383386.000000 | 383386.000000 | 383386.000000 | 383386.000000 | 383386.000000 | 383386.000000 | 383386.000000 | 383386.000000 | ... | 383386.000000 | 383386.000000 | 383386.000000 | 383386.000000 | 383386.0 | 383386.000000 | 383386.000000 | 383386.0 | 383386.000000 | 383386.000000 |
mean | 3.683603e+06 | 3653.920730 | 6287.911630 | 3.216938 | 58.047151 | 8.606796 | 0.838597 | 0.082546 | 3.086717 | 17.428612 | ... | 0.538374 | 0.165142 | 0.078388 | 0.019834 | 0.0 | 0.078388 | 0.041178 | 0.0 | 0.001983 | 0.001983 |
std | 2.123219e+06 | 2577.836469 | 3938.971496 | 4.154925 | 59.771475 | 8.860719 | 0.092783 | 0.275195 | 1.984455 | 8.455349 | ... | 1.979329 | 0.355102 | 0.502537 | 0.119940 | 0.0 | 0.502537 | 0.198702 | 0.0 | 0.032386 | 0.032386 |
min | 4.000000e+00 | 2.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0.333333 | 0.000000 | 0.000000 | 1.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 |
25% | 1.843305e+06 | 1244.000000 | 2418.000000 | 0.000000 | 20.000000 | 5.000000 | 0.800000 | 0.000000 | 1.000000 | 11.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 |
50% | 3.685242e+06 | 3381.000000 | 5584.000000 | 1.000000 | 30.000000 | 5.000000 | 0.833333 | 0.000000 | 3.000000 | 19.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 |
75% | 5.522414e+06 | 5803.000000 | 9566.000000 | 6.000000 | 100.000000 | 10.000000 | 0.900000 | 0.000000 | 5.000000 | 24.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.000000 | 0.000000 |
max | 7.360961e+06 | 8856.000000 | 14045.000000 | 11.000000 | 300.000000 | 100.000000 | 0.990000 | 1.000000 | 6.000000 | 31.000000 | ... | 319.000000 | 1.000000 | 64.000000 | 1.000000 | 0.0 | 64.000000 | 1.000000 | 0.0 | 0.985915 | 0.985915 |
8 rows × 121 columns
In [12]:
print([column for column in dataset12])
['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance', 'Date_received', 'Date', 'discount_rate_x', 'discount_rate_y', 'discount_rate', 'label', 'weekday', 'day', 'u2', 'u3', 'u19', 'u1', 'u4', 'u5', 'u25', 'u20', 'u6', 'u7', 'u8', 'u9', 'u10', 'u11', 'u21', 'u22', 'u23', 'u24', 'u45', 'u27', 'u28', 'u32', 'u47', 'u33', 'u34', 'u35', 'u36', 'u37', 'discount_type', 'u41', 'u42', 'u43', 'u44', 'u48', 'u49', 'm0', 'm1', 'm2', 'm3', 'm4', 'm7', 'm5', 'm6', 'm8', 'm9', 'm10', 'm11', 'm12', 'm13', 'm14', 'm15', 'm18', 'm19', 'm20', 'm21', 'm22', 'm23', 'c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c8', 'c9', 'c10', 'c11', 'c12', 'um1', 'um2', 'um3', 'um4', 'um5', 'um6', 'um7', 'um8', 'um9', 'um10', 'um11', 'um12', 'o1', 'o2', 'o17', 'o18', 'o3', 'o4', 'o5', 'o6', 'o7', 'o8', 'o9', 'o10', 'o11', 'o12', 'o13', 'o14', 'o15', 'o16', 'on_u1', 'on_u2', 'on_u3', 'on_u4', 'on_u5', 'on_u6', 'on_u7', 'on_u8', 'on_u9', 'on_u10', 'on_u11', 'on_u12', 'on_u13']
In [10]:
dataset3.shape
Out[10]:
(112803, 122)
In [5]:
start = datetime.datetime.now()
print(start.strftime('%Y-%m-%d %H:%M:%S'))
cpu_jobs = os.cpu_count() - 1
date_null = pd.to_datetime('1970-01-01', format='%Y-%m-%d')
dataset12, dataset3 = get_processed_data()
train_xgb(dataset12, dataset3)
print(datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
print('time costed is: %s s' % (datetime.datetime.now() - start).seconds)
2020-03-06 13:07:29
[0] train-auc:0.82972 test-auc:0.82835
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.
Will train until test-auc hasn't improved in 50 rounds.
[1] train-auc:0.83615 test-auc:0.83513
[2] train-auc:0.84745 test-auc:0.84609
[3] train-auc:0.85015 test-auc:0.84850
[4] train-auc:0.85415 test-auc:0.85298
[5] train-auc:0.85444 test-auc:0.85324
[6] train-auc:0.85859 test-auc:0.85735
[7] train-auc:0.86065 test-auc:0.85951
[8] train-auc:0.86068 test-auc:0.85961
[9] train-auc:0.86198 test-auc:0.86090
[10] train-auc:0.86290 test-auc:0.86175
...... (output truncated)
[6190] train-auc:0.93079 test-auc:0.90374
[6191] train-auc:0.93079 test-auc:0.90374
[6192] train-auc:0.93079 test-auc:0.90374
[6193] train-auc:0.93080 test-auc:0.90374
[6194] train-auc:0.93080 test-auc:0.90374
[6195] train-auc:0.93081 test-auc:0.90374
[6196] train-auc:0.93081 test-auc:0.90374
[6197] train-auc:0.93081 test-auc:0.90374
[6198] train-auc:0.93082 test-auc:0.90374
[6199] train-auc:0.93082 test-auc:0.90374
[6200] train-auc:0.93083 test-auc:0.90375
[6201] train-auc:0.93083 test-auc:0.90375
[6202] train-auc:0.93083 test-auc:0.90375
[6203] train-auc:0.93084 test-auc:0.90375
[6204] train-auc:0.93084 test-auc:0.90375
[6205] train-auc:0.93084 test-auc:0.90375
[6206] train-auc:0.93085 test-auc:0.90375
[6207] train-auc:0.93085 test-auc:0.90375
Stopping. Best iteration:
[6157] train-auc:0.93068+0.00010 test-auc:0.90375+0.00058
Best round num: 6157
[0] train-auc:0.84011
[1] train-auc:0.84200
[2] train-auc:0.85052
[3] train-auc:0.85268
[4] train-auc:0.85981
[5] train-auc:0.85955
[6] train-auc:0.86228
[7] train-auc:0.86348
[8] train-auc:0.86390
[9] train-auc:0.86610
[10] train-auc:0.86655
...... (output truncated)
[6120] train-auc:0.92180
[6121] train-auc:0.92180
[6122] train-auc:0.92180
[6123] train-auc:0.92181
[6124] train-auc:0.92181
[6125] train-auc:0.92181
[6126] train-auc:0.92181
[6127] train-auc:0.92181
[6128] train-auc:0.92182
[6129] train-auc:0.92182
[6130] train-auc:0.92182
[6131] train-auc:0.92182
[6132] train-auc:0.92183
[6133] train-auc:0.92183
[6134] train-auc:0.92183
[6135] train-auc:0.92183
[6136] train-auc:0.92184
[6137] train-auc:0.92184
[6138] train-auc:0.92184
[6139] train-auc:0.92184
[6140] train-auc:0.92185
[6141] train-auc:0.92185
[6142] train-auc:0.92185
[6143] train-auc:0.92185
[6144] train-auc:0.92186
[6145] train-auc:0.92186
[6146] train-auc:0.92186
[6147] train-auc:0.92187
[6148] train-auc:0.92187
[6149] train-auc:0.92187
[6150] train-auc:0.92187
[6151] train-auc:0.92188
[6152] train-auc:0.92188
[6153] train-auc:0.92188
[6154] train-auc:0.92188
[6155] train-auc:0.92189
[6156] train-auc:0.92189
User_id Coupon_id label
count 1.128030e+05 112803.000000 112803.000000
mean 3.684618e+06 9064.658006 0.085931
std 2.126358e+06 4147.283515 0.165224
min 2.090000e+02 3.000000 0.000000
25% 1.843824e+06 5035.000000 0.009209
50% 3.683073e+06 9983.000000 0.025507
75% 5.525176e+06 13602.000000 0.064142
max 7.361024e+06 14045.000000 1.000000
0.8090085946857051
2020-03-06 15:02:05
time costed is: 6875 s
xgboost_demo
In [4]:
import pandas as pd
import numpy as np
import pickle
import xgboost as xgb
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.model_selection import train_test_split
Load the processed feature data
In [5]:
dataset1 = pd.read_csv('./GenerateData1.csv')
dataset2 = pd.read_csv('./GenerateData2.csv')
dataset3 = pd.read_csv('./GenerateData3.csv')
Replace label values of -1 with 0
In [6]:
dataset1.label.replace(-1, 0, inplace=True)
dataset2.label.replace(-1, 0, inplace=True)
Deduplicate, concatenate the tables, and drop the unneeded columns
In [7]:
dataset1.drop_duplicates(inplace=True)
dataset2.drop_duplicates(inplace=True)
dataset12 = pd.concat([dataset1, dataset2], axis=0)
dataset12_y = dataset12.label
dataset12_x = dataset12.drop(['user_id', 'label', 'day_gap_before', 'coupon_id', 'day_gap_after'], axis=1)
In [8]:
dataset3.drop_duplicates(inplace=True)
dataset3_preds = dataset3[['user_id', 'coupon_id', 'date_received']]
dataset3_x = dataset3.drop(['user_id', 'coupon_id', 'date_received', 'day_gap_before', 'day_gap_after'], axis=1)
# Convert the data to DMatrix format
dataTrain = xgb.DMatrix(dataset12_x, label=dataset12_y)
dataTest = xgb.DMatrix(dataset3_x)
In [9]:
def myauc(test):
    testgroup = test.groupby(['coupon_id'])
    aucs = []
    for i in testgroup:
        tmpdf = i[1]
        if len(tmpdf['label'].unique()) != 2:
            continue
        fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['pred'], pos_label=1)
        aucs.append(auc(fpr, tpr))
    return np.average(aucs)
XGBoost model, score: 0.7885
In [11]:
params = {'booster': 'gbtree',
          'objective': 'rank:pairwise',
          'eval_metric': 'auc',
          'gamma': 0.1,
          'min_child_weight': 1.1,
          'max_depth': 5,
          'lambda': 10,
          'subsample': 0.7,
          'colsample_bytree': 0.7,
          'colsample_bylevel': 0.7,
          'eta': 0.01,
          'tree_method': 'exact',
          'seed': 0,
          'nthread': 12
          }
In [12]:
watchlist = [(dataTrain, 'train')]
model = xgb.train(params, dataTrain, num_boost_round=3500, evals=watchlist)
[0] train-auc:0.84293
[1] train-auc:0.84883
[2] train-auc:0.85255
[3] train-auc:0.85333
[4] train-auc:0.85568
[5] train-auc:0.85745
[6] train-auc:0.85878
[7] train-auc:0.85870
[8] train-auc:0.85880
[9] train-auc:0.85984
[10] train-auc:0.85987
...... (output truncated)
[3487] train-auc:0.90753
[3488] train-auc:0.90754
[3489] train-auc:0.90754
[3490] train-auc:0.90755
[3491] train-auc:0.90755
[3492] train-auc:0.90755
[3493] train-auc:0.90755
[3494] train-auc:0.90756
[3495] train-auc:0.90756
[3496] train-auc:0.90756
[3497] train-auc:0.90757
[3498] train-auc:0.90757
[3499] train-auc:0.90757
In [13]:
model.save_model('./xgbmodel')
In [12]:
model = xgb.Booster(params)
In [13]:
model.load_model('./xgbmodel')
In [17]:
dataset3_preds1 = dataset3_preds.copy()
dataset3_preds1['label'] = model.predict(dataTest)
In [19]:
dataset3_preds1.label.head()
Out[19]:
0 -1.927854
1 0.834743
2 -2.466245
3 -1.992080
4 -0.544283
Name: label, dtype: float32
In [21]:
dataset3_preds1.label = MinMaxScaler(copy=True, feature_range=(0, 1)).fit_transform(
dataset3_preds1.label.values.reshape(-1, 1))
dataset3_preds1.sort_values(by=['coupon_id', 'label'], inplace=True)
dataset3_preds1.to_csv("./xgb_preds.csv", index=None, header=None)
print(dataset3_preds1.describe())
user_id coupon_id date_received label
count 1.128030e+05 112803.000000 1.128030e+05 112803.000000
mean 3.684618e+06 9064.658006 2.016072e+07 0.374507
std 2.126358e+06 4147.283515 9.017693e+00 0.130249
min 2.090000e+02 3.000000 2.016070e+07 0.000000
25% 1.843824e+06 5035.000000 2.016071e+07 0.292860
50% 3.683073e+06 9983.000000 2.016072e+07 0.355278
75% 5.525176e+06 13602.000000 2.016072e+07 0.443395
max 7.361024e+06 14045.000000 2.016073e+07 1.000000
In [22]:
dataset3_preds1.label.head()
Out[22]:
88774 0.201625
58111 0.210430
25100 0.218126
79286 0.224153
59129 0.241302
Name: label, dtype: float32
In [17]:
model = xgb.Booster()
model.load_model('./xgbmodel')
In [18]:
temp = dataset12[['coupon_id', 'label']].copy()
temp['pred'] = model.predict(xgb.DMatrix(dataset12_x))
temp.pred = MinMaxScaler(copy=True, feature_range=(0, 1)).fit_transform(temp['pred'].values.reshape(-1, 1))
print(myauc(temp))
0.7733047598560868
Training the individual models
In [33]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
from datetime import date
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.linear_model import SGDClassifier, LogisticRegression
import lightgbm as lgb
Load the data
In [3]:
def get_processed_data():
    dataset1 = pd.read_csv('./GenerateData1.csv')
    dataset2 = pd.read_csv('./GenerateData2.csv')
    dataset3 = pd.read_csv('./GenerateData3.csv')

    dataset1.label.replace(-1, 0, inplace=True)
    dataset2.label.replace(-1, 0, inplace=True)

    dataset1.drop_duplicates(inplace=True)
    dataset2.drop_duplicates(inplace=True)
    dataset3.drop_duplicates(inplace=True)

    # Concatenate along axis=0 (stack the rows); both datasets went through the same
    # feature engineering, so their columns line up
    dataset12 = pd.concat([dataset1, dataset2], axis=0)
    dataset12.fillna(-1, inplace=True)
    # dataset3.fillna(0, inplace=True)
    return dataset12, dataset3
In [4]:
dataset12, dataset3 = get_processed_data()
In [5]:
predict_dataset = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
dataset12_label = dataset12.label
# Reduce dimensionality: drop columns that are not used as features
dataset12_x = dataset12.drop(['user_id','label','coupon_id','day_gap_before','day_gap_after'],axis=1)
dataset3.fillna(-1, inplace=True)
dataset3_x = dataset3.drop(['user_id','coupon_id','date_received','day_gap_before','day_gap_after'],axis=1)
Data split
In [6]:
x_train, x_test, y_train, y_test = train_test_split(dataset12_x, dataset12_label, test_size=0.25, random_state=88)
In [7]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape
Out[7]:
((328240, 52), (109414, 52), (328240,), (109414,))
Model training
Random forest, score: 0.7790
In [10]:
model = RandomForestClassifier(n_estimators=190,
                               criterion='gini',
                               bootstrap=True,
                               max_depth=15,
                               max_features=24,
                               min_samples_leaf=5,
                               oob_score=True,
                               random_state=0,
                               n_jobs=-1)
In [11]:
model.fit(x_train, y_train)
Out[11]:
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
criterion='gini', max_depth=15, max_features=24,
max_leaf_nodes=None, max_samples=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=5, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=190,
n_jobs=-1, oob_score=True, random_state=0, verbose=0,
warm_start=False)
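Since the forest was fitted with oob_score=True, the out-of-bag estimate is available after fitting and gives a quick sanity check before scoring the held-out split (the exact value is not reproduced here):
# Out-of-bag accuracy estimate, computed from the samples each tree did not see
print("OOB score:", model.oob_score_)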
In [13]:
model.score(x_test, y_test)
Out[13]:
0.9399071416820517
In [14]:
y_predict_proba = model.predict_proba(x_test)
In [17]:
y_predict_proba[:, 1].itemsize
Out[17]:
8
In [20]:
print("AUC",roc_auc_score(y_test,y_predict_proba[:,1]))
AUC 0.8979076720483452
In [21]:
dataset_preds = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
dataset_preds['label'] = model.predict_proba(dataset3_x)[:, 1]
dataset_preds.sort_values(by=['coupon_id', 'label'], inplace=True)
dataset_preds.to_csv("rdf_preds1.csv", index=None, header=None)
GBDT score:0.7297
In [24]:
model = GradientBoostingClassifier(learning_rate=0.1,
                                   n_estimators=190,
                                   min_samples_split=5,
                                   min_samples_leaf=5,
                                   max_depth=15,
                                   random_state=0,
                                   max_features=24)
model.fit(x_train, y_train)
Out[24]:
GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
learning_rate=0.1, loss='deviance', max_depth=15,
max_features=24, max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=5, min_samples_split=5,
min_weight_fraction_leaf=0.0, n_estimators=190,
n_iter_no_change=None, presort='deprecated',
random_state=0, subsample=1.0, tol=0.0001,
validation_fraction=0.1, verbose=0,
warm_start=False)
In [25]:
model.score(x_test, y_test)
Out[25]:
0.9373754729742081
In [32]:
x_test.shape
Out[32]:
(109414, 52)
In [28]:
y_predict_proba = model.predict_proba(x_test)
print("AUC准确率:", roc_auc_score(y_test,y_predict_proba[:,1]))
AUC准确率: 0.8692195079846718
In [30]:
y_predict_proba
Out[30]:
array([[4.16554643e-01, 5.83445357e-01],
[9.98395049e-01, 1.60495139e-03],
[9.87593646e-01, 1.24063544e-02],
...,
[9.46224733e-01, 5.37752665e-02],
[8.65366794e-01, 1.34633206e-01],
[9.99404371e-01, 5.95628598e-04]])
In [31]:
y_predict_proba.itemsize
Out[31]:
8
In [29]:
dataset_preds = dataset3[['user_id','coupon_id','date_received']]
dataset_preds['label'] = model.predict_proba(dataset3_x)[:, 1]
dataset_preds.sort_values(by=['coupon_id','label'],inplace=True)
dataset_preds.to_csv("gbdt_preds2.csv",index=None,header=None)
lightGBM score:0.7869
In [34]:
# 1. boosting_type='gbdt'    # type of boosting: gbdt, dart, goss, rf
# 2. num_leaves=32           # maximum number of leaves per tree; roughly 2^(max_depth) in xgboost terms
# 3. max_depth=-1            # maximum tree depth (-1 means no limit)
# 4. learning_rate           # learning rate
# 5. n_estimators=10         # number of boosted trees, i.e. training rounds
# 6. subsample=1.0           # row (sample) subsampling ratio
# 7. colsample_bytree=1.0    # column (feature) subsampling ratio per tree
# 8. subsample_freq=1        # frequency of row subsampling
# 9. reg_alpha=0.0           # L1 regularization coefficient
# 10. reg_lambda=0.0         # L2 regularization coefficient
# 11. random_state=None      # random seed
# 12. n_jobs=-1              # number of threads to run in parallel
# 13. silent=True            # whether to suppress training log output
# 14. min_split_gain=0.0     # minimum gain required to make a split
# 15. min_child_weight=0.001 # minimum weight required in a child node
# 16. sub_feature            # LightGBM randomly picks this fraction of features per iteration (tree), e.g. 0.7 keeps 70%
model = lgb.LGBMClassifier(
    learning_rate=0.01,
    boosting_type='gbdt',
    objective='binary',
    metric='logloss',
    max_depth=5,
    sub_feature=0.7,
    num_leaves=3,
    colsample_bytree=0.7,
    n_estimators=5000,
    early_stop=50)
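Note that early_stop=50 in the constructor has no effect without a validation set; in the sklearn-style API, early stopping is normally wired through fit. A minimal sketch, assuming the x_train/x_test split from above and a recent lightgbm that provides the lgb.early_stopping callback (this is not the configuration behind the score reported here):
# Sketch: early stopping needs an eval_set; older lightgbm versions use
# early_stopping_rounds in fit() instead of the callback shown here
model.fit(x_train, y_train,
          eval_set=[(x_test, y_test)],
          eval_metric='auc',
          callbacks=[lgb.early_stopping(stopping_rounds=50),
                     lgb.log_evaluation(period=100)])  # log every 100 rounds
print('best iteration:', model.best_iteration_)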
In [35]:
model.fit(x_train, y_train)
Out[35]:
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.7,
early_stop=50, importance_type='split', learning_rate=0.01,
max_depth=5, metric='logloss', min_child_samples=20,
min_child_weight=0.001, min_split_gain=0.0, n_estimators=5000,
n_jobs=-1, num_leaves=3, objective='binary', random_state=None,
reg_alpha=0.0, reg_lambda=0.0, silent=True, sub_feature=0.7,
subsample=1.0, subsample_for_bin=200000, subsample_freq=0,
verbose=-1)
In [36]:
model.score(x_test, y_test)
Out[36]:
0.9350448754272762
In [37]:
y_predict_proba = model.predict_proba(x_test)
In [38]:
print("AUC准确率:", roc_auc_score(y_test,y_predict_proba[:,1]))
AUC准确率: 0.8819782907036887
In [39]:
dataset_preds = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
dataset_preds['label'] = model.predict_proba(dataset3_x)[:, 1]
dataset_preds.sort_values(by=['coupon_id', 'label'], inplace=True)
dataset_preds.to_csv("lightGBM_preds.csv", index=None, header=None)
Logistic regression, score: 0.6932
In [49]:
model = LogisticRegression(max_iter=1000, n_jobs=-1, l1_ratio=0.01, random_state=22)
In [50]:
model.fit(x_train, y_train)
Out[50]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=0.01, max_iter=1000,
multi_class='auto', n_jobs=-1, penalty='l2', random_state=22,
solver='lbfgs', tol=0.0001, verbose=0, warm_start=False)
In [51]:
model.score(x_test, y_test)
Out[51]:
0.9256036704626465
In [52]:
y_predict_proba = model.predict_proba(x_test)
In [56]:
print("AUC准确率:", roc_auc_score(y_test,y_predict_proba[:,1]))
AUC准确率: 0.8190074583111724
In [57]:
dataset_preds = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
dataset_preds['label'] = model.predict_proba(dataset3_x)[:, 1]
dataset_preds.sort_values(by=['coupon_id', 'label'], inplace=True)
dataset_preds.to_csv("LOG_preds1.csv", index=None, header=None)
Logistic regression with SGDClassifier, score: 0.6119
In [58]:
# fit_intercept: whether to fit the intercept (bias) term
model = SGDClassifier(
    loss='log',
    penalty='elasticnet',
    fit_intercept=True,
    max_iter=100,
    shuffle=True,
    alpha=0.01,
    l1_ratio=0.01,
    n_jobs=1)
In [59]:
model.fit(x_train, y_train)
Out[59]:
SGDClassifier(alpha=0.01, average=False, class_weight=None,
early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
l1_ratio=0.01, learning_rate='optimal', loss='log', max_iter=100,
n_iter_no_change=5, n_jobs=1, penalty='elasticnet', power_t=0.5,
random_state=None, shuffle=True, tol=0.001,
validation_fraction=0.1, verbose=0, warm_start=False)
In [60]:
model.score(x_test, y_test)
Out[60]:
0.8991719524009725
In [61]:
y_predict_proba = model.predict_proba(x_test)
In [62]:
print("AUC准确率:", roc_auc_score(y_test,y_predict_proba[:,1]))
AUC准确率: 0.7653896044259063
In [63]:
dataset_preds = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
dataset_preds['label'] = model.predict_proba(dataset3_x)[:, 1]
dataset_preds.sort_values(by=['coupon_id', 'label'], inplace=True)
dataset_preds.to_csv("SGD_preds1.csv", index=None, header=None)
XGBoost XGBClassifier, score: 0.7551
In [64]:
from xgboost import XGBClassifier
In [65]:
# Tune one parameter at a time while keeping the others fixed, similar to how the random forest was tuned (see the grid-search sketch at the end of this section)
model = XGBClassifier(max_depth=15, learning_rate=0.01,eta=1, gamma=0, n_jobs=-1)
In [66]:
model.fit(x_train, y_train)
Out[66]:
XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, eta=1, gamma=0, gpu_id=-1,
importance_type='gain', interaction_constraints=None,
learning_rate=0.01, max_delta_step=0, max_depth=15,
min_child_weight=1, missing=nan, monotone_constraints=None,
n_estimators=100, n_jobs=-1, num_parallel_tree=1,
objective='binary:logistic', random_state=0, reg_alpha=0,
reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
validate_parameters=False, verbosity=None)
In [67]:
model.score(x_test, y_test)
Out[67]:
0.939221671815307
In [68]:
y_predict_proba = model.predict_proba(x_test)
In [69]:
print("AUC准确率:", roc_auc_score(y_test,y_predict_proba[:,1]))
AUC准确率: 0.892734335681119
In [70]:
dataset_preds = dataset3[['user_id', 'coupon_id', 'date_received']].copy()
dataset_preds['label'] = model.predict_proba(dataset3_x)[:, 1]
dataset_preds.sort_values(by=['coupon_id', 'label'], inplace=True)
dataset_preds.to_csv("XGBC_preds1.csv", index=None, header=None)