O2O使用券

import os, sys, pickle

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.dates as mdates

import seaborn as sns

from datetime import date

from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

import xgboost as xgb
# import lightgbm as lgb

数据分析

# Load the data
# NOTE(review): the CSV paths below are absolute paths on the author's Windows
# desktop; point them at your local copy of the dataset before running.
# dfoff  - offline training records
# dftest - offline test records (the frame predicted for submission further down)
# dfon   - online training records (not used by the code visible in this notebook)
dfoff = pd.read_csv(r'C:\Users\dell\Desktop\o2o使用券\ccf_offline_stage1_train.csv',engine = 'python')
dftest = pd.read_csv(r'C:\Users\dell\Desktop\o2o使用券\ccf_offline_stage1_test_revised.csv',engine = 'python')
dfon = pd.read_csv(r'C:\Users\dell\Desktop\o2o使用券\ccf_online_stage1_train.csv',engine = 'python')
print('data read end.')
data read end.
dfoff.head()#查看前五行数据
User_idMerchant_idCoupon_idDiscount_rateDistanceDate_receivedDate
014394082632NaNNaN0.0NaN20160217.0
11439408466311002.0150:201.020160528.0NaN
2143940826328591.020:10.020160217.0NaN
3143940826321078.020:10.020160319.0NaN
4143940826328591.020:10.020160613.0NaN
dfoff.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754884 entries, 0 to 1754883
Data columns (total 7 columns):
User_id          int64
Merchant_id      int64
Coupon_id        float64
Discount_rate    object
Distance         float64
Date_received    float64
Date             float64
dtypes: float64(4), int64(2), object(1)
memory usage: 93.7+ MB
# print('有优惠券,购买商品条数', dfoff[(dfoff['Date_received'] != 'nan') & (dfoff['Date'] != 'nan')].shape[0])
# print('无优惠券,购买商品条数', dfoff[(dfoff['Date_received'] == 'nan') & (dfoff['Date'] != 'nan')].shape[0])
# print('有优惠券,不购买商品条数', dfoff[(dfoff['Date_received'] != 'nan') & (dfoff['Date'] == 'nan')].shape[0])
# print('无优惠券,不购买商品条数', dfoff[(dfoff['Date_received'] == 'nan') & (dfoff['Date'] == 'nan')].shape[0])
# Row counts for the four combinations of "coupon received" (Date_received is
# not null) and "purchase made" (Date is not null), in this order:
#   coupon + purchase, no coupon + purchase, coupon + no purchase, neither.
print('有优惠券,购买商品条数', dfoff[(pd.notnull(dfoff['Date_received'])) & (pd.notnull(dfoff['Date']))].shape[0])
print('无优惠券,购买商品条数',  dfoff[(pd.isnull(dfoff['Date_received'])) & (pd.notnull(dfoff['Date']))].shape[0])
print('有优惠券,不购买商品条数',  dfoff[(pd.notnull(dfoff['Date_received'])) & (pd.isnull(dfoff['Date']))].shape[0])
print('无优惠券,不购买商品条数', dfoff[(pd.isnull(dfoff['Date_received'])) & (pd.isnull(dfoff['Date']))].shape[0])
有优惠券,购买商品条数 75382
无优惠券,购买商品条数 701602
有优惠券,不购买商品条数 977900
无优惠券,不购买商品条数 0
# Users that appear in the test set but never in the training set.
# The set difference is (test - train), so the message must say exactly that.
print('1. User_id in test set but not in training set', set(dftest['User_id']) - set(dfoff['User_id']))
# Merchants that appear in the test set but never in the training set.
print('2. Merchant_id in test set but not in training set', set(dftest['Merchant_id']) - set(dfoff['Merchant_id']))
1. User_id in training set but not in test set {2495873, 1286474}
2. Merchant_id in training set but not in test set {5920}

优惠券和距离特征

print('Discount_rate类型:',dfoff['Discount_rate'].unique())
Discount_rate类型: [nan '150:20' '20:1' '200:20' '30:5' '50:10' '10:5' '100:10' '200:30'
 '20:5' '30:10' '50:5' '150:10' '100:30' '200:50' '100:50' '300:30'
 '50:20' '0.9' '10:1' '30:1' '0.95' '100:5' '5:1' '100:20' '0.8' '50:1'
 '200:10' '300:20' '100:1' '150:30' '300:50' '20:10' '0.85' '0.6' '150:50'
 '0.75' '0.5' '200:5' '0.7' '30:20' '300:10' '0.2' '50:30' '200:100'
 '150:5']
print('Distance类型:',dfoff['Distance'].unique())
Distance类型: [ 0.  1. nan  2. 10.  4.  7.  9.  3.  5.  6.  8.]

把“满 xx 减 yy”类型(`xx:yy`)的优惠券转换成折扣率 `1 - yy/xx`,同时建立折扣券相关的特征:discount_rate、discount_man、discount_jian、discount_type

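# 1. Convert "spend xx, get yy off" coupons (`xx:yy`) into a discount rate `1 - yy/xx`, and build the coupon features `discount_rate, discount_man, discount_jian, discount_type`
# 2. Convert Distance from float (NaN when missing) to int, encoding missing values as -1
# convert Discount_rate and Distance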
def getDiscountType(row):
    """Classify a raw ``Discount_rate`` value.

    Args:
        row: A single ``Discount_rate`` cell, e.g. ``'150:20'``, ``'0.9'``
            or a missing value (NaN / None).

    Returns:
        ``np.nan`` when the value is missing, ``1`` for a threshold coupon
        written as ``'xx:yy'``, and ``0`` for a plain rate.
    """
    if pd.isnull(row):
        return np.nan
    # Go through str() like the sibling helpers do, so a rate that was parsed
    # as a number (e.g. 0.9) is classified instead of raising TypeError.
    return 1 if ':' in str(row) else 0

def convertRate(row):
    """Turn a raw discount value into a multiplicative price rate.

    A missing value maps to ``1.0`` (no discount), ``'xx:yy'`` maps to
    ``1 - yy/xx``, and anything else is read as a float rate directly.
    """
    if pd.isnull(row):
        return 1.0
    if ':' not in str(row):
        return float(row)
    parts = row.split(':')
    reduction = float(parts[1])
    threshold = float(parts[0])
    return 1.0 - reduction / threshold

def getDiscountMan(row):
    """Return the spending threshold ``xx`` of an ``'xx:yy'`` coupon.

    Values without a ``':'`` (plain rates, missing values) yield ``0``.
    """
    if ':' not in str(row):
        return 0
    threshold = row.split(':')[0]
    return int(threshold)

def getDiscountJian(row):
    """Return the reduction amount ``yy`` of an ``'xx:yy'`` coupon.

    Values without a ``':'`` (plain rates, missing values) yield ``0``.
    """
    return int(row.split(':')[1]) if ':' in str(row) else 0
print("tool is ok.")
tool is ok.
def processData(df):
    """Add the engineered discount and distance columns to ``df``.

    The frame is modified in place and also returned. New columns:
    ``discount_rate``, ``discount_man``, ``discount_jian``, ``discount_type``
    (all derived from ``Discount_rate``) and ``distance`` (``Distance`` with
    missing values replaced by -1, cast to int). The unique values of
    ``discount_rate`` and ``distance`` are printed as a quick sanity check.
    """
    raw_discount = df['Discount_rate']
    derived_columns = (
        ('discount_rate', convertRate),
        ('discount_man', getDiscountMan),
        ('discount_jian', getDiscountJian),
        ('discount_type', getDiscountType),
    )
    for column, converter in derived_columns:
        df[column] = raw_discount.apply(converter)
    print(df['discount_rate'].unique())

    # Missing distances become -1 so the column can hold plain integers.
    filled_distance = df['Distance'].fillna(-1)
    df['distance'] = filled_distance.astype(int)
    print(df['distance'].unique())
    return df

dfoff = processData(dfoff)
dftest = processData(dftest)
[1.         0.86666667 0.95       0.9        0.83333333 0.8
 0.5        0.85       0.75       0.66666667 0.93333333 0.7
 0.6        0.96666667 0.98       0.99       0.975      0.33333333
 0.2        0.4       ]
[ 0  1 -1  2 10  4  7  9  3  5  6  8]
[0.83333333 0.9        0.96666667 0.8        0.95       0.75
 0.98       0.5        0.86666667 0.6        0.66666667 0.7
 0.85       0.33333333 0.94       0.93333333 0.975      0.99      ]
[ 1 -1  5  2  0 10  3  6  7  4  9  8]
dfoff.head(2)
User_idMerchant_idCoupon_idDiscount_rateDistanceDate_receivedDatediscount_ratediscount_mandiscount_jiandiscount_typedistance
014394082632NaNNaN0.0NaN20160217.01.00000000NaN0
11439408466311002.0150:201.020160528.0NaN0.866667150201.01
dftest.head(2)
User_idMerchant_idCoupon_idDiscount_rateDistanceDate_receiveddiscount_ratediscount_mandiscount_jiandiscount_typedistance
04129537450998330:05:001.0201607120.83333330511
169493781300342930:05:00NaN201607060.8333333051-1

时间处理: 优惠券获得时间,优惠券使用时间

# date_received = dfoff['Date_received'].unique()
# date_received = sorted(date_received[date_received != 'null'])

# date_buy = dfoff['Date'].unique()
# date_buy = sorted(date_buy[date_buy !='null'])

# # date_buy = sorted(dfoff[dfoff['Date'] != 'null']['Date'])
# date_buy = sorted(dfoff[dfoff['Date'] != 'null']['Date'])
# # print()

# print('优惠券收到日期从',date_received[0],'到', date_received[-1])
# print('消费日期从', date_buy[0], '到', date_buy[-1])
# Coupons received per day. size() counts rows; the previous count() on the
# 'Date' column skipped rows whose Date is NaN (coupons that were never used),
# which made this frame identical to buybydate below.
couponbydate = (
    dfoff[dfoff['Date_received'].notna()]
    .groupby('Date_received')
    .size()
    .reset_index(name='count')
)
print(couponbydate.head())
# Coupons received AND used per day: rows are pre-filtered to a non-null Date,
# so counting the 'Date' column here gives the number of used coupons.
buybydate = dfoff[(dfoff['Date'].notna()) & (dfoff['Date_received'].notna())][['Date_received', 'Date']].groupby(['Date_received'], as_index=False).count()
buybydate.columns = ['Date_received','count']

# dfoff['Date_received'].notna()
   Date_received  count
0     20160101.0     74
1     20160102.0     67
2     20160103.0     74
3     20160104.0     98
4     20160105.0    107
# sns.set_style('ticks')
# sns.set_context("notebook", font_scale= 1.4)
# plt.figure(figsize = (12,8))
# date_received_dt = pd.to_datetime('Date_received', format='%Y%m%d %H')
# # date_received_dt = pd.to_datetime('Date_received')

# plt.subplot(211)
# plt.bar(date_received_dt, couponbydate['count'], label = 'number of coupon received' )
# plt.bar(date_received_dt, buybydate['count'], label = 'number of coupon used')
# plt.yscale('log')
# plt.ylabel('Count')
# plt.legend()

# plt.subplot(212)
# plt.bar(date_received_dt, buybydate['count']/couponbydate['count'])
# plt.ylabel('Ratio(coupon used/coupon received)')
# plt.tight_layout()

新建关于星期的特征

  • weekday:领券日期是星期几,取值 {1,2,3,4,5,6,7}(1 表示周一,7 表示周日)
  • weekday_type:周一到周五为 0,周六、周日为 1
def getWeekday(row):
    """Return the ISO-style weekday (1 = Monday ... 7 = Sunday) of a date.

    Args:
        row: A ``YYYYMMDD`` date as a string (``'20160217'``), a stringified
            float (``'20160217.0'``) or a number. Missing markers (NaN, None,
            ``'nan'``, ``'null'``, empty string) are accepted.

    Returns:
        An int in 1..7, or ``np.nan`` when the date is missing.
    """
    if pd.isnull(row):
        return np.nan
    text = str(row).strip()
    # The data has used both 'nan' (stringified NaN) and 'null' as missing
    # markers; treat them alike instead of crashing in int().
    if text.lower() in ('nan', 'null', ''):
        return np.nan
    # Only the first 8 characters are read, so a trailing '.0' is harmless.
    return date(int(text[0:4]), int(text[4:6]), int(text[6:8])).weekday() + 1
# weekday: day of the week on which the coupon was received (1 = Monday ...
# 7 = Sunday). Date_received is stringified first because getWeekday slices
# the year/month/day out of the text; missing dates become NaN.
dfoff['weekday'] = dfoff['Date_received'].astype(str).apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].astype(str).apply(getWeekday)

# weekday_type: 1 for Saturday/Sunday, 0 otherwise. A NaN weekday is not in
# [6, 7], so rows without a coupon also get 0.
dfoff['weekday_type'] = dfoff['weekday'].apply(lambda x : 1  if x in [6,7] else 0 )
dftest['weekday_type'] = dftest['weekday'].apply(lambda x : 1  if x in [6,7] else 0 )
# Column names for the one-hot encoded weekday: weekday_1 ... weekday_7.
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
print(weekdaycols)
['weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']
# One-hot encode the weekday column. The replace('null', np.nan) only matters
# if 'weekday' still contains the string 'null'.
# NOTE(review): renaming the dummy columns to weekdaycols assumes all seven
# weekday values occur in each frame; with fewer distinct values the column
# assignment below fails with a length mismatch - confirm for new data.
tmpdf = pd.get_dummies(dfoff['weekday'].replace('null', np.nan))
tmpdf1 = pd.get_dummies(dftest['weekday'].replace('null', np.nan))

tmpdf.columns = weekdaycols
tmpdf1.columns = weekdaycols

# Attach the seven indicator columns to the original frames.
dfoff[weekdaycols] = tmpdf
dftest[weekdaycols] = tmpdf1

dfoff[weekdaycols].head()

dftest[weekdaycols].head()
# dfoff.info()
weekday_1weekday_2weekday_3weekday_4weekday_5weekday_6weekday_7
00100000
10010000
20010000
30010000
40000100

对 收到优惠券的行进行分类

  • Date_received 非空、Date 非空,且 Date - Date_received <= 15 天:y = 1
  • Date_received 为空(未领取优惠券):y = -1
  • 其他情况(领券后未消费,或超过 15 天才消费):y = 0

def _parse_yyyymmdd(value):
    """Parse a YYYYMMDD date given as int, float (20160217.0) or string."""
    # Normalise to an 8-digit string first: handing a float straight to
    # pd.to_datetime with format='%Y%m%d' is pandas-version dependent.
    return pd.to_datetime(str(int(float(value))), format='%Y%m%d')


def label(row):
    """Compute the training label for one record.

    Args:
        row: A mapping (e.g. a DataFrame row) with ``'Date_received'`` and
            ``'Date'`` holding YYYYMMDD dates or missing values.

    Returns:
        ``-1`` when no coupon was received, ``1`` when the purchase date is at
        most 15 days after the coupon was received, and ``0`` otherwise
        (coupon never used, or used after more than 15 days).
    """
    received = row['Date_received']
    if pd.isnull(received):
        return -1
    used = row['Date']
    if pd.isnull(used):
        return 0
    elapsed = _parse_yyyymmdd(used) - _parse_yyyymmdd(received)
    return 1 if elapsed <= pd.Timedelta(15, 'D') else 0

dfoff['label'] = dfoff.apply(label, axis = 1)

print("end")
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
128000
129000
130000
131000
132000
133000
134000
135000
136000
137000
138000
139000
140000
141000
142000
143000
144000
145000
146000
147000
148000
149000
150000
151000
152000
153000
154000
155000
156000
157000
158000
159000
160000
161000
162000
163000
164000
165000
166000
167000
168000
169000
170000
171000
172000
173000
174000
175000
176000
177000
178000
179000
180000
181000
182000
183000
184000
185000
186000
187000
188000
189000
190000
191000
192000
193000
194000
195000
196000
197000
198000
199000
200000
201000
202000
203000
204000
205000
206000
207000
208000
209000
210000
211000
212000
213000
214000
215000
216000
217000
218000
219000
220000
221000
222000
223000
224000
225000
226000
227000
228000
229000
230000
231000
232000
233000
234000
235000
236000
237000
238000
239000
240000
241000
242000
243000
244000
245000
246000
247000
248000
249000
250000
251000
252000
253000
254000
255000
256000
257000
258000
259000
260000
261000
262000
263000
264000
265000
266000
267000
268000
269000
270000
271000
272000
273000
274000
275000
276000
277000
278000
279000
280000
281000
282000
283000
284000
285000
286000
287000
288000
289000
290000
291000
292000
293000
294000
295000
296000
297000
298000
299000
300000
301000
302000
303000
304000
305000
306000
307000
308000
309000
310000
311000
312000
313000
314000
315000
316000
317000
318000
319000
320000
321000
322000
323000
324000
325000
326000
327000
328000
329000
330000
331000
332000
333000
334000
335000
336000
337000
338000
339000
340000
341000
342000
343000
344000
345000
346000
347000
348000
349000
350000
351000
352000
353000
354000
355000
356000
357000
358000
359000
360000
361000
362000
363000
364000
365000
366000
367000
368000
369000
370000
371000
372000
373000
374000
375000
376000
377000
378000
379000
380000
381000
382000
383000
384000
385000
386000
387000
388000
389000
390000
391000
392000
393000
394000
395000
396000
397000
398000
399000
400000
401000
402000
403000
404000
405000
406000
407000
408000
409000
410000
411000
412000
413000
414000
415000
416000
417000
418000
419000
420000
421000
422000
423000
424000
425000
426000
427000
428000
429000
430000
431000
432000
433000
434000
435000
436000
437000
438000
439000
440000
441000
442000
443000
444000
445000
446000
447000
448000
449000
450000
451000
452000
453000
454000
455000
456000
457000
458000
459000
460000
461000
462000
463000
464000
465000
466000
467000
468000
469000
470000
471000
472000
473000
474000
475000
476000
477000
478000
479000
480000
481000
482000
483000
484000
485000
486000
487000
488000
489000
490000
491000
492000
493000
494000
495000
496000
497000
498000
499000
500000
501000
502000
503000
504000
505000
506000
507000
508000
509000
510000
511000
512000
513000
514000
515000
516000
517000
518000
519000
520000
521000
522000
523000
524000
525000
526000
527000
528000
529000
530000
531000
532000
533000
534000
535000
536000
537000
538000
539000
540000
541000
542000
543000
544000
545000
546000
547000
548000
549000
550000
551000
552000
553000
554000
555000
556000
557000
558000
559000
560000
561000
562000
563000
564000
565000
566000
567000
568000
569000
570000
571000
572000
573000
574000
575000
576000
577000
578000
579000
580000
581000
582000
583000
584000
585000
586000
587000
588000
589000
590000
591000
592000
593000
594000
595000
596000
597000
598000
599000
600000
601000
602000
603000
604000
605000
606000
607000
608000
609000
610000
611000
612000
613000
614000
615000
616000
617000
618000
619000
620000
621000
622000
623000
624000
625000
626000
627000
628000
629000
630000
631000
632000
633000
634000
635000
636000
637000
638000
639000
640000
641000
642000
643000
644000
645000
646000
647000
648000
649000
650000
651000
652000
653000
654000
655000
656000
657000
658000
659000
660000
661000
662000
663000
664000
665000
666000
667000
668000
669000
670000
671000
672000
673000
674000
675000
676000
677000
678000
679000
680000
681000
682000
683000
684000
685000
686000
687000
688000
689000
690000
691000
692000
693000
694000
695000
696000
697000
698000
699000
700000
701000
702000
703000
704000
705000
706000
707000
708000
709000
710000
711000
712000
713000
714000
715000
716000
717000
718000
719000
720000
721000
722000
723000
724000
725000
726000
727000
728000
729000
730000
731000
732000
733000
734000
735000
736000
737000
738000
739000
740000
741000
742000
743000
744000
745000
746000
747000
748000
749000
750000
751000
752000
753000
754000
755000
756000
757000
758000
759000
760000
761000
762000
763000
764000
765000
766000
767000
768000
769000
770000
771000
772000
773000
774000
775000
776000
777000
778000
779000
780000
781000
782000
783000
784000
785000
786000
787000
788000
789000
790000
791000
792000
793000
794000
795000
796000
797000
798000
799000
800000
801000
802000
803000
804000
805000
806000
807000
808000
809000
810000
811000
812000
813000
814000
815000
816000
817000
818000
819000
820000
821000
822000
823000
824000
825000
826000
827000
828000
829000
830000
831000
832000
833000
834000
835000
836000
837000
838000
839000
840000
841000
842000
843000
844000
845000
846000
847000
848000
849000
850000
851000
852000
853000
854000
855000
856000
857000
858000
859000
860000
861000
862000
863000
864000
865000
866000
867000
868000
869000
870000
871000
872000
873000
874000
875000
876000
877000
878000
879000
880000
881000
882000
883000
884000
885000
886000
887000
888000
889000
890000
891000
892000
893000
894000
895000
896000
897000
898000
899000
900000
901000
902000
903000
904000
905000
906000
907000
908000
909000
910000
911000
912000
913000
914000
915000
916000
917000
918000
919000
920000
921000
922000
923000
924000
925000
926000
927000
928000
929000
930000
931000
932000
933000
934000
935000
936000
937000
938000
939000
940000
941000
942000
943000
944000
945000
946000
947000
948000
949000
950000
951000
952000
953000
954000
955000
956000
957000
958000
959000
960000
961000
962000
963000
964000
965000
966000
967000
968000
969000
970000
971000
972000
973000
974000
975000
976000
977000
978000
979000
980000
981000
982000
983000
984000
985000
986000
987000
988000
989000
990000
991000
992000
993000
994000
995000
996000
997000
998000
999000
1000000
1001000
1002000
1003000
1004000
1005000
1006000
1007000
1008000
1009000
1010000
1011000
1012000
1013000
1014000
1015000
1016000
1017000
1018000
1019000
1020000
1021000
1022000
1023000
1024000
1025000
1026000
1027000
1028000
1029000
1030000
1031000
1032000
1033000
1034000
1035000
1036000
1037000
1038000
1039000
1040000
1041000
1042000
1043000
1044000
1045000
1046000
1047000
1048000
1049000
1050000
1051000
1052000
1053000
1054000
1055000
1056000
1057000
1058000
1059000
1060000
1061000
1062000
1063000
1064000
1065000
1066000
1067000
1068000
1069000
1070000
1071000
1072000
1073000
1074000
1075000
1076000
1077000
1078000
1079000
1080000
1081000
1082000
1083000
1084000
1085000
1086000
1087000
1088000
1089000
1090000
1091000
1092000
1093000
1094000
1095000
1096000
1097000
1098000
1099000
1100000
1101000
1102000
1103000
1104000
1105000
1106000
1107000
1108000
1109000
1110000
1111000
1112000
1113000
1114000
1115000
1116000
1117000
1118000
1119000
1120000
1121000
1122000
1123000
1124000
1125000
1126000
1127000
1128000
1129000
1130000
1131000
1132000
1133000
1134000
1135000
1136000
1137000
1138000
1139000
1140000
1141000
1142000
1143000
1144000
1145000
1146000
1147000
1148000
1149000
1150000
1151000
1152000
1153000
1154000
1155000
1156000
1157000
1158000
1159000
1160000
1161000
1162000
1163000
1164000
1165000
1166000
1167000
1168000
1169000
1170000
1171000
1172000
1173000
1174000
1175000
1176000
1177000
1178000
1179000
1180000
1181000
1182000
1183000
1184000
1185000
1186000
1187000
1188000
1189000
1190000
1191000
1192000
1193000
1194000
1195000
1196000
1197000
1198000
1199000
1200000
1201000
1202000
1203000
1204000
1205000
1206000
1207000
1208000
1209000
1210000
1211000
1212000
1213000
1214000
1215000
1216000
1217000
1218000
1219000
1220000
1221000
1222000
1223000
1224000
1225000
1226000
1227000
1228000
1229000
1230000
1231000
1232000
1233000
1234000
1235000
1236000
1237000
1238000
1239000
1240000
1241000
1242000
1243000
1244000
1245000
1246000
1247000
1248000
1249000
1250000
1251000
1252000
1253000
1254000
1255000
1256000
1257000
1258000
1259000
1260000
1261000
1262000
1263000
1264000
1265000
1266000
1267000
1268000
1269000
1270000
1271000
1272000
1273000
1274000
1275000
1276000
1277000
1278000
1279000
1280000
1281000
1282000
1283000
1284000
1285000
1286000
1287000
1288000
1289000
1290000
1291000
1292000
1293000
1294000
1295000
1296000
1297000
1298000
1299000
1300000
1301000
1302000
1303000
1304000
1305000
1306000
1307000
1308000
1309000
1310000
1311000
1312000
1313000
1314000
1315000
1316000
1317000
1318000
1319000
1320000
1321000
1322000
1323000
1324000
1325000
1326000
1327000
1328000
1329000
1330000
1331000
1332000
1333000
1334000
1335000
1336000
1337000
1338000
1339000
1340000
1341000
1342000
1343000
1344000
1345000
1346000
1347000
1348000
1349000
1350000
1351000
1352000
1353000
1354000
1355000
1356000
1357000
1358000
1359000
1360000
1361000
1362000
1363000
1364000
1365000
1366000
1367000
1368000
1369000
1370000
1371000
1372000
1373000
1374000
1375000
1376000
1377000
1378000
1379000
1380000
1381000
1382000
1383000
1384000
1385000
1386000
1387000
1388000
1389000
1390000
1391000
1392000
1393000
1394000
1395000
1396000
1397000
1398000
1399000
1400000
1401000
1402000
1403000
1404000
1405000
1406000
1407000
1408000
1409000
1410000
1411000
1412000
1413000
1414000
1415000
1416000
1417000
1418000
1419000
1420000
1421000
1422000
1423000
1424000
1425000
1426000
1427000
1428000
1429000
1430000
1431000
1432000
1433000
1434000
1435000
1436000
1437000
1438000
1439000
1440000
1441000
1442000
1443000
1444000
1445000
1446000
1447000
1448000
1449000
1450000
1451000
1452000
1453000
1454000
1455000
1456000
1457000
1458000
1459000
1460000
1461000
1462000
1463000
1464000
1465000
1466000
1467000
1468000
1469000
1470000
1471000
1472000
1473000
1474000
1475000
1476000
1477000
1478000
1479000
1480000
1481000
1482000
1483000
1484000
1485000
1486000
1487000
1488000
1489000
1490000
1491000
1492000
1493000
1494000
1495000
1496000
1497000
1498000
1499000
1500000
1501000
1502000
1503000
1504000
1505000
1506000
1507000
1508000
1509000
1510000
1511000
1512000
1513000
1514000
1515000
1516000
1517000
1518000
1519000
1520000
1521000
1522000
1523000
1524000
1525000
1526000
1527000
1528000
1529000
1530000
1531000
1532000
1533000
1534000
1535000
1536000
1537000
1538000
1539000
1540000
1541000
1542000
1543000
1544000
1545000
1546000
1547000
1548000
1549000
1550000
1551000
1552000
1553000
1554000
1555000
1556000
1557000
1558000
1559000
1560000
1561000
1562000
1563000
1564000
1565000
1566000
1567000
1568000
1569000
1570000
1571000
1572000
1573000
1574000
1575000
1576000
1577000
1578000
1579000
1580000
1581000
1582000
1583000
1584000
1585000
1586000
1587000
1588000
1589000
1590000
1591000
1592000
1593000
1594000
1595000
1596000
1597000
1598000
1599000
1600000
1601000
1602000
1603000
1604000
1605000
1606000
1607000
1608000
1609000
1610000
1611000
1612000
1613000
1614000
1615000
1616000
1617000
1618000
1619000
1620000
1621000
1622000
1623000
1624000
1625000
1626000
1627000
1628000
1629000
1630000
1631000
1632000
1633000
1634000
1635000
1636000
1637000
1638000
1639000
1640000
1641000
1642000
1643000
1644000
1645000
1646000
1647000
1648000
1649000
1650000
1651000
1652000
1653000
1654000
1655000
1656000
1657000
1658000
1659000
1660000
1661000
1662000
1663000
1664000
1665000
1666000
1667000
1668000
1669000
1670000
1671000
1672000
1673000
1674000
1675000
1676000
1677000
1678000
1679000
1680000
1681000
1682000
1683000
1684000
1685000
1686000
1687000
1688000
1689000
1690000
1691000
1692000
1693000
1694000
1695000
1696000
1697000
1698000
1699000
1700000
1701000
1702000
1703000
1704000
1705000
1706000
1707000
1708000
1709000
1710000
1711000
1712000
1713000
1714000
1715000
1716000
1717000
1718000
1719000
1720000
1721000
1722000
1723000
1724000
1725000
1726000
1727000
1728000
1729000
1730000
1731000
1732000
1733000
1734000
1735000
1736000
1737000
1738000
1739000
1740000
1741000
1742000
1743000
1744000
1745000
1746000
1747000
1748000
1749000
1750000
1751000
1752000
1753000
1754000
end
print(dfoff['label'].value_counts())
 0    988887
-1    701602
 1     64395
Name: label, dtype: int64
print('已有columns:',dfoff.columns.tolist())
已有columns: ['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance', 'Date_received', 'Date', 'discount_rate', 'discount_man', 'discount_jian', 'discount_type', 'distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7', 'label']
dfoff.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754884 entries, 0 to 1754883
Data columns (total 22 columns):
User_id          int64
Merchant_id      int64
Coupon_id        float64
Discount_rate    object
Distance         float64
Date_received    float64
Date             float64
discount_rate    float64
discount_man     int64
discount_jian    int64
discount_type    float64
distance         int32
weekday          float64
weekday_type     int64
weekday_1        uint8
weekday_2        uint8
weekday_3        uint8
weekday_4        uint8
weekday_5        uint8
weekday_6        uint8
weekday_7        uint8
label            int64
dtypes: float64(7), int32(1), int64(6), object(1), uint8(7)
memory usage: 205.9+ MB
dfoff.head()
User_idMerchant_idCoupon_idDiscount_rateDistanceDate_receivedDatediscount_ratediscount_mandiscount_jian...weekdayweekday_typeweekday_1weekday_2weekday_3weekday_4weekday_5weekday_6weekday_7label
014394082632NaNNaN0.0NaN20160217.01.00000000...NaN00000000-1
11439408466311002.0150:201.020160528.0NaN0.86666715020...6.0100000100
2143940826328591.020:10.020160217.0NaN0.950000201...3.0000100000
3143940826321078.020:10.020160319.0NaN0.950000201...6.0100000100
4143940826328591.020:10.020160613.0NaN0.950000201...1.0010000000

5 rows × 22 columns

dfoff.count()
User_id          1754884
Merchant_id      1754884
Coupon_id        1053282
Discount_rate    1053282
Distance         1648881
Date_received    1053282
Date              776984
discount_rate    1754884
discount_man     1754884
discount_jian    1754884
discount_type    1053282
distance         1754884
weekday          1053282
weekday_type     1754884
weekday_1        1754884
weekday_2        1754884
weekday_3        1754884
weekday_4        1754884
weekday_5        1754884
weekday_6        1754884
weekday_7        1754884
label            1754884
dtype: int64
# data split
# Keep only rows where a coupon was received (label != -1), then split by the
# date the coupon was received: before 2016-05-16 for training, and
# 2016-05-16 through 2016-06-15 for validation.
df = dfoff[dfoff['label'] != -1].copy()
train = df[(df['Date_received'] < 20160516)].copy()
valid = df[(df['Date_received'] >= 20160516) & (df['Date_received'] <= 20160615)].copy()
# Class balance of each split.
print(train['label'].value_counts())
print(valid['label'].value_counts())
0    759172
1     41524
Name: label, dtype: int64
0    229715
1     22871
Name: label, dtype: int64
# feature
original_feature = ['discount_rate','discount_type','discount_man', 'discount_jian','distance', 'weekday', 'weekday_type'] + weekdaycols
print(len(original_feature),original_feature)
14 ['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']
# model1
predictors = original_feature
print(predictors)

def check_model(data, predictors):
    """Grid-search a standardised elastic-net SGD classifier.

    Args:
        data: DataFrame containing the ``predictors`` columns and a
            ``'label'`` column used as the target.
        predictors: List of feature column names.

    Returns:
        The fitted ``GridSearchCV`` object (3-fold stratified CV over
        ``alpha`` and ``l1_ratio`` of the SGD step).
    """
    sgd = SGDClassifier(
        loss='log',
        penalty='elasticnet',
        fit_intercept=True,
        max_iter=100,
        shuffle=True,
        n_jobs=1,
        class_weight=None)

    # Features are standardised before the linear model sees them.
    pipeline = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', sgd)
    ])

    # The 'en__' prefix routes each parameter to the SGD step of the pipeline.
    param_grid = {
        'en__alpha': [ 0.001, 0.01, 0.1],
        'en__l1_ratio': [ 0.001, 0.01, 0.1]
    }

    cv_splitter = StratifiedKFold(n_splits=3, shuffle=True)

    search = GridSearchCV(
        pipeline,
        param_grid,
        cv=cv_splitter,
        n_jobs=-1,
        verbose=1)
    return search.fit(data[predictors], data['label'])

# Train the grid-searched model once and cache it in '1_model.pkl'; later runs
# load the cached object instead of retraining.
# NOTE(review): if '1_model.pkl' already exists, it is loaded as-is even when
# the features or training data have changed - delete the file to retrain.
# NOTE(review): pickle.load executes code embedded in the file; only load a
# '1_model.pkl' that you created yourself.
if not os.path.isfile('1_model.pkl'):
    model = check_model(train, predictors)
    print(model.best_score_)
    print(model.best_params_)
    with open('1_model.pkl', 'wb') as f:
        pickle.dump(model, f)
else:
    with open('1_model.pkl', 'rb') as f:
        model = pickle.load(f)
['discount_rate', 'discount_type', 'discount_man', 'discount_jian', 'distance', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7']
# valid predict
# Predict class probabilities on the validation split and keep the second
# probability column (index 1) as 'pred_prob' on a copy of the validation frame.
y_valid_pred = model.predict_proba(valid[predictors])
valid1 = valid.copy()
valid1['pred_prob'] = y_valid_pred[:, 1]
valid1.head(2)
User_idMerchant_idCoupon_idDiscount_rateDistanceDate_receivedDatediscount_ratediscount_mandiscount_jian...weekday_typeweekday_1weekday_2weekday_3weekday_4weekday_5weekday_6weekday_7labelpred_prob
11439408466311002.0150:201.020160528.0NaN0.86666715020...1000001000.018534
4143940826328591.020:10.020160613.0NaN0.950000201...0100000000.140446

2 rows × 23 columns

# avgAUC calculation
# Compute one ROC AUC per Coupon_id on the validation predictions and report
# the unweighted mean over coupons.
vg = valid1.groupby(['Coupon_id'])
aucs = []
for i in vg:
    tmpdf = i[1]  # iterating a groupby yields (key, group) pairs; keep the group
    # AUC is undefined unless both classes occur, so such coupons are skipped.
    if len(tmpdf['label'].unique()) != 2:
        continue
    fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['pred_prob'], pos_label=1)
    aucs.append(auc(fpr, tpr))
print(np.average(aucs))
0.5351686740665353
# Fit a single SGD classifier with fixed alpha / l1_ratio on the training
# split. This rebinds `model`, replacing the grid-search object used earlier.
# NOTE(review): unlike the pipeline in check_model, no StandardScaler is
# applied here, so the model is fitted on the raw feature values.
print("----train-----")
model = SGDClassifier(#lambda:
    loss='log',
    penalty='elasticnet',
    fit_intercept=True,
    max_iter=100,
    shuffle=True,
    alpha = 0.01,
    l1_ratio = 0.01,
    n_jobs=1,
    class_weight=None
)
model.fit(train[original_feature], train['label'])
----train-----





SGDClassifier(alpha=0.01, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.01,
       learning_rate='optimal', loss='log', max_iter=100, n_iter=None,
       n_jobs=1, penalty='elasticnet', power_t=0.5, random_state=None,
       shuffle=True, tol=None, verbose=0, warm_start=False)
# #### Prediction and evaluation
# Print the classifier's score() on the validation split.
# NOTE(review): this is a different metric from the per-coupon average AUC
# computed earlier, so the two numbers are not directly comparable.
print(model.score(valid[original_feature], valid['label']))
0.9094526220772331
# test prediction for submission
# Predict on the test frame and write User_id, Coupon_id, Date_received and
# the second probability column (index 1) to 'submit2.csv', without index or
# header row.
y_test_pred = model.predict_proba(dftest[original_feature])
dftest1 = dftest[['User_id','Coupon_id','Date_received']].copy()
dftest1['label'] = y_test_pred[:,1]
dftest1.to_csv('submit2.csv', index=False, header=False)
dftest1.head()
User_idCoupon_idDate_receivedlabel
041295379983201607120.096343
169493783429201607060.139846
221665296928201607270.002833
321665291808201607270.013445
461721626500201607080.070036
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值