天池入门赛代码 baseline
第一次打比赛 使用随机森林做了baseline 后续会有升级版本
主要问题是还没有处理数据不平衡问题 而且在对数据的处理部分也并不是很好
最大的疑问是线下测试 AUC高达 0.998 为什么这么高???
如果这样 直接冲击NO.1了 哈哈哈 想多了 想多了
总结下:纸上得来终觉浅 还要多实践
数据处理部分参考博客点这里
import pandas as pd
import numpy as np
% matplotlib inline
train_online = pd. read_csv( 'G:\毕业设计\TianChio2o\ccf_online_stage1_train.csv' , engine= "python" )
train_offline = pd. read_csv( 'G:\毕业设计\TianChio2o\ccf_offline_stage1_train.csv' , engine= "python" )
test = pd. read_csv( 'G:\毕业设计\TianChio2o\ccf_offline_stage1_test_revised.csv' , engine= "python" )
train_online. head( 10 )
User_id Merchant_id Action Coupon_id Discount_rate Date_received Date 0 13740231 18907 2 100017492 500:50 20160513.0 NaN 1 13740231 34805 1 NaN NaN NaN 20160321.0 2 14336199 18907 0 NaN NaN NaN 20160618.0 3 14336199 18907 0 NaN NaN NaN 20160618.0 4 14336199 18907 0 NaN NaN NaN 20160618.0 5 14336199 18907 0 NaN NaN NaN 20160618.0 6 14336199 18907 0 NaN NaN NaN 20160618.0 7 14336199 18907 0 NaN NaN NaN 20160618.0 8 14336199 18907 0 NaN NaN NaN 20160618.0 9 14336199 18907 0 NaN NaN NaN 20160618.0
train_offline. head( 10 )
User_id Merchant_id Coupon_id Discount_rate Distance Date_received Date 0 1439408 2632 NaN NaN 0.0 NaN 20160217.0 1 1439408 4663 11002.0 150:20 1.0 20160528.0 NaN 2 1439408 2632 8591.0 20:1 0.0 20160217.0 NaN 3 1439408 2632 1078.0 20:1 0.0 20160319.0 NaN 4 1439408 2632 8591.0 20:1 0.0 20160613.0 NaN 5 1439408 2632 NaN NaN 0.0 NaN 20160516.0 6 1439408 2632 8591.0 20:1 0.0 20160516.0 20160613.0 7 1832624 3381 7610.0 200:20 0.0 20160429.0 NaN 8 2029232 3381 11951.0 200:20 1.0 20160129.0 NaN 9 2029232 450 1532.0 30:5 0.0 20160530.0 NaN
test. head( )
User_id Merchant_id Coupon_id Discount_rate Distance Date_received 0 4129537 450 9983 30:5 1.0 20160712 1 6949378 1300 3429 30:5 NaN 20160706 2 2166529 7113 6928 200:20 5.0 20160727 3 2166529 7113 1808 100:10 5.0 20160727 4 6172162 7605 6500 30:1 2.0 20160708
train_offline. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754884 entries, 0 to 1754883
Data columns (total 7 columns):
User_id int64
Merchant_id int64
Coupon_id float64
Discount_rate object
Distance float64
Date_received float64
Date float64
dtypes: float64(4), int64(2), object(1)
memory usage: 93.7+ MB
test. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113640 entries, 0 to 113639
Data columns (total 6 columns):
User_id 113640 non-null int64
Merchant_id 113640 non-null int64
Coupon_id 113640 non-null int64
Discount_rate 113640 non-null object
Distance 101576 non-null float64
Date_received 113640 non-null int64
dtypes: float64(1), int64(4), object(1)
memory usage: 5.2+ MB
train_online. info( )
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11429826 entries, 0 to 11429825
Data columns (total 7 columns):
User_id int64
Merchant_id int64
Action int64
Coupon_id object
Discount_rate object
Date_received float64
Date float64
dtypes: float64(2), int64(3), object(2)
memory usage: 610.4+ MB
all_offline = pd. concat( [ train_offline, test] , sort= False )
all_offline. head( )
User_id Merchant_id Coupon_id Discount_rate Distance Date_received Date 0 1439408 2632 NaN NaN 0.0 NaN 20160217.0 1 1439408 4663 11002.0 150:20 1.0 20160528.0 NaN 2 1439408 2632 8591.0 20:1 0.0 20160217.0 NaN 3 1439408 2632 1078.0 20:1 0.0 20160319.0 NaN 4 1439408 2632 8591.0 20:1 0.0 20160613.0 NaN
all_offline. apply ( lambda x : sum ( x. isnull( ) ) )
User_id 0
Merchant_id 0
Coupon_id 701602
Discount_rate 701602
Distance 118067
Date_received 701602
Date 1091540
dtype: int64
all_offline[ 'Date' ] = all_offline[ "Date" ] . fillna( "null" )
pd. merge( all_offline, train_online, on= [ 'User_id' , 'Merchant_id' ] )
all_offline. head( )
User_id Merchant_id Coupon_id Discount_rate Distance Date_received Date 0 1439408 2632 NaN NaN 0.0 NaN 2.01602e+07 1 1439408 4663 11002.0 150:20 1.0 20160528.0 null 2 1439408 2632 8591.0 20:1 0.0 20160217.0 null 3 1439408 2632 1078.0 20:1 0.0 20160319.0 null 4 1439408 2632 8591.0 20:1 0.0 20160613.0 null
all_offline = all_offline. fillna( "null" )
def is_used ( column) :
if column[ "Coupon_id" ] != 'null' and column[ "Date" ] != 'null' :
return 1
elif column[ "Coupon_id" ] != "null" and column[ "Date" ] == 'null' :
return - 1
else :
return 0
all_offline[ "is_used" ] = all_offline. apply ( is_used, axis= 1 )
all_offline. head( 10 )
User_id Merchant_id Coupon_id Discount_rate Distance Date_received Date is_used 0 1439408 2632 null null 0 null 2.01602e+07 0 1 1439408 4663 11002 150:20 1 2.01605e+07 null -1 2 1439408 2632 8591 20:1 0 2.01602e+07 null -1 3 1439408 2632 1078 20:1 0 2.01603e+07 null -1 4 1439408 2632 8591 20:1 0 2.01606e+07 null -1 5 1439408 2632 null null 0 null 2.01605e+07 0 6 1439408 2632 8591 20:1 0 2.01605e+07 2.01606e+07 1 7 1832624 3381 7610 200:20 0 2.01604e+07 null -1 8 2029232 3381 11951 200:20 1 2.01601e+07 null -1 9 2029232 450 1532 30:5 0 2.01605e+07 null -1
all_offline[ "has_coupopn" ] = all_offline. apply ( lambda x: 1 if x[ "Coupon_id" ] != "null" else 0 , axis= 1 )
import re
regex = re. compile ( r'^\d+:\d+$' )
def percent ( column) :
if column[ "Coupon_id" ] == "null" or column[ "Discount_rate" ] == 'null' :
return "null"
elif re. match( regex, column[ "Discount_rate" ] ) :
num_max , num_min = column[ "Discount_rate" ] . split( ":" )
return float ( num_min) / float ( num_max)
else :
return column[ "Discount_rate" ]
all_offline[ "discount_percent" ] = all_offline. apply ( percent, axis= 1 )
def discount_limit ( column) :
if column[ "Coupon_id" ] == 'null' or column[ "Discount_rate" ] == 'null' :
return "null"
elif re. match( regex, column[ "Discount_rate" ] ) :
num_max, mun_min = column[ "Discount_rate" ] . split( ":" )
return num_max
else :
return 0
all_offline[ "discount_limit" ] = all_offline. apply ( discount_limit, axis = 1 )
all_offline. head( 10 )
User_id Merchant_id Coupon_id Discount_rate Distance Date_received Date is_used has_coupopn discount_percent discount_limit 0 1439408 2632 null null 0 null 2.01602e+07 0 0 null null 1 1439408 4663 11002 150:20 1 2.01605e+07 null -1 1 0.133333 150 2 1439408 2632 8591 20:1 0 2.01602e+07 null -1 1 0.05 20 3 1439408 2632 1078 20:1 0 2.01603e+07 null -1 1 0.05 20 4 1439408 2632 8591 20:1 0 2.01606e+07 null -1 1 0.05 20 5 1439408 2632 null null 0 null 2.01605e+07 0 0 null null 6 1439408 2632 8591 20:1 0 2.01605e+07 2.01606e+07 1 1 0.05 20 7 1832624 3381 7610 200:20 0 2.01604e+07 null -1 1 0.1 200 8 2029232 3381 11951 200:20 1 2.01601e+07 null -1 1 0.1 200 9 2029232 450 1532 30:5 0 2.01605e+07 null -1 1 0.166667 30
'''
datetime.datetime.strptime(column["Date"],'%Y%M%d') strptime参数1 是字符串型 参数二 是匹配的
'''
import datetime
def used_in_15day ( column) :
if column[ "is_used" ] == 1 and column[ "Date_received" ] != "null" and column[ "Date" ] != "null" :
days = ( datetime. datetime. strptime( column[ "Date" ] , '%Y%M%d' ) - datetime. datetime. strptime( column[ "Date_received" ] , '%Y%M%d' ) ) . days
if days < 15 :
return 1
else :
return 0
else :
return "null"
all_offline[ "Date" ] = all_offline[ "Date" ] . apply ( lambda x: str ( int ( x) ) if x!= "null" else x)
all_offline[ "Date_received" ] = all_offline[ "Date_received" ] . apply ( lambda x: str ( int ( x) ) if x!= "null" else x)
all_offline[ "is_used_in_15day" ] = all_offline. apply ( used_in_15day, axis= 1 )
'''
null 1793142
1 71875
0 3507
我认为可以再多一类 即将null 看为第三类 先2分吧
'''
all_offline[ "is_used_in_15day" ] = all_offline[ "is_used_in_15day" ] . apply ( lambda x : 0 if x== "null" else x)
all_offline[ "is_used_in_15day" ] . value_counts( )
0 1796649
1 71875
Name: is_used_in_15day, dtype: int64
all_offline[ "is_used_in_15day" ] . value_counts( )
0 1796649
1 71875
Name: is_used_in_15day, dtype: int64
all_offline. head( 20 )
User_id Merchant_id Coupon_id Discount_rate Distance Date_received Date is_used has_coupopn discount_percent discount_limit is_used_in_15day 0 1439408 2632 null null 0 null 20160217 0 0 null null 0 1 1439408 4663 11002 150:20 1 20160528 null -1 1 0.133333 150 0 2 1439408 2632 8591 20:1 0 20160217 null -1 1 0.05 20 0 3 1439408 2632 1078 20:1 0 20160319 null -1 1 0.05 20 0 4 1439408 2632 8591 20:1 0 20160613 null -1 1 0.05 20 0 5 1439408 2632 null null 0 null 20160516 0 0 null null 0 6 1439408 2632 8591 20:1 0 20160516 20160613 1 1 0.05 20 1 7 1832624 3381 7610 200:20 0 20160429 null -1 1 0.1 200 0 8 2029232 3381 11951 200:20 1 20160129 null -1 1 0.1 200 0 9 2029232 450 1532 30:5 0 20160530 null -1 1 0.166667 30 0 10 2029232 6459 12737 20:1 0 20160519 null -1 1 0.05 20 0 11 2029232 6459 null null 0 null 20160626 0 0 null null 0 12 2029232 6459 null null 0 null 20160519 0 0 null null 0 13 2747744 6901 1097 50:10 null 20160606 null -1 1 0.2 50 0 14 196342 1579 null null 1 null 20160606 0 0 null null 0 15 196342 1579 10698 20:1 1 20160606 null -1 1 0.05 20 0 16 2223968 3381 9776 10:5 2 20160129 null -1 1 0.5 10 0 17 73611 2099 12034 100:10 null 20160207 null -1 1 0.1 100 0 18 163606 1569 5054 200:30 10 20160421 null -1 1 0.15 200 0 19 3273056 4833 7802 200:20 10 20160130 null -1 1 0.1 200 0
all_offline[ "discount_percent" ] . value_counts( )
null 701602
0.1 403589
0.16666666666666666 330255
0.25 103748
0.05 69099
0.2 56084
0.3 38425
0.15 29585
0.5 28785
0.03333333333333333 22195
0.95 21559
0.13333333333333333 17685
0.3333333333333333 13497
0.9 8912
0.4 8300
0.06666666666666667 5452
0.8 4176
0.02 3693
0.85 650
0.01 551
0.5 196
0.75 121
0.2 110
0.025 75
0.6 59
0.6666666666666666 56
0.7 55
0.6 9
0.06 1
Name: discount_percent, dtype: int64
def discount_percent_layer ( column) :
if column == "null" :
return "null"
column = float ( column)
if column <= 0.1 :
return 0.1
elif column <= 0.2 :
return 0.2
elif column <= 0.3 :
return 0.3
elif column <= 0.4 :
return 0.4
else :
return 0.5
all_offline[ "discount_percent_layer" ] = all_offline[ "discount_percent" ] . apply ( discount_percent_layer)
all_offline[ "discount_percent_layer" ] . value_counts( )
null 701602
0.1 504655
0.2 433719
0.3 142173
0.5 64578
0.4 21797
Name: discount_percent_layer, dtype: int64
def discount_limit_layer ( column) :
if column == "null" :
return "null"
column = float ( column)
if column <= 10 :
return 10
elif column <= 20 :
return 20
elif column <= 30 :
return 30
elif column < 50 :
return 50
elif column <= 100 :
return 100
elif column <= 200 :
return 200
else :
return 300
all_offline[ "discount_limit_layer" ] = all_offline[ "discount_limit" ] . apply ( discount_limit_layer)
all_offline[ "discount_limit_layer" ] . value_counts( )
null 701602
30 364855
100 347253
200 177333
20 161645
10 86406
300 29430
Name: discount_limit_layer, dtype: int64
all_offline. head( 10 )
User_id Merchant_id Coupon_id Discount_rate Distance Date_received Date is_used has_coupopn discount_percent discount_limit is_used_in_15day discount_percent_layer discount_limit_layer 0 1439408 2632 null null 0 null 20160217 0 0 null null 0 null null 1 1439408 4663 11002 150:20 1 20160528 null -1 1 0.133333 150 0 0.2 200 2 1439408 2632 8591 20:1 0 20160217 null -1 1 0.05 20 0 0.1 20 3 1439408 2632 1078 20:1 0 20160319 null -1 1 0.05 20 0 0.1 20 4 1439408 2632 8591 20:1 0 20160613 null -1 1 0.05 20 0 0.1 20 5 1439408 2632 null null 0 null 20160516 0 0 null null 0 null null 6 1439408 2632 8591 20:1 0 20160516 20160613 1 1 0.05 20 1 0.1 20 7 1832624 3381 7610 200:20 0 20160429 null -1 1 0.1 200 0 0.1 200 8 2029232 3381 11951 200:20 1 20160129 null -1 1 0.1 200 0 0.1 200 9 2029232 450 1532 30:5 0 20160530 null -1 1 0.166667 30 0 0.2 30
'''
coupon_id 被处理为: has_coupon 1表示有优惠券
Date,Date_received 被处理为:is_used_in_15day 1表示15天内使用过
Discount_rate 被处理为:discount_percent和discount_limit
merchant_id use_id 是unicode值 不需要进行处理
最后处理下Discount
'''
all_offline[ "Distance" ] . value_counts( )
0.0 869937
1.0 245695
10.0 216837
2.0 127936
null 118067
3.0 82532
4.0 59091
5.0 44429
6.0 34662
7.0 27366
8.0 22795
9.0 19177
Name: Distance, dtype: int64
train_finally, test_finally = all_offline[ : train_offline. shape[ 0 ] ] , all_offline[ train_offline. shape[ 0 ] : ]
file1 = open ( "G:\TianChio2o\\all_offline.csv" , "w" )
all_offline. to_csv( file1)
file2 = open ( "G:\TianChio2o\\train_finally.csv" , "w" )
train_finally. to_csv( file2)
file3 = open ( "G:\TianChio2o\\test_finally.csv" , "w" )
test_finally. to_csv( file3)
print ( all_offline. columns)
Index(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
'Date_received', 'Date', 'is_used', 'has_coupopn', 'discount_percent',
'discount_limit', 'is_used_in_15day', 'discount_percent_layer',
'discount_limit_layer'],
dtype='object')
print ( all_offline. columns)
all_offline_new = all_offline[ [ 'Distance' , 'is_used' , 'has_coupopn' , 'is_used_in_15day' , 'discount_percent_layer' , 'discount_limit_layer' ] ]
all_offline_new = pd. get_dummies( all_offline_new, dummy_na= True )
Index(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
'Date_received', 'Date', 'is_used', 'has_coupopn', 'discount_percent',
'discount_limit', 'is_used_in_15day', 'discount_percent_layer',
'discount_limit_layer'],
dtype='object')
all_offline_new. head( )
is_used has_coupopn is_used_in_15day Distance_0.0 Distance_1.0 Distance_2.0 Distance_3.0 Distance_4.0 Distance_5.0 Distance_6.0 ... discount_percent_layer_null discount_percent_layer_nan discount_limit_layer_10 discount_limit_layer_20 discount_limit_layer_30 discount_limit_layer_100 discount_limit_layer_200 discount_limit_layer_300 discount_limit_layer_null discount_limit_layer_nan 0 0 0 0 1 0 0 0 0 0 0 ... 1 0 0 0 0 0 0 0 1 0 1 -1 1 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 1 0 0 0 2 -1 1 0 1 0 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 0 0 3 -1 1 0 1 0 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 0 0 4 -1 1 0 1 0 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 0 0
5 rows × 31 columns
train_, test_ = all_offline_new[ : train_offline. shape[ 0 ] ] , all_offline_new[ train_offline. shape[ 0 ] : ]
train_ = train_[ train_[ "has_coupopn" ] == 1 ]
train_= train_. drop( [ "has_coupopn" ] , axis= 1 )
test_= test_. drop( [ "has_coupopn" ] , axis= 1 )
x_train= train_. drop( [ "is_used_in_15day" ] , axis= 1 )
y_train= pd. DataFrame( { "is_used_in_15day" : train_[ "is_used_in_15day" ] } )
x_test= test_. drop( [ "is_used_in_15day" ] , axis= 1 )
print ( x_test. columns)
Index(['is_used', 'Distance_0.0', 'Distance_1.0', 'Distance_2.0',
'Distance_3.0', 'Distance_4.0', 'Distance_5.0', 'Distance_6.0',
'Distance_7.0', 'Distance_8.0', 'Distance_9.0', 'Distance_10.0',
'Distance_null', 'Distance_nan', 'discount_percent_layer_0.1',
'discount_percent_layer_0.2', 'discount_percent_layer_0.3',
'discount_percent_layer_0.4', 'discount_percent_layer_0.5',
'discount_percent_layer_null', 'discount_percent_layer_nan',
'discount_limit_layer_10', 'discount_limit_layer_20',
'discount_limit_layer_30', 'discount_limit_layer_100',
'discount_limit_layer_200', 'discount_limit_layer_300',
'discount_limit_layer_null', 'discount_limit_layer_nan'],
dtype='object')
import numpy as np
import matplotlib. pyplot as plt
import pandas as pd
from sklearn. ensemble import RandomForestClassifier
from sklearn. metrics import roc_auc_score
from sklearn. model_selection import GridSearchCV
from sklearn. model_selection import StratifiedShuffleSplit
from sklearn. preprocessing import Imputer
from sklearn. linear_model. logistic import LogisticRegression
from sklearn import tree
import warnings
warnings. filterwarnings( 'ignore' )
from sklearn. model_selection import StratifiedKFold
import lightgbm as lgb
from sklearn. cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split( x_train, y_train, test_size= 0.3 , random_state= 0 )
def computeAUC ( y_true, y_score) :
auc = roc_auc_score( y_true, y_score)
print ( "auc=" , auc)
return auc
rf = RandomForestClassifier( n_estimators= 100 ,
max_features= 4 ,
oob_score= True ,
min_samples_split= 4 ,
min_samples_leaf= 50 ,
n_jobs= - 1 ,
class_weight= 'balanced_subsample' ,
bootstrap= True )
param_grid = { "n_estimators" : [ 100 ] , "min_samples_split" : [ 4 ] }
grid_search = GridSearchCV( rf, cv= 1 , scoring= 'roc_auc' , param_grid= param_grid, iid= False )
grid_search. fit( X_train, Y_train)
print ( "the best parameter:" , grid_search. best_params_)
print ( "the best score:" , grid_search. best_score_)
the best parameter: {'min_samples_split': 4, 'n_estimators': 100}
the best score: 0.9983629720690491
predicted_probs_train = grid_search. predict_proba( X_train)
predicted_probs_train = [ x[ 1 ] for x in predicted_probs_train]
computeAUC( Y_train, predicted_probs_train)
predicted_probs_test_new = grid_search. predict_proba( X_test)
predicted_probs_test_new = [ x[ 1 ] for x in predicted_probs_test_new]
computeAUC( Y_test, predicted_probs_test_new)
auc= 0.998383275512854
auc= 0.998349975332741
0.998349975332741
result= rf. predict_proba( x_test) [ : , 1 ]
predict= result
result= pd. read_csv( "G:\TianChio2o\\ccf_offline_stage1_test_revised.csv" )
result[ "Probability" ] = predict
result= result. drop( [ "Merchant_id" , "Discount_rate" , "Distance" ] , axis= 1 )
result[ "Probability" ] = result[ "Probability" ] . apply ( lambda x: 0 if x< 0 else x)
file = open ( "G:\TianChio2o\sample_submission.csv" , "w" )
result. to_csv( file )
result. head( )
User_id Coupon_id Date_received Probability 0 4129537 9983 20160712 0.027536 1 6949378 3429 20160706 0.024217 2 2166529 6928 20160727 0.025548 3 2166529 1808 20160727 0.000000 4 6172162 6500 20160708 0.028085