import pandas as pd
import numpy as np
import matplotlib. pyplot as plt
import seaborn as sns
from time import time
import datetime
from sklearn. model_selection import train_test_split, cross_val_score
from sklearn. feature_selection import f_classif, SelectFromModel, VarianceThreshold
from sklearn. ensemble import RandomForestClassifier as RFC
% matplotlib inline
# Plot with the SimHei font (presumably so the Chinese column labels used
# later render correctly) and turn off the Unicode minus glyph.
plt.rcParams.update({
    'font.family': ['SimHei'],
    'axes.unicode_minus': False,
})

# Load the tab-delimited dataset and preview its first five rows.
df = pd.read_table('userlostprob.txt', sep='\t')
df.head(n=5)
label
sampleid
d
arrival
iforderpv_24h
decisionhabit_user
historyvisit_7ordernum
historyvisit_totalordernum
hotelcr
ordercanceledprecent
...
lowestprice_pre2
lasthtlordergap
businessrate_pre2
cityuvs
cityorders
lastpvgap
cr
sid
visitnum_oneyear
h
0
0
24636
2016-05-18
2016-05-18
0
NaN
NaN
NaN
1.04
NaN
...
615.0
NaN
0.29
12.880
3.147
NaN
NaN
7
NaN
12
1
1
24637
2016-05-18
2016-05-18
0
NaN
NaN
NaN
1.06
NaN
...
513.0
NaN
0.53
17.933
4.913
NaN
NaN
33
NaN
14
2
0
24641
2016-05-18
2016-05-19
0
NaN
NaN
NaN
1.05
NaN
...
382.0
NaN
0.60
3.993
0.760
NaN
NaN
10
NaN
19
3
0
24642
2016-05-18
2016-05-18
0
NaN
NaN
NaN
1.01
NaN
...
203.0
NaN
0.18
3.220
0.660
NaN
NaN
8
NaN
16
4
1
24644
2016-05-18
2016-05-19
0
NaN
NaN
NaN
1.00
NaN
...
84.0
NaN
NaN
0.013
NaN
NaN
NaN
1
NaN
21
5 rows × 51 columns
# Count how many rows carry each distinct value of the `label` column.
df['label'].value_counts()
0 500588
1 189357
Name: label, dtype: int64
# Show the last five rows of the frame.
df.tail(n=5)
label
sampleid
d
arrival
iforderpv_24h
decisionhabit_user
historyvisit_7ordernum
historyvisit_totalordernum
hotelcr
ordercanceledprecent
...
lowestprice_pre2
lasthtlordergap
businessrate_pre2
cityuvs
cityorders
lastpvgap
cr
sid
visitnum_oneyear
h
689940
1
2238419
2016-05-15
2016-05-17
1
19.0
NaN
NaN
1.06
NaN
...
406.0
NaN
0.48
13.573
1.660
1034.0
1.0
5
119.0
18
689941
1
2238421
2016-05-15
2016-05-15
1
10.0
3.0
3.0
1.06
0.33
...
199.0
713.0
0.51
2.880
0.513
179.0
2.0
15
1472.0
12
689942
0
2238422
2016-05-15
2016-05-17
0
NaN
NaN
NaN
1.07
NaN
...
544.0
NaN
0.45
15.293
2.067
0.0
NaN
8
107.0
0
689943
0
2238425
2016-05-15
2016-05-17
0
NaN
NaN
NaN
1.04
NaN
...
156.0
NaN
0.29
2.467
0.333
NaN
NaN
4
NaN
0
689944
0
2238426
2016-05-15
2016-05-15
0
NaN
NaN
NaN
1.02
NaN
...
275.0
NaN
NaN
12.600
2.653
NaN
NaN
2
NaN
11
5 rows × 51 columns
# Show five randomly drawn rows (no seed is set, so the rows differ per run).
df.sample(n=5)
label
sampleid
d
arrival
iforderpv_24h
decisionhabit_user
historyvisit_7ordernum
historyvisit_totalordernum
hotelcr
ordercanceledprecent
...
lowestprice_pre2
lasthtlordergap
businessrate_pre2
cityuvs
cityorders
lastpvgap
cr
sid
visitnum_oneyear
h
477013
1
820235
2016-05-21
2016-05-21
0
15.0
NaN
15.0
1.05
0.36
...
582.0
18831.0
0.48
17.220
3.400
4242.0
1.33
446
906.0
9
426926
0
736598
2016-05-15
2016-05-15
0
1.0
NaN
39.0
1.05
0.16
...
978.0
12199.0
0.13
5.113
0.847
642.0
1.36
732
2583.0
8
628554
0
1072402
2016-05-20
2016-05-20
0
NaN
NaN
3.0
1.02
0.00
...
147.0
55214.0
0.27
15.873
3.220
10002.0
1.11
186
905.0
19
248275
0
438633
2016-05-18
2016-06-09
0
19.0
2.0
28.0
1.02
0.78
...
NaN
3329.0
NaN
1.320
0.087
145.0
1.12
449
17397.0
11
198972
0
356550
2016-05-19
2016-05-19
0
7.0
NaN
2.0
1.04
0.50
...
206.0
61467.0
0.32
20.480
5.153
13264.0
1.08
59
1522.0
20
5 rows × 51 columns
# (rows, columns) of the loaded frame.
df.shape
(689945, 51)
# Storage dtype of every column.
df.dtypes
label int64
sampleid int64
d object
arrival object
iforderpv_24h int64
decisionhabit_user float64
historyvisit_7ordernum float64
historyvisit_totalordernum float64
hotelcr float64
ordercanceledprecent float64
landhalfhours float64
ordercanncelednum float64
commentnums float64
starprefer float64
novoters float64
consuming_capacity float64
historyvisit_avghotelnum float64
cancelrate float64
historyvisit_visit_detailpagenum float64
delta_price1 float64
price_sensitive float64
hoteluv float64
businessrate_pre float64
ordernum_oneyear float64
cr_pre float64
avgprice float64
lowestprice float64
firstorder_bu float64
customereval_pre2 float64
delta_price2 float64
commentnums_pre float64
customer_value_profit float64
commentnums_pre2 float64
cancelrate_pre float64
novoters_pre2 float64
novoters_pre float64
ctrip_profits float64
deltaprice_pre2_t1 float64
lowestprice_pre float64
uv_pre float64
uv_pre2 float64
lowestprice_pre2 float64
lasthtlordergap float64
businessrate_pre2 float64
cityuvs float64
cityorders float64
lastpvgap float64
cr float64
sid int64
visitnum_oneyear float64
h int64
dtype: object
# Per-column non-null counts and dtypes, plus the frame's memory footprint.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689945 entries, 0 to 689944
Data columns (total 51 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 label 689945 non-null int64
1 sampleid 689945 non-null int64
2 d 689945 non-null object
3 arrival 689945 non-null object
4 iforderpv_24h 689945 non-null int64
5 decisionhabit_user 385450 non-null float64
6 historyvisit_7ordernum 82915 non-null float64
7 historyvisit_totalordernum 386525 non-null float64
8 hotelcr 689148 non-null float64
9 ordercanceledprecent 447831 non-null float64
10 landhalfhours 661312 non-null float64
11 ordercanncelednum 447831 non-null float64
12 commentnums 622029 non-null float64
13 starprefer 464892 non-null float64
14 novoters 672918 non-null float64
15 consuming_capacity 463837 non-null float64
16 historyvisit_avghotelnum 387876 non-null float64
17 cancelrate 678227 non-null float64
18 historyvisit_visit_detailpagenum 307234 non-null float64
19 delta_price1 437146 non-null float64
20 price_sensitive 463837 non-null float64
21 hoteluv 689148 non-null float64
22 businessrate_pre 483896 non-null float64
23 ordernum_oneyear 447831 non-null float64
24 cr_pre 660548 non-null float64
25 avgprice 457261 non-null float64
26 lowestprice 687931 non-null float64
27 firstorder_bu 376993 non-null float64
28 customereval_pre2 661312 non-null float64
29 delta_price2 437750 non-null float64
30 commentnums_pre 598368 non-null float64
31 customer_value_profit 439123 non-null float64
32 commentnums_pre2 648457 non-null float64
33 cancelrate_pre 653015 non-null float64
34 novoters_pre2 657616 non-null float64
35 novoters_pre 648956 non-null float64
36 ctrip_profits 445187 non-null float64
37 deltaprice_pre2_t1 543180 non-null float64
38 lowestprice_pre 659689 non-null float64
39 uv_pre 660548 non-null float64
40 uv_pre2 661189 non-null float64
41 lowestprice_pre2 660664 non-null float64
42 lasthtlordergap 447831 non-null float64
43 businessrate_pre2 602960 non-null float64
44 cityuvs 682274 non-null float64
45 cityorders 651263 non-null float64
46 lastpvgap 592818 non-null float64
47 cr 457896 non-null float64
48 sid 689945 non-null int64
49 visitnum_oneyear 592910 non-null float64
50 h 689945 non-null int64
dtypes: float64(44), int64(5), object(2)
memory usage: 268.5+ MB
# Summary statistics of the numeric columns, with extra tail percentiles
# (1%, 10%, 90%, 99%) alongside the quartiles.
df.describe(percentiles=[0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99])
label
sampleid
iforderpv_24h
decisionhabit_user
historyvisit_7ordernum
historyvisit_totalordernum
hotelcr
ordercanceledprecent
landhalfhours
ordercanncelednum
...
lowestprice_pre2
lasthtlordergap
businessrate_pre2
cityuvs
cityorders
lastpvgap
cr
sid
visitnum_oneyear
h
count
689945.000000
6.899450e+05
689945.000000
385450.000000
82915.000000
386525.000000
689148.000000
447831.000000
661312.000000
447831.000000
...
660664.000000
447831.000000
602960.000000
682274.000000
651263.000000
592818.000000
457896.000000
689945.000000
5.929100e+05
689945.000000
mean
0.274452
6.285402e+05
0.193737
5.317048
1.856094
11.710487
1.060996
0.342119
6.086366
154.179369
...
318.541812
101830.919400
0.368237
10.648278
2.253250
12049.409382
1.137476
153.702414
1.855185e+04
14.462315
std
0.446238
4.146815e+05
0.395226
38.524483
2.103862
17.251429
0.045264
0.354210
12.413225
398.456986
...
351.913035
122784.313864
0.219945
15.696682
3.538453
25601.374138
0.204789
277.807697
2.288603e+05
6.301575
min
0.000000
2.463600e+04
0.000000
0.000000
1.000000
1.000000
1.000000
0.000000
0.000000
0.000000
...
1.000000
0.000000
0.000000
0.007000
0.007000
0.000000
1.000000
0.000000
1.000000e+00
0.000000
1%
0.000000
3.620588e+04
0.000000
1.000000
1.000000
1.000000
1.000000
0.000000
0.000000
0.000000
...
52.000000
244.000000
0.010000
0.013000
0.007000
0.000000
1.000000
1.000000
2.100000e+01
0.000000
10%
0.000000
1.398464e+05
0.000000
1.000000
1.000000
1.000000
1.010000
0.000000
0.000000
0.000000
...
101.000000
3518.000000
0.050000
0.160000
0.033000
127.000000
1.000000
4.000000
1.610000e+02
6.000000
25%
0.000000
3.123200e+05
0.000000
2.000000
1.000000
2.000000
1.030000
0.000000
0.000000
0.000000
...
145.000000
14999.000000
0.170000
0.827000
0.127000
551.000000
1.000000
17.000000
4.710000e+02
11.000000
50%
0.000000
5.996370e+05
0.000000
3.000000
1.000000
6.000000
1.050000
0.250000
0.000000
2.000000
...
233.000000
46890.000000
0.400000
3.527000
0.627000
2848.000000
1.050000
62.000000
1.315000e+03
15.000000
75%
1.000000
8.874600e+05
0.000000
5.000000
2.000000
14.000000
1.090000
0.570000
4.000000
153.000000
...
388.000000
138953.000000
0.550000
13.327000
2.747000
10726.000000
1.210000
180.000000
3.141000e+03
20.000000
90%
1.000000
1.059705e+06
1.000000
10.000000
3.000000
29.000000
1.120000
0.980000
27.000000
492.000000
...
611.000000
311492.000000
0.650000
35.567000
7.547000
30384.900000
1.400000
392.000000
6.634000e+03
22.000000
99%
1.000000
2.226893e+06
1.000000
27.000000
7.000000
82.000000
1.190000
1.000000
48.000000
1752.000000
...
1464.000000
484734.000000
0.780000
66.007000
14.453000
138722.000000
2.000000
1212.000000
2.625670e+05
23.000000
max
1.000000
2.238426e+06
1.000000
3167.000000
106.000000
711.000000
3.180000
1.000000
49.000000
13475.000000
...
43700.000000
527026.000000
0.990000
67.140000
14.507000
194386.000000
11.000000
9956.000000
9.651192e+06
23.000000
12 rows × 49 columns
# Remove fully duplicated rows in place (the first occurrence is kept), then
# re-check the frame's dimensions.
df.drop_duplicates(keep='first', inplace=True)
df.shape
(689945, 51)
# Share of missing values per column. reset_index() yields the columns
# 'index' (feature name) and 0 (ratio); rows are sorted ascending by ratio
# and the two columns are then given readable headers.
null = df.isna().mean().reset_index().sort_values(by=0)
null_1 = null.rename(columns={'index': '特征', 0: '缺失比'})
null_1
特征
缺失比
0
label
0.000000
48
sid
0.000000
4
iforderpv_24h
0.000000
50
h
0.000000
2
d
0.000000
1
sampleid
0.000000
3
arrival
0.000000
8
hotelcr
0.001155
21
hoteluv
0.001155
26
lowestprice
0.002919
44
cityuvs
0.011118
17
cancelrate
0.016984
14
novoters
0.024679
28
customereval_pre2
0.041500
10
landhalfhours
0.041500
40
uv_pre2
0.041679
41
lowestprice_pre2
0.042440
39
uv_pre
0.042608
24
cr_pre
0.042608
38
lowestprice_pre
0.043853
34
novoters_pre2
0.046857
33
cancelrate_pre
0.053526
45
cityorders
0.056065
35
novoters_pre
0.059409
32
commentnums_pre2
0.060132
12
commentnums
0.098437
43
businessrate_pre2
0.126075
30
commentnums_pre
0.132731
49
visitnum_oneyear
0.140642
46
lastpvgap
0.140775
37
deltaprice_pre2_t1
0.212720
22
businessrate_pre
0.298646
13
starprefer
0.326190
20
price_sensitive
0.327719
15
consuming_capacity
0.327719
47
cr
0.336330
25
avgprice
0.337250
23
ordernum_oneyear
0.350918
42
lasthtlordergap
0.350918
11
ordercanncelednum
0.350918
9
ordercanceledprecent
0.350918
36
ctrip_profits
0.354750
31
customer_value_profit
0.363539
29
delta_price2
0.365529
19
delta_price1
0.366405
16
historyvisit_avghotelnum
0.437816
7
historyvisit_totalordernum
0.439774
5
decisionhabit_user
0.441332
27
firstorder_bu
0.453590
18
historyvisit_visit_detailpagenum
0.554698
6
historyvisit_7ordernum
0.879824
# Kernel density estimate of the per-column missing ratios.
plt.figure(figsize=(8, 6))
# `fill` is the supported spelling of the deprecated `shade` keyword
# (seaborn >= 0.11); `shade` is slated to become an error.
sns.kdeplot(null_1['缺失比'], fill=True)

# Bar chart of the missing ratio per feature, in the ascending order of
# `null_1`; the x axis is the row position, not the feature name.
plt.figure(figsize=(8, 6))
plt.bar(range(len(null_1)), null_1['缺失比'], label='lost rate')
plt.legend(loc='best')
# Discard the sparsest feature (about 88% of its values are missing per the
# missing-ratio table above) and display the resulting frame.
df = df.drop(columns=['historyvisit_7ordernum'])
df
label
sampleid
d
arrival
iforderpv_24h
decisionhabit_user
historyvisit_totalordernum
hotelcr
ordercanceledprecent
landhalfhours
...
lowestprice_pre2
lasthtlordergap
businessrate_pre2
cityuvs
cityorders
lastpvgap
cr
sid
visitnum_oneyear
h
0
0
24636
2016-05-18
2016-05-18
0
NaN
NaN
1.04
NaN
22.0
...
615.0
NaN
0.29
12.880
3.147
NaN
NaN
7
NaN
12
1
1
24637
2016-05-18
2016-05-18
0
NaN
NaN
1.06
NaN
0.0
...
513.0
NaN
0.53
17.933
4.913
NaN
NaN
33
NaN
14
2
0
24641
2016-05-18
2016-05-19
0
NaN
NaN
1.05
NaN
3.0
...
382.0
NaN
0.60
3.993
0.760
NaN
NaN
10
NaN
19
3
0
24642
2016-05-18
2016-05-18
0
NaN
NaN
1.01
NaN
2.0
...
203.0
NaN
0.18
3.220
0.660
NaN
NaN
8
NaN
16
4
1
24644
2016-05-18
2016-05-19
0
NaN
NaN
1.00
NaN
0.0
...
84.0
NaN
NaN
0.013
NaN
NaN
NaN
1
NaN
21
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
...
689940
1
2238419
2016-05-15
2016-05-17
1
19.0
NaN
1.06
NaN
1.0
...
406.0
NaN
0.48
13.573
1.660
1034.0
1.0
5
119.0
18
689941
1
2238421
2016-05-15
2016-05-15
1
10.0
3.0
1.06
0.33
49.0
...
199.0
713.0
0.51
2.880
0.513
179.0
2.0
15
1472.0
12
689942
0
2238422
2016-05-15
2016-05-17
0
NaN
NaN
1.07
NaN
0.0
...
544.0
NaN
0.45
15.293
2.067
0.0
NaN
8
107.0
0
689943
0
2238425
2016-05-15
2016-05-17
0
NaN
NaN
1.04
NaN
0.0
...
156.0
NaN
0.29
2.467
0.333
NaN
NaN
4
NaN
0
689944
0
2238426
2016-05-15
2016-05-15
0
NaN
NaN
1.02
NaN
0.0
...
275.0
NaN
NaN
12.600
2.653
NaN
NaN
2
NaN
11
689945 rows × 50 columns
# Summary statistics with 1% / 99% tails, transposed so that each feature
# is a row and each statistic a column.
df.describe(percentiles=[0.01, 0.25, 0.5, 0.75, 0.99]).transpose()
count
mean
std
min
1%
25%
50%
75%
99%
max
label
689945.0
0.274452
0.446238
0.000
0.00000
0.000
0.000
1.000
1.000000e+00
1.000
sampleid
689945.0
628540.209625
414681.498697
24636.000
36205.88000
312320.000
599637.000
887460.000
2.226893e+06
2238426.000
iforderpv_24h
689945.0
0.193737
0.395226
0.000
0.00000
0.000
0.000
0.000
1.000000e+00
1.000
decisionhabit_user
385450.0
5.317048
38.524483
0.000
1.00000
2.000
3.000
5.000
2.700000e+01
3167.000
historyvisit_totalordernum
386525.0
11.710487
17.251429
1.000
1.00000
2.000
6.000
14.000
8.200000e+01
711.000