import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
import datetime
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import f_classif, SelectFromModel, VarianceThreshold
from sklearn.ensemble import RandomForestClassifier as RFC
%matplotlib inline
plt.rcParams['font.family'] = ['SimHei'] # Use the SimHei font so Chinese text in figures is rendered instead of missing-glyph boxes
# Draw minus signs as an ASCII hyphen; presumably needed because the CJK font lacks the Unicode minus glyph
plt.rcParams['axes.unicode_minus']=False
# Load the raw data file (pd.read_table uses tab as its default delimiter)
df = pd.read_table('userlostprob.txt')
# Preview the first five rows
df.head()
| label | sampleid | d | arrival | iforderpv_24h | decisionhabit_user | historyvisit_7ordernum | historyvisit_totalordernum | hotelcr | ordercanceledprecent | ... | lowestprice_pre2 | lasthtlordergap | businessrate_pre2 | cityuvs | cityorders | lastpvgap | cr | sid | visitnum_oneyear | h | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 24636 | 2016-05-18 | 2016-05-18 | 0 | NaN | NaN | NaN | 1.04 | NaN | ... | 615.0 | NaN | 0.29 | 12.880 | 3.147 | NaN | NaN | 7 | NaN | 12 |
| 1 | 1 | 24637 | 2016-05-18 | 2016-05-18 | 0 | NaN | NaN | NaN | 1.06 | NaN | ... | 513.0 | NaN | 0.53 | 17.933 | 4.913 | NaN | NaN | 33 | NaN | 14 |
| 2 | 0 | 24641 | 2016-05-18 | 2016-05-19 | 0 | NaN | NaN | NaN | 1.05 | NaN | ... | 382.0 | NaN | 0.60 | 3.993 | 0.760 | NaN | NaN | 10 | NaN | 19 |
| 3 | 0 | 24642 | 2016-05-18 | 2016-05-18 | 0 | NaN | NaN | NaN | 1.01 | NaN | ... | 203.0 | NaN | 0.18 | 3.220 | 0.660 | NaN | NaN | 8 | NaN | 16 |
| 4 | 1 | 24644 | 2016-05-18 | 2016-05-19 | 0 | NaN | NaN | NaN | 1.00 | NaN | ... | 84.0 | NaN | NaN | 0.013 | NaN | NaN | NaN | 1 | NaN | 21 |
5 rows × 51 columns
# Inspect the distribution of the target label (class counts)
df['label'].value_counts()
0 500588
1 189357
Name: label, dtype: int64
# Preview the last five rows
df.tail()
| label | sampleid | d | arrival | iforderpv_24h | decisionhabit_user | historyvisit_7ordernum | historyvisit_totalordernum | hotelcr | ordercanceledprecent | ... | lowestprice_pre2 | lasthtlordergap | businessrate_pre2 | cityuvs | cityorders | lastpvgap | cr | sid | visitnum_oneyear | h | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 689940 | 1 | 2238419 | 2016-05-15 | 2016-05-17 | 1 | 19.0 | NaN | NaN | 1.06 | NaN | ... | 406.0 | NaN | 0.48 | 13.573 | 1.660 | 1034.0 | 1.0 | 5 | 119.0 | 18 |
| 689941 | 1 | 2238421 | 2016-05-15 | 2016-05-15 | 1 | 10.0 | 3.0 | 3.0 | 1.06 | 0.33 | ... | 199.0 | 713.0 | 0.51 | 2.880 | 0.513 | 179.0 | 2.0 | 15 | 1472.0 | 12 |
| 689942 | 0 | 2238422 | 2016-05-15 | 2016-05-17 | 0 | NaN | NaN | NaN | 1.07 | NaN | ... | 544.0 | NaN | 0.45 | 15.293 | 2.067 | 0.0 | NaN | 8 | 107.0 | 0 |
| 689943 | 0 | 2238425 | 2016-05-15 | 2016-05-17 | 0 | NaN | NaN | NaN | 1.04 | NaN | ... | 156.0 | NaN | 0.29 | 2.467 | 0.333 | NaN | NaN | 4 | NaN | 0 |
| 689944 | 0 | 2238426 | 2016-05-15 | 2016-05-15 | 0 | NaN | NaN | NaN | 1.02 | NaN | ... | 275.0 | NaN | NaN | 12.600 | 2.653 | NaN | NaN | 2 | NaN | 11 |
5 rows × 51 columns
# Preview five randomly sampled rows (no seed is set, so the rows differ between runs)
df.sample(5)
| label | sampleid | d | arrival | iforderpv_24h | decisionhabit_user | historyvisit_7ordernum | historyvisit_totalordernum | hotelcr | ordercanceledprecent | ... | lowestprice_pre2 | lasthtlordergap | businessrate_pre2 | cityuvs | cityorders | lastpvgap | cr | sid | visitnum_oneyear | h | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 477013 | 1 | 820235 | 2016-05-21 | 2016-05-21 | 0 | 15.0 | NaN | 15.0 | 1.05 | 0.36 | ... | 582.0 | 18831.0 | 0.48 | 17.220 | 3.400 | 4242.0 | 1.33 | 446 | 906.0 | 9 |
| 426926 | 0 | 736598 | 2016-05-15 | 2016-05-15 | 0 | 1.0 | NaN | 39.0 | 1.05 | 0.16 | ... | 978.0 | 12199.0 | 0.13 | 5.113 | 0.847 | 642.0 | 1.36 | 732 | 2583.0 | 8 |
| 628554 | 0 | 1072402 | 2016-05-20 | 2016-05-20 | 0 | NaN | NaN | 3.0 | 1.02 | 0.00 | ... | 147.0 | 55214.0 | 0.27 | 15.873 | 3.220 | 10002.0 | 1.11 | 186 | 905.0 | 19 |
| 248275 | 0 | 438633 | 2016-05-18 | 2016-06-09 | 0 | 19.0 | 2.0 | 28.0 | 1.02 | 0.78 | ... | NaN | 3329.0 | NaN | 1.320 | 0.087 | 145.0 | 1.12 | 449 | 17397.0 | 11 |
| 198972 | 0 | 356550 | 2016-05-19 | 2016-05-19 | 0 | 7.0 | NaN | 2.0 | 1.04 | 0.50 | ... | 206.0 | 61467.0 | 0.32 | 20.480 | 5.153 | 13264.0 | 1.08 | 59 | 1522.0 | 20 |
5 rows × 51 columns
# Shape of the data as (rows, columns)
df.shape
(689945, 51)
# Inspect the dtype of every column
df.dtypes
label int64
sampleid int64
d object
arrival object
iforderpv_24h int64
decisionhabit_user float64
historyvisit_7ordernum float64
historyvisit_totalordernum float64
hotelcr float64
ordercanceledprecent float64
landhalfhours float64
ordercanncelednum float64
commentnums float64
starprefer float64
novoters float64
consuming_capacity float64
historyvisit_avghotelnum float64
cancelrate float64
historyvisit_visit_detailpagenum float64
delta_price1 float64
price_sensitive float64
hoteluv float64
businessrate_pre float64
ordernum_oneyear float64
cr_pre float64
avgprice float64
lowestprice float64
firstorder_bu float64
customereval_pre2 float64
delta_price2 float64
commentnums_pre float64
customer_value_profit float64
commentnums_pre2 float64
cancelrate_pre float64
novoters_pre2 float64
novoters_pre float64
ctrip_profits float64
deltaprice_pre2_t1 float64
lowestprice_pre float64
uv_pre float64
uv_pre2 float64
lowestprice_pre2 float64
lasthtlordergap float64
businessrate_pre2 float64
cityuvs float64
cityorders float64
lastpvgap float64
cr float64
sid int64
visitnum_oneyear float64
h int64
dtype: object
# Basic overview of the frame: per-column non-null counts, dtypes and memory usage
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689945 entries, 0 to 689944
Data columns (total 51 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 label 689945 non-null int64
1 sampleid 689945 non-null int64
2 d 689945 non-null object
3 arrival 689945 non-null object
4 iforderpv_24h 689945 non-null int64
5 decisionhabit_user 385450 non-null float64
6 historyvisit_7ordernum 82915 non-null float64
7 historyvisit_totalordernum 386525 non-null float64
8 hotelcr 689148 non-null float64
9 ordercanceledprecent 447831 non-null float64
10 landhalfhours 661312 non-null float64
11 ordercanncelednum 447831 non-null float64
12 commentnums 622029 non-null float64
13 starprefer 464892 non-null float64
14 novoters 672918 non-null float64
15 consuming_capacity 463837 non-null float64
16 historyvisit_avghotelnum 387876 non-null float64
17 cancelrate 678227 non-null float64
18 historyvisit_visit_detailpagenum 307234 non-null float64
19 delta_price1 437146 non-null float64
20 price_sensitive 463837 non-null float64
21 hoteluv 689148 non-null float64
22 businessrate_pre 483896 non-null float64
23 ordernum_oneyear 447831 non-null float64
24 cr_pre 660548 non-null float64
25 avgprice 457261 non-null float64
26 lowestprice 687931 non-null float64
27 firstorder_bu 376993 non-null float64
28 customereval_pre2 661312 non-null float64
29 delta_price2 437750 non-null float64
30 commentnums_pre 598368 non-null float64
31 customer_value_profit 439123 non-null float64
32 commentnums_pre2 648457 non-null float64
33 cancelrate_pre 653015 non-null float64
34 novoters_pre2 657616 non-null float64
35 novoters_pre 648956 non-null float64
36 ctrip_profits 445187 non-null float64
37 deltaprice_pre2_t1 543180 non-null float64
38 lowestprice_pre 659689 non-null float64
39 uv_pre 660548 non-null float64
40 uv_pre2 661189 non-null float64
41 lowestprice_pre2 660664 non-null float64
42 lasthtlordergap 447831 non-null float64
43 businessrate_pre2 602960 non-null float64
44 cityuvs 682274 non-null float64
45 cityorders 651263 non-null float64
46 lastpvgap 592818 non-null float64
47 cr 457896 non-null float64
48 sid 689945 non-null int64
49 visitnum_oneyear 592910 non-null float64
50 h 689945 non-null int64
dtypes: float64(44), int64(5), object(2)
memory usage: 268.5+ MB
# Descriptive statistics for the numeric columns, with extra percentiles
# in both tails (1%, 10%, 90%, 99%) in addition to the quartiles.
summary_percentiles = [0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99]
df.describe(percentiles=summary_percentiles)
| label | sampleid | iforderpv_24h | decisionhabit_user | historyvisit_7ordernum | historyvisit_totalordernum | hotelcr | ordercanceledprecent | landhalfhours | ordercanncelednum | ... | lowestprice_pre2 | lasthtlordergap | businessrate_pre2 | cityuvs | cityorders | lastpvgap | cr | sid | visitnum_oneyear | h | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 689945.000000 | 6.899450e+05 | 689945.000000 | 385450.000000 | 82915.000000 | 386525.000000 | 689148.000000 | 447831.000000 | 661312.000000 | 447831.000000 | ... | 660664.000000 | 447831.000000 | 602960.000000 | 682274.000000 | 651263.000000 | 592818.000000 | 457896.000000 | 689945.000000 | 5.929100e+05 | 689945.000000 |
| mean | 0.274452 | 6.285402e+05 | 0.193737 | 5.317048 | 1.856094 | 11.710487 | 1.060996 | 0.342119 | 6.086366 | 154.179369 | ... | 318.541812 | 101830.919400 | 0.368237 | 10.648278 | 2.253250 | 12049.409382 | 1.137476 | 153.702414 | 1.855185e+04 | 14.462315 |
| std | 0.446238 | 4.146815e+05 | 0.395226 | 38.524483 | 2.103862 | 17.251429 | 0.045264 | 0.354210 | 12.413225 | 398.456986 | ... | 351.913035 | 122784.313864 | 0.219945 | 15.696682 | 3.538453 | 25601.374138 | 0.204789 | 277.807697 | 2.288603e+05 | 6.301575 |
| min | 0.000000 | 2.463600e+04 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 1.000000 | 0.000000 | 0.000000 | 0.007000 | 0.007000 | 0.000000 | 1.000000 | 0.000000 | 1.000000e+00 | 0.000000 |
| 1% | 0.000000 | 3.620588e+04 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 52.000000 | 244.000000 | 0.010000 | 0.013000 | 0.007000 | 0.000000 | 1.000000 | 1.000000 | 2.100000e+01 | 0.000000 |
| 10% | 0.000000 | 1.398464e+05 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.010000 | 0.000000 | 0.000000 | 0.000000 | ... | 101.000000 | 3518.000000 | 0.050000 | 0.160000 | 0.033000 | 127.000000 | 1.000000 | 4.000000 | 1.610000e+02 | 6.000000 |
| 25% | 0.000000 | 3.123200e+05 | 0.000000 | 2.000000 | 1.000000 | 2.000000 | 1.030000 | 0.000000 | 0.000000 | 0.000000 | ... | 145.000000 | 14999.000000 | 0.170000 | 0.827000 | 0.127000 | 551.000000 | 1.000000 | 17.000000 | 4.710000e+02 | 11.000000 |
| 50% | 0.000000 | 5.996370e+05 | 0.000000 | 3.000000 | 1.000000 | 6.000000 | 1.050000 | 0.250000 | 0.000000 | 2.000000 | ... | 233.000000 | 46890.000000 | 0.400000 | 3.527000 | 0.627000 | 2848.000000 | 1.050000 | 62.000000 | 1.315000e+03 | 15.000000 |
| 75% | 1.000000 | 8.874600e+05 | 0.000000 | 5.000000 | 2.000000 | 14.000000 | 1.090000 | 0.570000 | 4.000000 | 153.000000 | ... | 388.000000 | 138953.000000 | 0.550000 | 13.327000 | 2.747000 | 10726.000000 | 1.210000 | 180.000000 | 3.141000e+03 | 20.000000 |
| 90% | 1.000000 | 1.059705e+06 | 1.000000 | 10.000000 | 3.000000 | 29.000000 | 1.120000 | 0.980000 | 27.000000 | 492.000000 | ... | 611.000000 | 311492.000000 | 0.650000 | 35.567000 | 7.547000 | 30384.900000 | 1.400000 | 392.000000 | 6.634000e+03 | 22.000000 |
| 99% | 1.000000 | 2.226893e+06 | 1.000000 | 27.000000 | 7.000000 | 82.000000 | 1.190000 | 1.000000 | 48.000000 | 1752.000000 | ... | 1464.000000 | 484734.000000 | 0.780000 | 66.007000 | 14.453000 | 138722.000000 | 2.000000 | 1212.000000 | 2.625670e+05 | 23.000000 |
| max | 1.000000 | 2.238426e+06 | 1.000000 | 3167.000000 | 106.000000 | 711.000000 | 3.180000 | 1.000000 | 49.000000 | 13475.000000 | ... | 43700.000000 | 527026.000000 | 0.990000 | 67.140000 | 14.507000 | 194386.000000 | 11.000000 | 9956.000000 | 9.651192e+06 | 23.000000 |
12 rows × 49 columns
# Remove fully duplicated rows, then re-check the shape to see how many
# rows were dropped.
df = df.drop_duplicates()
df.shape
(689945, 51)
# Fraction of missing values per column, sorted in ascending order.
# The mean of the boolean NaN mask is the share of missing entries.
missing_share = df.isna().mean()
null = missing_share.reset_index().sort_values(by=0)
# Give the two columns readable names (feature name, missing ratio).
null_1 = null.rename(columns={'index': '特征', 0: '缺失比'})
null_1
| 特征 | 缺失比 | |
|---|---|---|
| 0 | label | 0.000000 |
| 48 | sid | 0.000000 |
| 4 | iforderpv_24h | 0.000000 |
| 50 | h | 0.000000 |
| 2 | d | 0.000000 |
| 1 | sampleid | 0.000000 |
| 3 | arrival | 0.000000 |
| 8 | hotelcr | 0.001155 |
| 21 | hoteluv | 0.001155 |
| 26 | lowestprice | 0.002919 |
| 44 | cityuvs | 0.011118 |
| 17 | cancelrate | 0.016984 |
| 14 | novoters | 0.024679 |
| 28 | customereval_pre2 | 0.041500 |
| 10 | landhalfhours | 0.041500 |
| 40 | uv_pre2 | 0.041679 |
| 41 | lowestprice_pre2 | 0.042440 |
| 39 | uv_pre | 0.042608 |
| 24 | cr_pre | 0.042608 |
| 38 | lowestprice_pre | 0.043853 |
| 34 | novoters_pre2 | 0.046857 |
| 33 | cancelrate_pre | 0.053526 |
| 45 | cityorders | 0.056065 |
| 35 | novoters_pre | 0.059409 |
| 32 | commentnums_pre2 | 0.060132 |
| 12 | commentnums | 0.098437 |
| 43 | businessrate_pre2 | 0.126075 |
| 30 | commentnums_pre | 0.132731 |
| 49 | visitnum_oneyear | 0.140642 |
| 46 | lastpvgap | 0.140775 |
| 37 | deltaprice_pre2_t1 | 0.212720 |
| 22 | businessrate_pre | 0.298646 |
| 13 | starprefer | 0.326190 |
| 20 | price_sensitive | 0.327719 |
| 15 | consuming_capacity | 0.327719 |
| 47 | cr | 0.336330 |
| 25 | avgprice | 0.337250 |
| 23 | ordernum_oneyear | 0.350918 |
| 42 | lasthtlordergap | 0.350918 |
| 11 | ordercanncelednum | 0.350918 |
| 9 | ordercanceledprecent | 0.350918 |
| 36 | ctrip_profits | 0.354750 |
| 31 | customer_value_profit | 0.363539 |
| 29 | delta_price2 | 0.365529 |
| 19 | delta_price1 | 0.366405 |
| 16 | historyvisit_avghotelnum | 0.437816 |
| 7 | historyvisit_totalordernum | 0.439774 |
| 5 | decisionhabit_user | 0.441332 |
| 27 | firstorder_bu | 0.453590 |
| 18 | historyvisit_visit_detailpagenum | 0.554698 |
| 6 | historyvisit_7ordernum | 0.879824 |
# Density plot of the per-column missing ratios.
plt.figure(figsize=(8,6))
# `shade=` has been deprecated since seaborn 0.11 in favour of `fill=`
# and is slated to become an error in 0.14; `fill=True` draws the same
# filled area under the density curve.
sns.kdeplot(null_1['缺失比'], fill=True)

# Bar chart of the missing ratio, one bar per feature in the order of
# the sorted table.
plt.figure(figsize=(8, 6))
bar_positions = range(len(null_1))
plt.bar(bar_positions, null_1['缺失比'], label='lost rate')
plt.legend(loc='best')

# Drop the column with too many missing values
# (historyvisit_7ordernum is roughly 88% missing in the table above).
df = df.drop(columns=['historyvisit_7ordernum'])
df
| label | sampleid | d | arrival | iforderpv_24h | decisionhabit_user | historyvisit_totalordernum | hotelcr | ordercanceledprecent | landhalfhours | ... | lowestprice_pre2 | lasthtlordergap | businessrate_pre2 | cityuvs | cityorders | lastpvgap | cr | sid | visitnum_oneyear | h | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 24636 | 2016-05-18 | 2016-05-18 | 0 | NaN | NaN | 1.04 | NaN | 22.0 | ... | 615.0 | NaN | 0.29 | 12.880 | 3.147 | NaN | NaN | 7 | NaN | 12 |
| 1 | 1 | 24637 | 2016-05-18 | 2016-05-18 | 0 | NaN | NaN | 1.06 | NaN | 0.0 | ... | 513.0 | NaN | 0.53 | 17.933 | 4.913 | NaN | NaN | 33 | NaN | 14 |
| 2 | 0 | 24641 | 2016-05-18 | 2016-05-19 | 0 | NaN | NaN | 1.05 | NaN | 3.0 | ... | 382.0 | NaN | 0.60 | 3.993 | 0.760 | NaN | NaN | 10 | NaN | 19 |
| 3 | 0 | 24642 | 2016-05-18 | 2016-05-18 | 0 | NaN | NaN | 1.01 | NaN | 2.0 | ... | 203.0 | NaN | 0.18 | 3.220 | 0.660 | NaN | NaN | 8 | NaN | 16 |
| 4 | 1 | 24644 | 2016-05-18 | 2016-05-19 | 0 | NaN | NaN | 1.00 | NaN | 0.0 | ... | 84.0 | NaN | NaN | 0.013 | NaN | NaN | NaN | 1 | NaN | 21 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 689940 | 1 | 2238419 | 2016-05-15 | 2016-05-17 | 1 | 19.0 | NaN | 1.06 | NaN | 1.0 | ... | 406.0 | NaN | 0.48 | 13.573 | 1.660 | 1034.0 | 1.0 | 5 | 119.0 | 18 |
| 689941 | 1 | 2238421 | 2016-05-15 | 2016-05-15 | 1 | 10.0 | 3.0 | 1.06 | 0.33 | 49.0 | ... | 199.0 | 713.0 | 0.51 | 2.880 | 0.513 | 179.0 | 2.0 | 15 | 1472.0 | 12 |
| 689942 | 0 | 2238422 | 2016-05-15 | 2016-05-17 | 0 | NaN | NaN | 1.07 | NaN | 0.0 | ... | 544.0 | NaN | 0.45 | 15.293 | 2.067 | 0.0 | NaN | 8 | 107.0 | 0 |
| 689943 | 0 | 2238425 | 2016-05-15 | 2016-05-17 | 0 | NaN | NaN | 1.04 | NaN | 0.0 | ... | 156.0 | NaN | 0.29 | 2.467 | 0.333 | NaN | NaN | 4 | NaN | 0 |
| 689944 | 0 | 2238426 | 2016-05-15 | 2016-05-15 | 0 | NaN | NaN | 1.02 | NaN | 0.0 | ... | 275.0 | NaN | NaN | 12.600 | 2.653 | NaN | NaN | 2 | NaN | 11 |
689945 rows × 50 columns
# Outlier check: summary statistics with the 1% / 99% percentiles,
# transposed so that every feature becomes one row.
outlier_percentiles = [0.01, 0.25, 0.5, 0.75, 0.99]
df.describe(percentiles=outlier_percentiles).transpose()
| count | mean | std | min | 1% | 25% | 50% | 75% | 99% | max | |
|---|---|---|---|---|---|---|---|---|---|---|
| label | 689945.0 | 0.274452 | 0.446238 | 0.000 | 0.00000 | 0.000 | 0.000 | 1.000 | 1.000000e+00 | 1.000 |
| sampleid | 689945.0 | 628540.209625 | 414681.498697 | 24636.000 | 36205.88000 | 312320.000 | 599637.000 | 887460.000 | 2.226893e+06 | 2238426.000 |
| iforderpv_24h | 689945.0 | 0.193737 | 0.395226 | 0.000 | 0.00000 | 0.000 | 0.000 | 0.000 | 1.000000e+00 | 1.000 |
| decisionhabit_user | 385450.0 | 5.317048 | 38.524483 | 0.000 | 1.00000 | 2.000 | 3.000 | 5.000 | 2.700000e+01 | 3167.000 |
| historyvisit_totalordernum | 386525.0 | 11.710487 | 17.251429 | 1.000 | 1.00000 | 2.000 | 6.000 | 14.000 | 8.200000e+01 | 711.000 |
这是一个关于携程客户流失分析的个人练习项目,涉及大量数据处理和机器学习模型的运用。项目基于一个约69万行、51列的用户行为数据集进行探索和建模,旨在预测并理解客户流失的原因和模式,从而为业务提供决策支持。
最低0.47元/天 解锁文章
2809

被折叠的 条评论
为什么被折叠?



