携程客户流失分析项目(个人练习+源代码)

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from time import time
import datetime
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import f_classif, SelectFromModel, VarianceThreshold
from sklearn.ensemble import RandomForestClassifier as RFC
%matplotlib inline

# Configure matplotlib so Chinese labels render: use the SimHei font and
# disable the Unicode minus glyph.
plt.rcParams.update({
    'font.family': ['SimHei'],
    'axes.unicode_minus': False,
})
# Load the tab-separated data file
df = pd.read_csv('userlostprob.txt', sep='\t')
# Preview the first five rows
df.head(5)
label sampleid d arrival iforderpv_24h decisionhabit_user historyvisit_7ordernum historyvisit_totalordernum hotelcr ordercanceledprecent ... lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h
0 0 24636 2016-05-18 2016-05-18 0 NaN NaN NaN 1.04 NaN ... 615.0 NaN 0.29 12.880 3.147 NaN NaN 7 NaN 12
1 1 24637 2016-05-18 2016-05-18 0 NaN NaN NaN 1.06 NaN ... 513.0 NaN 0.53 17.933 4.913 NaN NaN 33 NaN 14
2 0 24641 2016-05-18 2016-05-19 0 NaN NaN NaN 1.05 NaN ... 382.0 NaN 0.60 3.993 0.760 NaN NaN 10 NaN 19
3 0 24642 2016-05-18 2016-05-18 0 NaN NaN NaN 1.01 NaN ... 203.0 NaN 0.18 3.220 0.660 NaN NaN 8 NaN 16
4 1 24644 2016-05-18 2016-05-19 0 NaN NaN NaN 1.00 NaN ... 84.0 NaN NaN 0.013 NaN NaN NaN 1 NaN 21

5 rows × 51 columns

# Inspect how the target label is distributed (count of rows per label value)
df['label'].value_counts()
0    500588
1    189357
Name: label, dtype: int64
# Preview the last five rows
df.tail(5)
label sampleid d arrival iforderpv_24h decisionhabit_user historyvisit_7ordernum historyvisit_totalordernum hotelcr ordercanceledprecent ... lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h
689940 1 2238419 2016-05-15 2016-05-17 1 19.0 NaN NaN 1.06 NaN ... 406.0 NaN 0.48 13.573 1.660 1034.0 1.0 5 119.0 18
689941 1 2238421 2016-05-15 2016-05-15 1 10.0 3.0 3.0 1.06 0.33 ... 199.0 713.0 0.51 2.880 0.513 179.0 2.0 15 1472.0 12
689942 0 2238422 2016-05-15 2016-05-17 0 NaN NaN NaN 1.07 NaN ... 544.0 NaN 0.45 15.293 2.067 0.0 NaN 8 107.0 0
689943 0 2238425 2016-05-15 2016-05-17 0 NaN NaN NaN 1.04 NaN ... 156.0 NaN 0.29 2.467 0.333 NaN NaN 4 NaN 0
689944 0 2238426 2016-05-15 2016-05-15 0 NaN NaN NaN 1.02 NaN ... 275.0 NaN NaN 12.600 2.653 NaN NaN 2 NaN 11

5 rows × 51 columns

# Preview five randomly chosen rows
df.sample(n=5)
label sampleid d arrival iforderpv_24h decisionhabit_user historyvisit_7ordernum historyvisit_totalordernum hotelcr ordercanceledprecent ... lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h
477013 1 820235 2016-05-21 2016-05-21 0 15.0 NaN 15.0 1.05 0.36 ... 582.0 18831.0 0.48 17.220 3.400 4242.0 1.33 446 906.0 9
426926 0 736598 2016-05-15 2016-05-15 0 1.0 NaN 39.0 1.05 0.16 ... 978.0 12199.0 0.13 5.113 0.847 642.0 1.36 732 2583.0 8
628554 0 1072402 2016-05-20 2016-05-20 0 NaN NaN 3.0 1.02 0.00 ... 147.0 55214.0 0.27 15.873 3.220 10002.0 1.11 186 905.0 19
248275 0 438633 2016-05-18 2016-06-09 0 19.0 2.0 28.0 1.02 0.78 ... NaN 3329.0 NaN 1.320 0.087 145.0 1.12 449 17397.0 11
198972 0 356550 2016-05-19 2016-05-19 0 7.0 NaN 2.0 1.04 0.50 ... 206.0 61467.0 0.32 20.480 5.153 13264.0 1.08 59 1522.0 20

5 rows × 51 columns

# Shape of the data as (rows, columns)
df.shape
(689945, 51)
# Inspect the dtype of every column
df.dtypes
label                                 int64
sampleid                              int64
d                                    object
arrival                              object
iforderpv_24h                         int64
decisionhabit_user                  float64
historyvisit_7ordernum              float64
historyvisit_totalordernum          float64
hotelcr                             float64
ordercanceledprecent                float64
landhalfhours                       float64
ordercanncelednum                   float64
commentnums                         float64
starprefer                          float64
novoters                            float64
consuming_capacity                  float64
historyvisit_avghotelnum            float64
cancelrate                          float64
historyvisit_visit_detailpagenum    float64
delta_price1                        float64
price_sensitive                     float64
hoteluv                             float64
businessrate_pre                    float64
ordernum_oneyear                    float64
cr_pre                              float64
avgprice                            float64
lowestprice                         float64
firstorder_bu                       float64
customereval_pre2                   float64
delta_price2                        float64
commentnums_pre                     float64
customer_value_profit               float64
commentnums_pre2                    float64
cancelrate_pre                      float64
novoters_pre2                       float64
novoters_pre                        float64
ctrip_profits                       float64
deltaprice_pre2_t1                  float64
lowestprice_pre                     float64
uv_pre                              float64
uv_pre2                             float64
lowestprice_pre2                    float64
lasthtlordergap                     float64
businessrate_pre2                   float64
cityuvs                             float64
cityorders                          float64
lastpvgap                           float64
cr                                  float64
sid                                   int64
visitnum_oneyear                    float64
h                                     int64
dtype: object
# Print basic information: per-column non-null counts, dtypes and memory usage
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 689945 entries, 0 to 689944
Data columns (total 51 columns):
 #   Column                            Non-Null Count   Dtype  
---  ------                            --------------   -----  
 0   label                             689945 non-null  int64  
 1   sampleid                          689945 non-null  int64  
 2   d                                 689945 non-null  object 
 3   arrival                           689945 non-null  object 
 4   iforderpv_24h                     689945 non-null  int64  
 5   decisionhabit_user                385450 non-null  float64
 6   historyvisit_7ordernum            82915 non-null   float64
 7   historyvisit_totalordernum        386525 non-null  float64
 8   hotelcr                           689148 non-null  float64
 9   ordercanceledprecent              447831 non-null  float64
 10  landhalfhours                     661312 non-null  float64
 11  ordercanncelednum                 447831 non-null  float64
 12  commentnums                       622029 non-null  float64
 13  starprefer                        464892 non-null  float64
 14  novoters                          672918 non-null  float64
 15  consuming_capacity                463837 non-null  float64
 16  historyvisit_avghotelnum          387876 non-null  float64
 17  cancelrate                        678227 non-null  float64
 18  historyvisit_visit_detailpagenum  307234 non-null  float64
 19  delta_price1                      437146 non-null  float64
 20  price_sensitive                   463837 non-null  float64
 21  hoteluv                           689148 non-null  float64
 22  businessrate_pre                  483896 non-null  float64
 23  ordernum_oneyear                  447831 non-null  float64
 24  cr_pre                            660548 non-null  float64
 25  avgprice                          457261 non-null  float64
 26  lowestprice                       687931 non-null  float64
 27  firstorder_bu                     376993 non-null  float64
 28  customereval_pre2                 661312 non-null  float64
 29  delta_price2                      437750 non-null  float64
 30  commentnums_pre                   598368 non-null  float64
 31  customer_value_profit             439123 non-null  float64
 32  commentnums_pre2                  648457 non-null  float64
 33  cancelrate_pre                    653015 non-null  float64
 34  novoters_pre2                     657616 non-null  float64
 35  novoters_pre                      648956 non-null  float64
 36  ctrip_profits                     445187 non-null  float64
 37  deltaprice_pre2_t1                543180 non-null  float64
 38  lowestprice_pre                   659689 non-null  float64
 39  uv_pre                            660548 non-null  float64
 40  uv_pre2                           661189 non-null  float64
 41  lowestprice_pre2                  660664 non-null  float64
 42  lasthtlordergap                   447831 non-null  float64
 43  businessrate_pre2                 602960 non-null  float64
 44  cityuvs                           682274 non-null  float64
 45  cityorders                        651263 non-null  float64
 46  lastpvgap                         592818 non-null  float64
 47  cr                                457896 non-null  float64
 48  sid                               689945 non-null  int64  
 49  visitnum_oneyear                  592910 non-null  float64
 50  h                                 689945 non-null  int64  
dtypes: float64(44), int64(5), object(2)
memory usage: 268.5+ MB
# Descriptive statistics, including tail percentiles (1%, 10%, 90%, 99%)
df.describe(percentiles=[0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99])
label sampleid iforderpv_24h decisionhabit_user historyvisit_7ordernum historyvisit_totalordernum hotelcr ordercanceledprecent landhalfhours ordercanncelednum ... lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h
count 689945.000000 6.899450e+05 689945.000000 385450.000000 82915.000000 386525.000000 689148.000000 447831.000000 661312.000000 447831.000000 ... 660664.000000 447831.000000 602960.000000 682274.000000 651263.000000 592818.000000 457896.000000 689945.000000 5.929100e+05 689945.000000
mean 0.274452 6.285402e+05 0.193737 5.317048 1.856094 11.710487 1.060996 0.342119 6.086366 154.179369 ... 318.541812 101830.919400 0.368237 10.648278 2.253250 12049.409382 1.137476 153.702414 1.855185e+04 14.462315
std 0.446238 4.146815e+05 0.395226 38.524483 2.103862 17.251429 0.045264 0.354210 12.413225 398.456986 ... 351.913035 122784.313864 0.219945 15.696682 3.538453 25601.374138 0.204789 277.807697 2.288603e+05 6.301575
min 0.000000 2.463600e+04 0.000000 0.000000 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000 ... 1.000000 0.000000 0.000000 0.007000 0.007000 0.000000 1.000000 0.000000 1.000000e+00 0.000000
1% 0.000000 3.620588e+04 0.000000 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000 ... 52.000000 244.000000 0.010000 0.013000 0.007000 0.000000 1.000000 1.000000 2.100000e+01 0.000000
10% 0.000000 1.398464e+05 0.000000 1.000000 1.000000 1.000000 1.010000 0.000000 0.000000 0.000000 ... 101.000000 3518.000000 0.050000 0.160000 0.033000 127.000000 1.000000 4.000000 1.610000e+02 6.000000
25% 0.000000 3.123200e+05 0.000000 2.000000 1.000000 2.000000 1.030000 0.000000 0.000000 0.000000 ... 145.000000 14999.000000 0.170000 0.827000 0.127000 551.000000 1.000000 17.000000 4.710000e+02 11.000000
50% 0.000000 5.996370e+05 0.000000 3.000000 1.000000 6.000000 1.050000 0.250000 0.000000 2.000000 ... 233.000000 46890.000000 0.400000 3.527000 0.627000 2848.000000 1.050000 62.000000 1.315000e+03 15.000000
75% 1.000000 8.874600e+05 0.000000 5.000000 2.000000 14.000000 1.090000 0.570000 4.000000 153.000000 ... 388.000000 138953.000000 0.550000 13.327000 2.747000 10726.000000 1.210000 180.000000 3.141000e+03 20.000000
90% 1.000000 1.059705e+06 1.000000 10.000000 3.000000 29.000000 1.120000 0.980000 27.000000 492.000000 ... 611.000000 311492.000000 0.650000 35.567000 7.547000 30384.900000 1.400000 392.000000 6.634000e+03 22.000000
99% 1.000000 2.226893e+06 1.000000 27.000000 7.000000 82.000000 1.190000 1.000000 48.000000 1752.000000 ... 1464.000000 484734.000000 0.780000 66.007000 14.453000 138722.000000 2.000000 1212.000000 2.625670e+05 23.000000
max 1.000000 2.238426e+06 1.000000 3167.000000 106.000000 711.000000 3.180000 1.000000 49.000000 13475.000000 ... 43700.000000 527026.000000 0.990000 67.140000 14.507000 194386.000000 11.000000 9956.000000 9.651192e+06 23.000000

12 rows × 49 columns

# Remove fully duplicated rows, then check the resulting shape
df = df.drop_duplicates()
df.shape
(689945, 51)
# Compute the fraction of missing values per column and sort it ascending.
# After reset_index() the ratio lives in the column named 0.
null = df.isna().mean().reset_index().sort_values(by=0)
# Give the two columns readable names (feature / missing ratio)
null_1 = null.rename(columns={'index': '特征', 0: '缺失比'})
null_1
特征 缺失比
0 label 0.000000
48 sid 0.000000
4 iforderpv_24h 0.000000
50 h 0.000000
2 d 0.000000
1 sampleid 0.000000
3 arrival 0.000000
8 hotelcr 0.001155
21 hoteluv 0.001155
26 lowestprice 0.002919
44 cityuvs 0.011118
17 cancelrate 0.016984
14 novoters 0.024679
28 customereval_pre2 0.041500
10 landhalfhours 0.041500
40 uv_pre2 0.041679
41 lowestprice_pre2 0.042440
39 uv_pre 0.042608
24 cr_pre 0.042608
38 lowestprice_pre 0.043853
34 novoters_pre2 0.046857
33 cancelrate_pre 0.053526
45 cityorders 0.056065
35 novoters_pre 0.059409
32 commentnums_pre2 0.060132
12 commentnums 0.098437
43 businessrate_pre2 0.126075
30 commentnums_pre 0.132731
49 visitnum_oneyear 0.140642
46 lastpvgap 0.140775
37 deltaprice_pre2_t1 0.212720
22 businessrate_pre 0.298646
13 starprefer 0.326190
20 price_sensitive 0.327719
15 consuming_capacity 0.327719
47 cr 0.336330
25 avgprice 0.337250
23 ordernum_oneyear 0.350918
42 lasthtlordergap 0.350918
11 ordercanncelednum 0.350918
9 ordercanceledprecent 0.350918
36 ctrip_profits 0.354750
31 customer_value_profit 0.363539
29 delta_price2 0.365529
19 delta_price1 0.366405
16 historyvisit_avghotelnum 0.437816
7 historyvisit_totalordernum 0.439774
5 decisionhabit_user 0.441332
27 firstorder_bu 0.453590
18 historyvisit_visit_detailpagenum 0.554698
6 historyvisit_7ordernum 0.879824
# Plot the kernel density estimate of the per-feature missing ratios
plt.figure(figsize=(8, 6))
# Use `fill=True`: the older `shade=True` keyword was deprecated in
# seaborn 0.11 in favour of `fill` and triggers a FutureWarning.
sns.kdeplot(null_1['缺失比'], fill=True)

(图:各特征缺失比例的核密度估计图)

# Bar chart of the missing ratio, one bar per feature in ascending order
plt.figure(figsize=(8, 6))
bar_positions = range(len(null_1))
plt.bar(bar_positions, height=null_1['缺失比'], label='lost rate')
plt.legend(loc='best')

(图:各特征缺失比例的条形图,按缺失比例升序排列)

# Drop the column with too many missing values
# (historyvisit_7ordernum is about 88% missing in the table above)
df = df.drop(columns=['historyvisit_7ordernum'])
df
label sampleid d arrival iforderpv_24h decisionhabit_user historyvisit_totalordernum hotelcr ordercanceledprecent landhalfhours ... lowestprice_pre2 lasthtlordergap businessrate_pre2 cityuvs cityorders lastpvgap cr sid visitnum_oneyear h
0 0 24636 2016-05-18 2016-05-18 0 NaN NaN 1.04 NaN 22.0 ... 615.0 NaN 0.29 12.880 3.147 NaN NaN 7 NaN 12
1 1 24637 2016-05-18 2016-05-18 0 NaN NaN 1.06 NaN 0.0 ... 513.0 NaN 0.53 17.933 4.913 NaN NaN 33 NaN 14
2 0 24641 2016-05-18 2016-05-19 0 NaN NaN 1.05 NaN 3.0 ... 382.0 NaN 0.60 3.993 0.760 NaN NaN 10 NaN 19
3 0 24642 2016-05-18 2016-05-18 0 NaN NaN 1.01 NaN 2.0 ... 203.0 NaN 0.18 3.220 0.660 NaN NaN 8 NaN 16
4 1 24644 2016-05-18 2016-05-19 0 NaN NaN 1.00 NaN 0.0 ... 84.0 NaN NaN 0.013 NaN NaN NaN 1 NaN 21
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
689940 1 2238419 2016-05-15 2016-05-17 1 19.0 NaN 1.06 NaN 1.0 ... 406.0 NaN 0.48 13.573 1.660 1034.0 1.0 5 119.0 18
689941 1 2238421 2016-05-15 2016-05-15 1 10.0 3.0 1.06 0.33 49.0 ... 199.0 713.0 0.51 2.880 0.513 179.0 2.0 15 1472.0 12
689942 0 2238422 2016-05-15 2016-05-17 0 NaN NaN 1.07 NaN 0.0 ... 544.0 NaN 0.45 15.293 2.067 0.0 NaN 8 107.0 0
689943 0 2238425 2016-05-15 2016-05-17 0 NaN NaN 1.04 NaN 0.0 ... 156.0 NaN 0.29 2.467 0.333 NaN NaN 4 NaN 0
689944 0 2238426 2016-05-15 2016-05-15 0 NaN NaN 1.02 NaN 0.0 ... 275.0 NaN NaN 12.600 2.653 NaN NaN 2 NaN 11

689945 rows × 50 columns

# Look for outliers: summary statistics with 1%/99% percentiles,
# transposed so that each feature is a row
df.describe(percentiles=[0.01, 0.25, 0.5, 0.75, 0.99]).transpose()
count mean std min 1% 25% 50% 75% 99% max
label 689945.0 0.274452 0.446238 0.000 0.00000 0.000 0.000 1.000 1.000000e+00 1.000
sampleid 689945.0 628540.209625 414681.498697 24636.000 36205.88000 312320.000 599637.000 887460.000 2.226893e+06 2238426.000
iforderpv_24h 689945.0 0.193737 0.395226 0.000 0.00000 0.000 0.000 0.000 1.000000e+00 1.000
decisionhabit_user 385450.0 5.317048 38.524483 0.000 1.00000 2.000 3.000 5.000 2.700000e+01 3167.000
historyvisit_totalordernum 386525.0 11.710487 17.251429 1.000 1.00000 2.000 6.000 14.000 8.200000e+01 711.000
hotelcr 689148.0 1.060996 0.045264 1.000 1.00000 1.030 1.050 1.090 1.190000e+00 3.180
  • 2
    点赞
  • 14
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值