数据探索
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
data = pd.read_csv('./air_data.csv',index_col = 'MEMBER_NO',encoding='UTF-8')
data
FFP_DATE | FIRST_FLIGHT_DATE | GENDER | FFP_TIER | WORK_CITY | WORK_PROVINCE | WORK_COUNTRY | AGE | LOAD_TIME | FLIGHT_COUNT | ... | ADD_Point_SUM | Eli_Add_Point_Sum | L1Y_ELi_Add_Points | Points_Sum | L1Y_Points_Sum | Ration_L1Y_Flight_Count | Ration_P1Y_Flight_Count | Ration_P1Y_BPS | Ration_L1Y_BPS | Point_NotFlight | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
MEMBER_NO | |||||||||||||||||||||
54993 | 2006/11/2 | 2008/12/24 | 男 | 6 | . | 北京 | CN | 31.0 | 2014/3/31 | 210 | ... | 39992 | 114452 | 111100 | 619760 | 370211 | 0.509524 | 0.490476 | 0.487221 | 0.512777 | 50 |
28065 | 2007/2/19 | 2007/8/3 | 男 | 6 | NaN | 北京 | CN | 42.0 | 2014/3/31 | 140 | ... | 12000 | 53288 | 53288 | 415768 | 238410 | 0.514286 | 0.485714 | 0.489289 | 0.510708 | 33 |
55106 | 2007/2/1 | 2007/8/30 | 男 | 6 | . | 北京 | CN | 40.0 | 2014/3/31 | 135 | ... | 15491 | 55202 | 51711 | 406361 | 233798 | 0.518519 | 0.481481 | 0.481467 | 0.518530 | 26 |
21189 | 2008/8/22 | 2008/8/23 | 男 | 5 | Los Angeles | CA | US | 64.0 | 2014/3/31 | 23 | ... | 0 | 34890 | 34890 | 372204 | 186100 | 0.434783 | 0.565217 | 0.551722 | 0.448275 | 12 |
39546 | 2009/4/10 | 2009/4/15 | 男 | 6 | 贵阳 | 贵州 | CN | 48.0 | 2014/3/31 | 152 | ... | 22704 | 64969 | 64969 | 338813 | 210365 | 0.532895 | 0.467105 | 0.469054 | 0.530943 | 39 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
18375 | 2011/5/20 | 2013/6/5 | 女 | 4 | 广州 | 广东 | CN | 25.0 | 2014/3/31 | 2 | ... | 12318 | 12318 | 12123 | 12318 | 12123 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 22 |
36041 | 2010/3/8 | 2013/9/14 | 男 | 4 | 佛山 | 广东 | CN | 38.0 | 2014/3/31 | 4 | ... | 106972 | 106972 | 56506 | 106972 | 56506 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 43 |
45690 | 2006/3/30 | 2006/12/2 | 女 | 4 | 广州 | 广东 | CN | 43.0 | 2014/3/31 | 2 | ... | 0 | 0 | 0 | 0 | 0 | 1.000000 | 0.000000 | 0.000000 | 0.000000 | 0 |
61027 | 2013/2/6 | 2013/2/14 | 女 | 4 | 广州 | 广东 | CN | 36.0 | 2014/3/31 | 2 | ... | 0 | 0 | 0 | 0 | 0 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0 |
61340 | 2013/2/17 | 2013/2/17 | 女 | 4 | 上海 | . | CN | 29.0 | 2014/3/31 | 2 | ... | 0 | 0 | 0 | 0 | 0 | 0.000000 | 1.000000 | 0.000000 | 0.000000 | 0 |
62988 rows × 43 columns
explore = data.describe().T #转置以后阅读更方便
# describe()函数自动计算非空值数,需要手动计算空值数
explore #结果中count是非空值个数
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
FFP_TIER | 62988.0 | 4.102162 | 0.373856 | 4.00 | 4.000000 | 4.000000 | 4.000000 | 6.000000 |
AGE | 62568.0 | 42.476346 | 9.885915 | 6.00 | 35.000000 | 41.000000 | 48.000000 | 110.000000 |
FLIGHT_COUNT | 62988.0 | 11.839414 | 14.049471 | 2.00 | 3.000000 | 7.000000 | 15.000000 | 213.000000 |
BP_SUM | 62988.0 | 10925.081254 | 16339.486151 | 0.00 | 2518.000000 | 5700.000000 | 12831.000000 | 505308.000000 |
EP_SUM_YR_1 | 62988.0 | 0.000000 | 0.000000 | 0.00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
EP_SUM_YR_2 | 62988.0 | 265.689623 | 1645.702854 | 0.00 | 0.000000 | 0.000000 | 0.000000 | 74460.000000 |
SUM_YR_1 | 62437.0 | 5355.376064 | 8109.450147 | 0.00 | 1003.000000 | 2800.000000 | 6574.000000 | 239560.000000 |
SUM_YR_2 | 62850.0 | 5604.026014 | 8703.364247 | 0.00 | 780.000000 | 2773.000000 | 6845.750000 | 234188.000000 |
SEG_KM_SUM | 62988.0 | 17123.878691 | 20960.844623 | 368.00 | 4747.000000 | 9994.000000 | 21271.250000 | 580717.000000 |
WEIGHTED_SEG_KM | 62988.0 | 12777.152439 | 17578.586695 | 0.00 | 3219.045000 | 6978.255000 | 15299.632500 | 558440.140000 |
AVG_FLIGHT_COUNT | 62988.0 | 1.542154 | 1.786996 | 0.25 | 0.428571 | 0.875000 | 1.875000 | 26.625000 |
AVG_BP_SUM | 62988.0 | 1421.440249 | 2083.121324 | 0.00 | 336.000000 | 752.375000 | 1690.270833 | 63163.500000 |
BEGIN_TO_FIRST | 62988.0 | 120.145488 | 159.572867 | 0.00 | 9.000000 | 50.000000 | 166.000000 | 729.000000 |
LAST_TO_END | 62988.0 | 176.120102 | 183.822223 | 1.00 | 29.000000 | 108.000000 | 268.000000 | 731.000000 |
AVG_INTERVAL | 62988.0 | 67.749788 | 77.517866 | 0.00 | 23.370370 | 44.666667 | 82.000000 | 728.000000 |
MAX_INTERVAL | 62988.0 | 166.033895 | 123.397180 | 0.00 | 79.000000 | 143.000000 | 228.000000 | 728.000000 |
ADD_POINTS_SUM_YR_1 | 62988.0 | 540.316965 | 3956.083455 | 0.00 | 0.000000 | 0.000000 | 0.000000 | 600000.000000 |
ADD_POINTS_SUM_YR_2 | 62988.0 | 814.689258 | 5121.796929 | 0.00 | 0.000000 | 0.000000 | 0.000000 | 728282.000000 |
EXCHANGE_COUNT | 62988.0 | 0.319775 | 1.136004 | 0.00 | 0.000000 | 0.000000 | 0.000000 | 46.000000 |
avg_discount | 62988.0 | 0.721558 | 0.185427 | 0.00 | 0.611997 | 0.711856 | 0.809476 | 1.500000 |
P1Y_Flight_Count | 62988.0 | 5.766257 | 7.210922 | 0.00 | 2.000000 | 3.000000 | 7.000000 | 118.000000 |
L1Y_Flight_Count | 62988.0 | 6.073157 | 8.175127 | 0.00 | 1.000000 | 3.000000 | 8.000000 | 111.000000 |
P1Y_BP_SUM | 62988.0 | 5366.720550 | 8537.773021 | 0.00 | 946.000000 | 2692.000000 | 6485.250000 | 246197.000000 |
L1Y_BP_SUM | 62988.0 | 5558.360704 | 9351.956952 | 0.00 | 545.000000 | 2547.000000 | 6619.250000 | 259111.000000 |
EP_SUM | 62988.0 | 265.689623 | 1645.702854 | 0.00 | 0.000000 | 0.000000 | 0.000000 | 74460.000000 |
ADD_Point_SUM | 62988.0 | 1355.006223 | 7868.477000 | 0.00 | 0.000000 | 0.000000 | 0.000000 | 984938.000000 |
Eli_Add_Point_Sum | 62988.0 | 1620.695847 | 8294.398955 | 0.00 | 0.000000 | 0.000000 | 345.000000 | 984938.000000 |
L1Y_ELi_Add_Points | 62988.0 | 1080.378882 | 5639.857254 | 0.00 | 0.000000 | 0.000000 | 0.000000 | 728282.000000 |
Points_Sum | 62988.0 | 12545.777100 | 20507.816700 | 0.00 | 2775.000000 | 6328.500000 | 14302.500000 | 985572.000000 |
L1Y_Points_Sum | 62988.0 | 6638.739585 | 12601.819863 | 0.00 | 700.000000 | 2860.500000 | 7500.000000 | 728282.000000 |
Ration_L1Y_Flight_Count | 62988.0 | 0.486419 | 0.319105 | 0.00 | 0.250000 | 0.500000 | 0.711111 | 1.000000 |
Ration_P1Y_Flight_Count | 62988.0 | 0.513581 | 0.319105 | 0.00 | 0.288889 | 0.500000 | 0.750000 | 1.000000 |
Ration_P1Y_BPS | 62988.0 | 0.522293 | 0.339632 | 0.00 | 0.258150 | 0.514252 | 0.815091 | 0.999989 |
Ration_L1Y_BPS | 62988.0 | 0.468422 | 0.338956 | 0.00 | 0.167954 | 0.476747 | 0.728375 | 0.999993 |
Point_NotFlight | 62988.0 | 2.728155 | 7.364164 | 0.00 | 0.000000 | 0.000000 | 1.000000 | 140.000000 |
explore['null'] = len(data) - explore['count']
explore
count | mean | std | min | 25% | 50% | 75% | max | null | |
---|---|---|---|---|---|---|---|---|---|
FFP_TIER | 62988.0 | 4.102162 | 0.373856 | 4.00 | 4.000000 | 4.000000 | 4.000000 | 6.000000 | 0.0 |
AGE | 62568.0 | 42.476346 | 9.885915 | 6.00 | 35.000000 | 41.000000 | 48.000000 | 110.000000 | 420.0 |
FLIGHT_COUNT | 62988.0 | 11.839414 | 14.049471 | 2.00 | 3.000000 | 7.000000 | 15.000000 | 213.000000 | 0.0 |
BP_SUM | 62988.0 | 10925.081254 | 16339.486151 | 0.00 | 2518.000000 | 5700.000000 | 12831.000000 | 505308.000000 | 0.0 |
EP_SUM_YR_1 | 62988.0 | 0.000000 | 0.000000 | 0.00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.0 |
EP_SUM_YR_2 | 62988.0 | 265.689623 | 1645.702854 | 0.00 | 0.000000 | 0.000000 | 0.000000 | 74460.000000 | 0.0 |
SUM_YR_1 | 62437.0 | 5355.376064 | 8109.450147 | 0.00 | 1003.000000 | 2800.000000 | 6574.000000 | 239560.000000 | 551.0 |
SUM_YR_2 | 62850.0 | 5604.026014 | 8703.364247 | 0.00 | 780.000000 | 2773.000000 | 6845.750000 | 234188.000000 | 138.0 |
SEG_KM_SUM | 62988.0 | 17123.878691 | 20960.844623 | 368.00 | 4747.000000 | 9994.000000 | 21271.250000 | 580717.000000 | 0.0 |
WEIGHTED_SEG_KM | 62988.0 | 12777.152439 | 17578.586695 | 0.00 | 3219.045000 | 6978.255000 | 15299.632500 | 558440.140000 | 0.0 |
AVG_FLIGHT_COUNT | 62988.0 | 1.542154 | 1.786996 | 0.25 | 0.428571 | 0.875000 | 1.875000 | 26.625000 | 0.0 |
AVG_BP_SUM | 62988.0 | 1421.440249 | 2083.121324 | 0.00 | 336.000000 | 752.375000 | 1690.270833 | 63163.500000 | 0.0 |
BEGIN_TO_FIRST | 62988.0 | 120.145488 | 159.572867 | 0.00 | 9.000000 | 50.000000 | 166.000000 | 729.000000 | 0.0 |
LAST_TO_END | 62988.0 | 176.120102 | 183.822223 | 1.00 | 29.000000 | 108.000000 | 268.000000 | 731.000000 | 0.0 |
AVG_INTERVAL | 62988.0 | 67.749788 | 77.517866 | 0.00 | 23.370370 | 44.666667 | 82.000000 | 728.000000 | 0.0 |
MAX_INTERVAL | 62988.0 | 166.033895 | 123.397180 | 0.00 | 79.000000 | 143.000000 | 228.000000 | 728.000000 | 0.0 |
ADD_POINTS_SUM_YR_1 | 62988.0 | 540.316965 | 3956.083455 | 0.00 | 0.000000 | 0.000000 | 0.000000 | 600000.000000 | 0.0 |
ADD_POINTS_SUM_YR_2 | 62988.0 | 814.689258 | 5121.796929 | 0.00 | 0.000000 | 0.000000 | 0.000000 | 728282.000000 | 0.0 |
EXCHANGE_COUNT | 62988.0 | 0.319775 | 1.136004 | 0.00 | 0.000000 | 0.000000 | 0.000000 | 46.000000 | 0.0 |
avg_discount | 62988.0 | 0.721558 | 0.185427 | 0.00 | 0.611997 | 0.711856 | 0.809476 | 1.500000 | 0.0 |
P1Y_Flight_Count | 62988.0 | 5.766257 | 7.210922 | 0.00 | 2.000000 | 3.000000 | 7.000000 | 118.000000 | 0.0 |
L1Y_Flight_Count | 62988.0 | 6.073157 | 8.175127 | 0.00 | 1.000000 | 3.000000 | 8.000000 | 111.000000 | 0.0 |
P1Y_BP_SUM | 62988.0 | 5366.720550 | 8537.773021 | 0.00 | 946.000000 | 2692.000000 | 6485.250000 | 246197.000000 | 0.0 |
L1Y_BP_SUM | 62988.0 | 5558.360704 | 9351.956952 | 0.00 | 545.000000 | 2547.000000 | 6619.250000 | 259111.000000 | 0.0 |
EP_SUM | 62988.0 | 265.689623 | 1645.702854 | 0.00 | 0.000000 | 0.000000 | 0.000000 | 74460.000000 | 0.0 |
ADD_Point_SUM | 62988.0 | 1355.006223 | 7868.477000 | 0.00 | 0.000000 | 0.000000 | 0.000000 | 984938.000000 | 0.0 |
Eli_Add_Point_Sum | 62988.0 | 1620.695847 | 8294.398955 | 0.00 | 0.000000 | 0.000000 | 345.000000 | 984938.000000 | 0.0 |
L1Y_ELi_Add_Points | 62988.0 | 1080.378882 | 5639.857254 | 0.00 | 0.000000 | 0.000000 | 0.000000 | 728282.000000 | 0.0 |
Points_Sum | 62988.0 | 12545.777100 | 20507.816700 | 0.00 | 2775.000000 | 6328.500000 | 14302.500000 | 985572.000000 | 0.0 |
L1Y_Points_Sum | 62988.0 | 6638.739585 | 12601.819863 | 0.00 | 700.000000 | 2860.500000 | 7500.000000 | 728282.000000 | 0.0 |
Ration_L1Y_Flight_Count | 62988.0 | 0.486419 | 0.319105 | 0.00 | 0.250000 | 0.500000 | 0.711111 | 1.000000 | 0.0 |
Ration_P1Y_Flight_Count | 62988.0 | 0.513581 | 0.319105 | 0.00 | 0.288889 | 0.500000 | 0.750000 | 1.000000 | 0.0 |
Ration_P1Y_BPS | 62988.0 | 0.522293 | 0.339632 | 0.00 | 0.258150 | 0.514252 | 0.815091 | 0.999989 | 0.0 |
Ration_L1Y_BPS | 62988.0 | 0.468422 | 0.338956 | 0.00 | 0.167954 | 0.476747 | 0.728375 | 0.999993 | 0.0 |
Point_NotFlight | 62988.0 | 2.728155 | 7.364164 | 0.00 | 0.000000 | 0.000000 | 1.000000 | 140.000000 | 0.0 |
AGE SUM_YR_1 SUM_YR_2有空值
#表头重命名
explore = explore[['null','max','min']]
explore.columns = [u'空值',u'最大值',u'最小值']
这里只选取部分探索结果
describe()函数自动计算的字段有count(非空值数)、unique(唯一值数)、top(频数最高者)、
freq(最高频数)、mean(平均值)、std(方差)、min(最小值)、50%(中位数)、max(最大值)
explore
空值 | 最大值 | 最小值 | |
---|---|---|---|
FFP_TIER | 0.0 | 6.000000 | 4.00 |
AGE | 420.0 | 110.000000 | 6.00 |
FLIGHT_COUNT | 0.0 | 213.000000 | 2.00 |
BP_SUM | 0.0 | 505308.000000 | 0.00 |
EP_SUM_YR_1 | 0.0 | 0.000000 | 0.00 |
EP_SUM_YR_2 | 0.0 | 74460.000000 | 0.00 |
SUM_YR_1 | 551.0 | 239560.000000 | 0.00 |
SUM_YR_2 | 138.0 | 234188.000000 | 0.00 |
SEG_KM_SUM | 0.0 | 580717.000000 | 368.00 |
WEIGHTED_SEG_KM | 0.0 | 558440.140000 | 0.00 |
AVG_FLIGHT_COUNT | 0.0 | 26.625000 | 0.25 |
AVG_BP_SUM | 0.0 | 63163.500000 | 0.00 |
BEGIN_TO_FIRST | 0.0 | 729.000000 | 0.00 |
LAST_TO_END | 0.0 | 731.000000 | 1.00 |
AVG_INTERVAL | 0.0 | 728.000000 | 0.00 |
MAX_INTERVAL | 0.0 | 728.000000 | 0.00 |
ADD_POINTS_SUM_YR_1 | 0.0 | 600000.000000 | 0.00 |
ADD_POINTS_SUM_YR_2 | 0.0 | 728282.000000 | 0.00 |
EXCHANGE_COUNT | 0.0 | 46.000000 | 0.00 |
avg_discount | 0.0 | 1.500000 | 0.00 |
P1Y_Flight_Count | 0.0 | 118.000000 | 0.00 |
L1Y_Flight_Count | 0.0 | 111.000000 | 0.00 |
P1Y_BP_SUM | 0.0 | 246197.000000 | 0.00 |
L1Y_BP_SUM | 0.0 | 259111.000000 | 0.00 |
EP_SUM | 0.0 | 74460.000000 | 0.00 |
ADD_Point_SUM | 0.0 | 984938.000000 | 0.00 |
Eli_Add_Point_Sum | 0.0 | 984938.000000 | 0.00 |
L1Y_ELi_Add_Points | 0.0 | 728282.000000 | 0.00 |
Points_Sum | 0.0 | 985572.000000 | 0.00 |
L1Y_Points_Sum | 0.0 | 728282.000000 | 0.00 |
Ration_L1Y_Flight_Count | 0.0 | 1.000000 | 0.00 |
Ration_P1Y_Flight_Count | 0.0 | 1.000000 | 0.00 |
Ration_P1Y_BPS | 0.0 | 0.999989 | 0.00 |
Ration_L1Y_BPS | 0.0 | 0.999993 | 0.00 |
Point_NotFlight | 0.0 | 140.000000 | 0.00 |
#平均折扣率最小为0但是总飞行公里数大于0,可能是客户乘坐0折机票或者积分兑换造成的
cond = data['avg_discount']==0
d = data[cond]
d['SEG_KM_SUM']
MEMBER_NO
1761 6138
15206 2158