#准确的客户分类结果是企业优化营销资源的重要依据。本文利用航空公司的部分数据,采用KMeans聚类方法对航空公司的客户进行分类,识别出不同的客户群体,从而发现有价值的客户,并对不同价值的客户类别提供个性化服务,制定相应的营销策略。
# coding=utf-8
import pandas as pd
import numpy as np
# Suppress every warning for the rest of the session.
import warnings
warnings.simplefilter("ignore")
# Read the raw member data as UTF-8 (pandas' default, spelled out here).
# Original author's note: the CSV may have to be re-encoded with a text
# editor before it loads.
data = pd.read_csv("air_data - utf8.csv", encoding="utf-8")
# Build a per-column summary covering all dtypes, transposed so that each
# source column becomes one row, then peek at the first rows of the table.
explore = data.describe(include="all", percentiles=[]).transpose()
data.head()
MEMBER_NO | FFP_DATE | FIRST_FLIGHT_DATE | GENDER | FFP_TIER | WORK_CITY | WORK_PROVINCE | WORK_COUNTRY | AGE | LOAD_TIME | ... | ADD_Point_SUM | Eli_Add_Point_Sum | L1Y_ELi_Add_Points | Points_Sum | L1Y_Points_Sum | Ration_L1Y_Flight_Count | Ration_P1Y_Flight_Count | Ration_P1Y_BPS | Ration_L1Y_BPS | Point_NotFlight | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 54993 | 2006/11/2 | 2008/12/24 | 男 | 6 | . | 北京 | CN | 31.0 | 2014/3/31 | ... | 39992 | 114452 | 111100 | 619760 | 370211 | 0.509524 | 0.490476 | 0.487221 | 0.512777 | 50 |
1 | 28065 | 2007/2/19 | 2007/8/3 | 男 | 6 | NaN | 北京 | CN | 42.0 | 2014/3/31 | ... | 12000 | 53288 | 53288 | 415768 | 238410 | 0.514286 | 0.485714 | 0.489289 | 0.510708 | 33 |
2 | 55106 | 2007/2/1 | 2007/8/30 | 男 | 6 | . | 北京 | CN | 40.0 | 2014/3/31 | ... | 15491 | 55202 | 51711 | 406361 | 233798 | 0.518519 | 0.481481 | 0.481467 | 0.518530 | 26 |
3 | 21189 | 2008/8/22 | 2008/8/23 | 男 | 5 | Los Angeles | CA | US | 64.0 | 2014/3/31 | ... | 0 | 34890 | 34890 | 372204 | 186100 | 0.434783 | 0.565217 | 0.551722 | 0.448275 | 12 |
4 | 39546 | 2009/4/10 | 2009/4/15 | 男 | 6 | 贵阳 | 贵州 | CN | 48.0 | 2014/3/31 | ... | 22704 | 64969 | 64969 | 338813 | 210365 | 0.532895 | 0.467105 | 0.469054 | 0.530943 | 39 |
5 rows × 44 columns
# Display the transposed describe() summary stored in `explore`.
explore
count | unique | top | freq | mean | std | min | 50% | max | |
---|---|---|---|---|---|---|---|---|---|
MEMBER_NO | 62988 | NaN | NaN | NaN | 31494.5 | 18183.2 | 1 | 31494.5 | 62988 |
FFP_DATE | 62988 | 3068 | 2011/1/13 | 184 | NaN | NaN | NaN | NaN | NaN |
FIRST_FLIGHT_DATE | 62988 | 3406 | 2013/2/16 | 96 | NaN | NaN | NaN | NaN | NaN |
GENDER | 62985 | 2 | 男 | 48134 | NaN | NaN | NaN | NaN | NaN |
FFP_TIER | 62988 | NaN | NaN | NaN | 4.10216 | 0.373856 | 4 | 4 | 6 |
WORK_CITY | 60719 | 3309 | 广州 | 9385 | NaN | NaN | NaN | NaN | NaN |
WORK_PROVINCE | 59740 | 1183 | 广东 | 17507 | NaN | NaN | NaN | NaN | NaN |
WORK_COUNTRY | 62962 | 118 | CN | 57748 | NaN | NaN | NaN | NaN | NaN |
AGE | 62568 | NaN | NaN | NaN | 42.4763 | 9.88591 | 6 | 41 | 110 |
LOAD_TIME | 62988 | 1 | 2014/3/31 | 62988 | NaN | NaN | NaN | NaN | NaN |
FLIGHT_COUNT | 62988 | NaN | NaN | NaN | 11.8394 | 14.0495 | 2 | 7 | 213 |
BP_SUM | 62988 | NaN | NaN | NaN | 10925.1 | 16339.5 | 0 | 5700 | 505308 |
EP_SUM_YR_1 | 62988 | NaN | NaN | NaN | 0 | 0 | 0 | 0 | 0 |
EP_SUM_YR_2 | 62988 | NaN | NaN | NaN | 265.69 | 1645.7 | 0 | 0 | 74460 |
SUM_YR_1 | 62437 | NaN | NaN | NaN | 5355.38 | 8109.45 | 0 | 2800 | 239560 |
SUM_YR_2 | 62850 | NaN | NaN | NaN | 5604.03 | 8703.36 | 0 | 2773 | 234188 |
SEG_KM_SUM | 62988 | NaN | NaN | NaN | 17123.9 | 20960.8 | 368 | 9994 | 580717 |
WEIGHTED_SEG_KM | 62988 | NaN | NaN | NaN | 12777.2 | 17578.6 | 0 | 6978.26 | 558440 |
LAST_FLIGHT_DATE | 62988 | 731 | 2014/3/31 | 959 | NaN | NaN | NaN | NaN | NaN |
AVG_FLIGHT_COUNT | 62988 | NaN | NaN | NaN | 1.54215 | 1.787 | 0.25 | 0.875 | 26.625 |
AVG_BP_SUM | 62988 | NaN | NaN | NaN | 1421.44 | 2083.12 | 0 | 752.375 | 63163.5 |
BEGIN_TO_FIRST | 62988 | NaN | NaN | NaN | 120.145 | 159.573 | 0 | 50 | 729 |
LAST_TO_END | 62988 | NaN | NaN | NaN | 176.12 | 183.822 | 1 | 108 | 731 |
AVG_INTERVAL | 62988 | NaN | NaN | NaN | 67.7498 | 77.5179 | 0 | 44.6667 | 728 |
MAX_INTERVAL | 62988 | NaN | NaN | NaN | 166.034 | 123.397 | 0 | 143 | 728 |
ADD_POINTS_SUM_YR_1 | 62988 | NaN | NaN | NaN | 540.317 | 3956.08 | 0 | 0 | 600000 |
ADD_POINTS_SUM_YR_2 | 62988 | NaN | NaN | NaN | 814.689 | 5121.8 | 0 | 0 | 728282 |
EXCHANGE_COUNT | 62988 | NaN | NaN | NaN | 0.319775 | 1.136 | 0 | 0 | 46 |
avg_discount | 62988 | NaN | NaN | NaN | 0.721558 | 0.185427 | 0 | 0.711856 | 1.5 |
P1Y_Flight_Count | 62988 | NaN | NaN | NaN | 5.76626 | 7.21092 | 0 | 3 | 118 |
L1Y_Flight_Count | 62988 | NaN | NaN | NaN | 6.07316 | 8.17513 | 0 | 3 | 111 |
P1Y_BP_SUM | 62988 | NaN | NaN | NaN | 5366.72 | 8537.77 | 0 | 2692 | 246197 |
L1Y_BP_SUM | 62988 | NaN | NaN | NaN | 5558.36 | 9351.96 | 0 | 2547 | 259111 |
EP_SUM | 62988 | NaN | NaN | NaN | 265.69 | 1645.7 | 0 | 0 | 74460 |
ADD_Point_SUM | 62988 | NaN | NaN | NaN | 1355.01 | 7868.48 | 0 | 0 | 984938 |
Eli_Add_Point_Sum | 62988 | NaN | NaN | NaN | 1620.7 | 8294.4 | 0 | 0 | 984938 |
L1Y_ELi_Add_Points | 62988 | NaN | NaN | NaN | 1080.38 | 5639.86 | 0 | 0 | 728282 |
Points_Sum | 62988 | NaN | NaN | NaN | 12545.8 | 20507.8 | 0 | 6328.5 | 985572 |
L1Y_Points_Sum | 62988 | NaN | NaN | NaN | 6638.74 | 12601.8 | 0 | 2860.5 | 728282 |
Ration_L1Y_Flight_Count | 62988 | NaN | NaN | NaN | 0.486419 | 0.319105 | 0 | 0.5 | 1 |
Ration_P1Y_Flight_Count | 62988 | NaN | NaN | NaN | 0.513581 | 0.319105 | 0 | 0.5 | 1 |
Ration_P1Y_BPS | 62988 | NaN | NaN | NaN | 0.522293 | 0.339632 | 0 | 0.514252 | 0.999989 |
Ration_L1Y_BPS | 62988 | NaN | NaN | NaN | 0.468422 | 0.338956 | 0 | 0.476747 | 0.999993 |
Point_NotFlight | 62988 | NaN | NaN | NaN | 2.72815 | 7.36416 | 0 | 0 | 140 |
# Print the column names, non-null counts and dtypes of the raw table.
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62988 entries, 0 to 62987
Data columns (total 44 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 MEMBER_NO 62988 non-null int64
1 FFP_DATE 62988 non-null object
2 FIRST_FLIGHT_DATE 62988 non-null object
3 GENDER 62985 non-null object
4 FFP_TIER 62988 non-null int64
5 WORK_CITY 60719 non-null object
6 WORK_PROVINCE 59740 non-null object
7 WORK_COUNTRY 62962 non-null object
8 AGE 62568 non-null float64
9 LOAD_TIME 62988 non-null object
10 FLIGHT_COUNT 62988 non-null int64
11 BP_SUM 62988 non-null int64
12 EP_SUM_YR_1 62988 non-null int64
13 EP_SUM_YR_2 62988 non-null int64
14 SUM_YR_1 62437 non-null float64
15 SUM_YR_2 62850 non-null float64
16 SEG_KM_SUM 62988 non-null int64
17 WEIGHTED_SEG_KM 62988 non-null float64
18 LAST_FLIGHT_DATE 62988 non-null object
19 AVG_FLIGHT_COUNT 62988 non-null float64
20 AVG_BP_SUM 62988 non-null float64
21 BEGIN_TO_FIRST 62988 non-null int64
22 LAST_TO_END 62988 non-null int64
23 AVG_INTERVAL 62988 non-null float64
24 MAX_INTERVAL 62988 non-null int64
25 ADD_POINTS_SUM_YR_1 62988 non-null int64
26 ADD_POINTS_SUM_YR_2 62988 non-null int64
27 EXCHANGE_COUNT 62988 non-null int64
28 avg_discount 62988 non-null float64
29 P1Y_Flight_Count 62988 non-null int64
30 L1Y_Flight_Count 62988 non-null int64
31 P1Y_BP_SUM 62988 non-null int64
32 L1Y_BP_SUM 62988 non-null int64
33 EP_SUM 62988 non-null int64
34 ADD_Point_SUM 62988 non-null int64
35 Eli_Add_Point_Sum 62988 non-null int64
36 L1Y_ELi_Add_Points 62988 non-null int64
37 Points_Sum 62988 non-null int64
38 L1Y_Points_Sum 62988 non-null int64
39 Ration_L1Y_Flight_Count 62988 non-null float64
40 Ration_P1Y_Flight_Count 62988 non-null float64
41 Ration_P1Y_BPS 62988 non-null float64
42 Ration_L1Y_BPS 62988 non-null float64
43 Point_NotFlight 62988 non-null int64
dtypes: float64(12), int64(24), object(8)
memory usage: 21.1+ MB
# Remove fully duplicated rows, then print the table summary again so the
# row count can be compared with the one before de-duplication.
data = data.drop_duplicates()
data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 62988 entries, 0 to 62987
Data columns (total 44 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 MEMBER_NO 62988 non-null int64
1 FFP_DATE 62988 non-null object
2 FIRST_FLIGHT_DATE 62988 non-null object
3 GENDER 62985 non-null object
4 FFP_TIER 62988 non-null int64
5 WORK_CITY 60719 non-null object
6 WORK_PROVINCE 59740 non-null object
7 WORK_COUNTRY 62962 non-null object
8 AGE 62568 non-null float64
9 LOAD_TIME 62988 non-null object
10 FLIGHT_COUNT 62988 non-null int64
11 BP_SUM 62988 non-null int64
12 EP_SUM_YR_1 62988 non-null int64
13 EP_SUM_YR_2 62988 non-null int64
14 SUM_YR_1 62437 non-null float64
15 SUM_YR_2 62850 non-null float64
16 SEG_KM_SUM 62988 non-null int64
17 WEIGHTED_SEG_KM 62988 non-null float64
18 LAST_FLIGHT_DATE 62988 non-null object
19 AVG_FLIGHT_COUNT 62988 non-null float64
20 AVG_BP_SUM 62988 non-null float64
21 BEGIN_TO_FIRST 62988 non-null int64
22 LAST_TO_END 62988 non-null int64
23 AVG_INTERVAL 62988 non-null float64
24 MAX_INTERVAL 62988 non-null int64
25 ADD_POINTS_SUM_YR_1 62988 non-null int64
26 ADD_POINTS_SUM_YR_2 62988 non-null int64
27 EXCHANGE_COUNT 62988 non-null int64
28 avg_discount 62988 non-null float64
29 P1Y_Flight_Count 62988 non-null int64
30 L1Y_Flight_Count 62988 non-null int64
31 P1Y_BP_SUM 62988 non-null int64
32 L1Y_BP_SUM 62988 non-null int64
33 EP_SUM 62988 non-null int64
34 ADD_Point_SUM 62988 non-null int64
35 Eli_Add_Point_Sum 62988 non-null int64
36 L1Y_ELi_Add_Points 62988 non-null int64
37 Points_Sum 62988 non-null int64
38 L1Y_Points_Sum 62988 non-null int64
39 Ration_L1Y_Flight_Count 62988 non-null float64
40 Ration_P1Y_Flight_Count 62988 non-null float64
41 Ration_P1Y_BPS 62988 non-null float64
42 Ration_L1Y_BPS 62988 non-null float64
43 Point_NotFlight 62988 non-null int64
dtypes: float64(12), int64(24), object(8)
memory usage: 21.6+ MB
MEMBER_NO 会员卡号
FFP_DATE 入会时间
FIRST_FLIGHT_DATE 第一次飞行时间
GENDER 性别
FFP_TIER 会员卡级别
WORK_CITY 城市
WORK_PROVINCE 省份
WORK_COUNTRY 国家
AGE 年龄
LOAD_TIME 观测窗口结束时间
FLIGHT_COUNT 观测窗口内飞行次数
BP_SUM 总基本积分
EP_SUM_YR_1
EP_SUM_YR_2
SUM_YR_1 第一年总票价
SUM_YR_2 第二年总票价
SEG_KM_SUM 观测窗口的总飞行公里数
WEIGHTED_SEG_KM
LAST_FLIGHT_DATE
AVG_FLIGHT_COUNT 平均飞行次数
AVG_BP_SUM
BEGIN_TO_FIRST
LAST_TO_END
AVG_INTERVAL 平均时间间隔
MAX_INTERVAL 最大时间间隔
ADD_POINTS_SUM_YR_1
ADD_POINTS_SUM_YR_2
EXCHANGE_COUNT
avg_discount 平均折扣率
P1Y_Flight_Count
L1Y_Flight_Count
P1Y_BP_SUM
L1Y_BP_SUM
EP_SUM
ADD_Point_SUM
Eli_Add_Point_Sum
L1Y_ELi_Add_Points
Points_Sum
L1Y_Points_Sum
Ration_L1Y_Flight_Count
Ration_P1Y_Flight_Count
Ration_P1Y_BPS
Ration_L1Y_BPS
Point_NotFlight 非乘机的积分变动次数
# Count the members per GENDER value (value_counts skips NaN by default).
data['GENDER'].value_counts()
男 48134
女 14851
Name: GENDER, dtype: int64
# Fill the missing GENDER entries with the most frequent value ('男').
# fillna() returns a new Series and leaves `data` untouched, so the result
# has to be assigned back for the imputation to take effect; the bare
# expression afterwards echoes the filled column.
data['GENDER'] = data['GENDER'].fillna('男')
data['GENDER']
0 男
1 男
2 男
3 男
4 男
..
62983 女
62984 男
62985 女
62986 女
62987 女
Name: GENDER, Length: 62988, dtype: object
“FFP_DATE”, “LOAD_TIME”, “FLIGHT_COUNT”, “SUM_YR_1”, “SUM_YR_2”, “SEG_KM_SUM”, “AVG_INTERVAL” , “MAX_INTERVAL”, “avg_discount”
FFP_DATE 入会时间
LOAD_TIME 观测窗口结束时间
FLIGHT_COUNT 观测窗口内飞行次数
SUM_YR_1 第一年总票价
SUM_YR_2 第二年总票价
SEG_KM_SUM 观测窗口的总飞行公里数
AVG_INTERVAL 平均时间间隔
MAX_INTERVAL 最大时间间隔
avg_discount 平均折扣率
选取第一年总票价、第二年总票价和观测窗口总飞行公里数,是为了计算平均每公里票价:对航空公司来说,并不是票价越高、飞行公里数越长的客户就越能创造利润,相反,那些短途、乘坐高等舱的客户往往创造更大的利益。
当然总飞行公里数、飞行次数也都是评价一个客户价值的重要的指标
入会时间可以看出客户是不是老用户及忠诚度
通过平均乘机时间间隔、观测窗口内最大乘机间隔可以判断客户的乘机频率是不是固定
平均折扣率可以反映出客户给公司带来的利益,毕竟越是高价值的客户,享用的折扣率越高
“入会时间”, “飞行次数”, “平均每公里票价”, “总里程”, “时间间隔差值”, “平均折扣率”
# Keep only the raw columns from which the clustering features are derived,
# then print their non-null counts and dtypes.
selected_columns = [
    "FFP_DATE",
    "LOAD_TIME",
    "FLIGHT_COUNT",
    "SUM_YR_1",
    "SUM_YR_2",
    "SEG_KM_SUM",
    "AVG_INTERVAL",
    "MAX_INTERVAL",
    "avg_discount",
]
filter_data = data[selected_columns]
filter_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 62988 entries, 0 to 62987
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 FFP_DATE 62988 non-null object
1 LOAD_TIME 62988 non-null object
2 FLIGHT_COUNT 62988 non-null int64
3 SUM_YR_1 62437 non-null float64
4 SUM_YR_2 62850 non-null float64
5 SEG_KM_SUM 62988 non-null int64
6 AVG_INTERVAL 62988 non-null float64
7 MAX_INTERVAL 62988 non-null int64
8 avg_discount 62988 non-null float64
dtypes: float64(4), int64(3), object(2)
memory usage: 4.8+ MB
# Impute the missing yearly fares with the mean of their own column.
# This used to call Series.fillna(..., inplace=True) on a column picked out of
# filter_data (chained assignment). That form is deprecated in pandas 2.x and
# does nothing once Copy-on-Write is active, so fill at DataFrame level and
# rebind the result instead.
# NOTE(review): the derived feature "平均每公里票价" is later computed from
# `data`, not from `filter_data`, so this imputation only affects the
# describe() summary below - confirm that is intended.
filter_data = filter_data.fillna({
    "SUM_YR_1": filter_data["SUM_YR_1"].mean(),
    "SUM_YR_2": filter_data["SUM_YR_2"].mean(),
})
# Distribution summary with extra tail percentiles, one row per column.
filter_data.describe(percentiles=[.02, .10, .25, .5, .75, .90, .99]).T
count | mean | std | min | 2% | 10% | 25% | 50% | 75% | 90% | 99% | max | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
FLIGHT_COUNT | 62988.0 | 11.839414 | 14.049471 | 2.0 | 2.0000 | 2.000000 | 3.000000 | 7.000000 | 15.000000 | 27.00 | 69.00 | 213.0 |
SUM_YR_1 | 62988.0 | 5355.376064 | 8073.902161 | 0.0 | 0.0000 | 0.000000 | 1020.000000 | 2844.000000 | 6524.250000 | 12939.00 | 37858.47 | 239560.0 |
SUM_YR_2 | 62988.0 | 5604.026014 | 8693.824796 | 0.0 | 0.0000 | 0.000000 | 785.000000 | 2784.000000 | 6826.250000 | 14065.90 | 41179.73 | 234188.0 |
SEG_KM_SUM | 62988.0 | 17123.878691 | 20960.844623 | 368.0 | 1475.7400 | 2727.000000 | 4747.000000 | 9994.000000 | 21271.250000 | 39729.60 | 100841.28 | 580717.0 |
AVG_INTERVAL | 62988.0 | 67.749788 | 77.517866 | 0.0 | 2.0000 | 9.729730 | 23.370370 | 44.666667 | 82.000000 | 146.00 | 412.00 | 728.0 |
MAX_INTERVAL | 62988.0 | 166.033895 | 123.397180 | 0.0 | 2.0000 | 18.000000 | 79.000000 | 143.000000 | 228.000000 | 339.00 | 551.00 | 728.0 |
avg_discount | 62988.0 | 0.721558 | 0.185427 | 0.0 | 0.3775 | 0.508989 | 0.611997 | 0.711856 | 0.809476 | 0.92 | 1.41 | 1.5 |
# Derive the six clustering features from the raw columns.
# Parse the date strings so that they can be subtracted from each other.
data["LOAD_TIME"] = pd.to_datetime(data["LOAD_TIME"])
data["FFP_DATE"] = pd.to_datetime(data["FFP_DATE"])
# Membership length: end of the observation window minus the join date.
# This is a timedelta column at this point.
data["入会时间"] = data["LOAD_TIME"] - data["FFP_DATE"]
# Fare of both years per kilometre flown; NaN whenever a yearly fare is NaN.
data["平均每公里票价"] = (data["SUM_YR_1"] + data["SUM_YR_2"]) / data["SEG_KM_SUM"]
# Gap between the longest and the average interval between flights.
data["时间间隔差值"] = data["MAX_INTERVAL"] - data["AVG_INTERVAL"]
# Give the remaining raw columns their feature names (returns a new frame).
deal_data = data.rename(
    columns={"FLIGHT_COUNT": "飞行次数", "SEG_KM_SUM": "总里程", "avg_discount": "平均折扣率"},
)
feature_columns = ["入会时间", "飞行次数", "平均每公里票价", "总里程", "时间间隔差值", "平均折扣率"]
# .copy() gives filter_data its own storage, so the column assignments made on
# it are ordinary writes instead of writes into a subset of deal_data (which
# raise SettingWithCopyWarning, hidden here by the global warnings filter).
filter_data = deal_data[feature_columns].copy()
# NOTE(review): rows with a missing yearly fare get a per-km price of 0 here,
# because the fares in `data` were never imputed - confirm this is intended.
filter_data["平均每公里票价"] = filter_data["平均每公里票价"].fillna(value=0)
filter_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 62988 entries, 0 to 62987
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 入会时间 62988 non-null timedelta64[ns]
1 飞行次数 62988 non-null int64
2 平均每公里票价 62988 non-null float64
3 总里程 62988 non-null int64
4 时间间隔差值 62988 non-null float64
5 平均折扣率 62988 non-null float64
dtypes: float64(3), int64(2), timedelta64[ns](1)
memory usage: 3.4 MB
# Preview the membership length (a timedelta column) as whole days.
filter_data['入会时间'].dt.days
0 2706
1 2597
2 2615
3 2047
4 1816
...
62983 1046
62984 1484
62985 2923
62986 418
62987 407
Name: 入会时间, Length: 62988, dtype: int64
# Replace the timedelta membership length by its whole-day count, which makes
# the column an integer column like the other numeric features.
membership_days = filter_data['入会时间'].dt.days
filter_data = filter_data.assign(**{'入会时间': membership_days})
# Create a StandardScaler and fit it on the feature table. Only the fit happens
# here; the bare fit() call is the last expression, so the fitted estimator is
# echoed as output.
from sklearn.preprocessing import StandardScaler
standard = StandardScaler()
standard.fit(filter_data)
StandardScaler()
# Apply the fitted scaler and wrap the resulting array in a DataFrame that
# carries the six feature names again, then print its summary.
scaled_values = standard.transform(filter_data)
S_data = pd.DataFrame(
    scaled_values,
    columns=["入会时间", "飞行次数", "平均每公里票价", "总里程", "时间间隔差值", "平均折扣率"],
)
S_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 62988 entries, 0 to 62987
Data columns (total 6 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 入会时间 62988 non-null float64
1 飞行次数 62988 non-null float64
2 平均每公里票价 62988 non-null float64
3 总里程 62988 non-null float64
4 时间间隔差值 62988 non-null float64
5 平均折扣率 62988 non-null float64
dtypes: float64(6)
memory usage: 2.9 MB
from sklearn.cluster import KMeans
# silhouette_score: mean silhouette coefficient over all samples.
from sklearn.metrics import silhouette_score
# silhouette_samples: silhouette coefficient of each individual sample.
from sklearn.metrics import silhouette_samples

# Fit KMeans for k = 2..9 on the standardised features and record, per k, the
# inertia (within-cluster sum of squares) and the mean silhouette score.
# - The n_jobs argument was removed from KMeans in scikit-learn 1.0, so
#   passing it raises TypeError; it is dropped here.
# - n_init=10 pins the number of restarts to the default that was in effect
#   while n_jobs still existed, so results do not depend on the newer
#   n_init='auto' default.
inertia = []
silhouette = []
for n_clusters in range(2, 10):
    cluster = KMeans(n_clusters=n_clusters, random_state=0, n_init=10).fit(S_data)
    inertia.append(cluster.inertia_)
    silhouette.append(silhouette_score(S_data, cluster.labels_))
print(inertia)
print(silhouette)
[300992.94143881754, 249961.1287967345, 212929.66150454507, 187429.24421259848, 170776.80489673465, 154981.14913712352, 145834.9083294653, 138235.00566447436]
[0.3592867371629582, 0.21454689862059148, 0.20674237627094663, 0.2213072501911795, 0.2103711574222313, 0.21673681639729542, 0.20137242231980962, 0.2065838067406841]
from matplotlib import pyplot as plt
# Elbow plot: inertia (SSE) against the number of clusters, to help pick k.
# Apply the ggplot style sheet first. style.use() overwrites the rcParams the
# sheet defines (ggplot ships its own font.size), so the overrides below must
# come after it to take effect.
plt.style.use('ggplot')
# Render Chinese text and the minus sign correctly.
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['font.size'] = 12.0
plt.rcParams['axes.unicode_minus'] = False
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1)
# The x values match the k range used when `inertia` was collected.
ax.plot(range(2, 10), inertia, marker="+")
ax.set_xlabel("n_clusters", fontsize=18)
ax.set_ylabel("inertia", fontsize=18)
fig.suptitle("KMeans", fontsize=20)
plt.show()
# Plot the mean silhouette score against the number of clusters, to help pick
# k. (The previous comment described this as the SSE plot, but the series
# drawn here is `silhouette`.)
# Apply the ggplot style sheet first. style.use() overwrites the rcParams the
# sheet defines (ggplot ships its own font.size), so the overrides below must
# come after it to take effect.
plt.style.use('ggplot')
# Render Chinese text and the minus sign correctly.
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['font.size'] = 12.0
plt.rcParams['axes.unicode_minus'] = False
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(1, 1, 1)
# The x values match the k range used when `silhouette` was collected.
ax.plot(range(2, 10), silhouette, marker="+")
ax.set_xlabel("n_clusters", fontsize=18)
ax.set_ylabel("silhouette score", fontsize=18)
fig.suptitle("KMeans", fontsize=20)
plt.show()
# Fit KMeans for k = 4, 6 and 8 and draw one radar chart per k that shows the
# standardised cluster centres together with the size of each cluster.
# NOTE(review): the loop indentation was lost in this export; this assumes that
# everything up to plt.show() belongs to the outer loop (one chart per k) and
# that only the plot/fill calls belong to the per-cluster loop - confirm.
feature = ["入会时间", "飞行次数", "平均每公里票价", "总里程", "时间间隔差值", "平均折扣率"]
N = len(feature)
# Spoke angles that split the circle evenly; they do not depend on k or on the
# cluster, so compute them once. The closed variant repeats the first angle so
# that each polygon is drawn as a closed ring.
angles = np.linspace(0, 2*np.pi, N, endpoint=False)
closed_angles = np.concatenate((angles, [angles[0]]))
for n_clusters in range(4, 9, 2):
    # n_jobs was removed from KMeans in scikit-learn 1.0 and is dropped here.
    # random_state makes the cluster numbering reproducible between runs, and
    # n_init=10 pins the number of restarts to the former default.
    kmodel = KMeans(n_clusters=n_clusters, random_state=0, n_init=10)
    kmodel.fit(S_data)
    # Cluster sizes, ordered by cluster label so they line up with the centres.
    r1 = pd.Series(kmodel.labels_).value_counts().sort_index()
    # Cluster centres, one row per cluster.
    r2 = pd.DataFrame(kmodel.cluster_centers_)
    # Extreme coordinates over all centres, used for the radial axis limits.
    # (Named so that they no longer shadow the builtins max and min.)
    center_max = r2.values.max()
    center_min = r2.values.min()
    # Put the centres and the cluster size side by side (axis=1 joins columns).
    r = pd.concat([r2, r1], axis=1)
    r.columns = list(S_data.columns) + [u'类别数目']
    # Draw the chart.
    fig = plt.figure(figsize=(10, 8))
    ax = fig.add_subplot(111, polar=True)
    center_num = r.values
    for cluster_index, v in enumerate(center_num):
        # v holds the centre coordinates followed by the cluster size; repeat
        # the first coordinate to close the polygon.
        center = np.concatenate((v[:-1], [v[0]]))
        ax.plot(closed_angles, center, 'o-', linewidth=2,
                label="第%d簇人群,%d人" % (cluster_index + 1, v[-1]))
        ax.fill(closed_angles, center, alpha=0.25)
    # Label each spoke with its feature name.
    ax.set_thetagrids(angles * 180/np.pi, feature, fontsize=15)
    # Radial range with a small margin around the extreme centre coordinates.
    ax.set_ylim(center_min - 0.1, center_max + 0.1)
    plt.title('客户群特征分析图', fontsize=20)
    ax.grid(True)
    plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.0), ncol=1, fancybox=True, shadow=True)
    plt.show()
第一簇人群,9991人,最大的特点是时间间隔差值最大,分析可能是“季节型客户”,一年中在某个时间段需要多次乘坐飞机进行旅行,其他的时间则出行的不多,这类客户我们需要在保持的前提下,进行一定的发展;
第二簇人群,3157人,最大的特点就是平均每公里票价和平均折扣率都是最高的,应该是乘坐高等舱的商务人士,属于高消费人群;这类客户既是重点保持的对象,也是需要重点发展的对象,应该积极采取相关的优惠政策,使他们的乘坐次数增加;
第三簇人群,16245人,入会时间较短,每公里票价和平均折扣率属于较高的 属于新用户
第四簇人群,5221人, 总里程和飞行次数都是最多的,而且平均每公里票价也较高,是重点保持对象
第五簇人群,14357人,最大的特点就是入会时间较长,属于老客户。按理说老客户的平均折扣率应该较高,但观测窗口内的平均折扣率较低,而且总里程和飞行次数都不高,分析可能是已流失的客户;
第六簇人群,14027人,各方面的数据都是比较低的,属于一般或低价值用户