1、导入相关的包
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
2、导入数据
# Load the raw data from a CSV file into a DataFrame.
# NOTE(review): the path is an absolute, machine-specific Windows path; adjust
# it to wherever air_data.csv lives before running.
df = pd.read_csv(r"C:\Python\基于数据挖掘技术的航空公司会员客户价值研究报告\air_data.csv")
# Bare expression: displays the DataFrame when run as a notebook cell.
df
![df](https://img-blog.csdnimg.cn/20200329093708600.PNG?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pfa2Fpeg==,size_16,color_FFFFFF,t_70)
# Load the feature-description workbook (its '属性名称' column is used below to
# relabel the raw data columns).
# NOTE(review): absolute, machine-specific Windows path; reading .xlsx also
# requires an Excel engine to be installed — confirm in your environment.
df_field = pd.read_excel(r"C:\Python\基于数据挖掘技术的航空公司会员客户价值研究报告\数据特征说明.xlsx")
# Bare expression: displays the table when run as a notebook cell.
df_field
![df_field](https://img-blog.csdnimg.cn/20200329094123971.PNG?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pfa2Fpeg==,size_16,color_FFFFFF,t_70)
# Work on a copy so the originally loaded `df` keeps its original column labels.
dfc = df.copy()
# Replace the column labels with the values of df_field['属性名称'].
# Assumes df_field has exactly one row per column of df, in the same order —
# TODO confirm; a length mismatch makes this assignment raise.
dfc.columns = df_field['属性名称']
# Bare expression: displays the relabelled DataFrame in a notebook cell.
dfc
![dfc](https://img-blog.csdnimg.cn/2020032909424183.PNG?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pfa2Fpeg==,size_16,color_FFFFFF,t_70)
3、选择模型指标
# Keep only the four columns used as model indicators; they are given the
# short labels R, F, D, K (in this order) right below.
df_rfm = dfc.loc[
    :,
    [
        '最后一次乘机时间至观察窗口末端时长',
        '飞行次数',
        '平均折扣率',
        '观测窗口总飞行公里数',
    ],
]
# One single-letter label per selected column.
df_rfm.columns = list('RFDK')
# Bare expression: displays the indicator table in a notebook cell.
df_rfm
3.1、缺失值处理
# Count the missing values in each indicator column.
df_rfm.isna().sum()
3.2、异常值处理
# Show summary statistics for each indicator column as a first look at
# possible outliers.
df_rfm.describe()
![df_rfm 描述性统计](https://img-blog.csdnimg.cn/20200329094609357.PNG?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pfa2Fpeg==,size_16,color_FFFFFF,t_70)
# Summary statistics again, this time reporting the 90th and 95th percentiles
# to inspect the upper tail of each indicator.
df_rfm.describe(percentiles=[0.9, 0.95])
![df_rfm 描述性统计（含 90%、95% 分位数）](https://img-blog.csdnimg.cn/20200329094644412.PNG?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L0pfa2Fpeg==,size_16,color_FFFFFF,t_70)
F_095 = df_rfm['F'</