"""Airline customer segmentation, part 1: cleaning, features, scaling, K-means.

Reads ``air_data.csv``, removes rows with missing or all-zero fares, derives
the five features L, R, F, M and C, min-max scales them and clusters the
customers with K-means.
"""
import numpy as np
import pandas as pd

# Number of customer groups requested from K-means.
N_CLUSTERS = 5


def months_between(start, end):
    """Return the number of whole 30-day periods from *start* to *end*.

    Args:
        start: Series of date-like values (anything ``pd.to_datetime`` parses).
        end: Series of date-like values aligned with *start*.

    Returns:
        Series holding ``floor(days / 30)`` for each pair. The time of day is
        ignored, so only calendar dates contribute to the difference.
    """
    # normalize() zeroes the time of day while keeping a datetime64 column,
    # so the subtraction stays vectorised (no per-row Python date objects).
    start_day = pd.to_datetime(start).dt.normalize()
    end_day = pd.to_datetime(end).dt.normalize()
    return (end_day - start_day).dt.days // 30


def data_scal(data):
    """Min-max scale the L, R, F, M and C columns of *data* in place.

    For each feature ``X`` a new column ``X标`` is added holding
    ``(X - min) / (max - min)``. A constant column (``max == min``) is mapped
    to 0 instead of the NaN that ``0 / 0`` would produce.

    Args:
        data: DataFrame containing numeric columns 'L', 'R', 'F', 'M', 'C'.

    Returns:
        The same DataFrame object, with the five scaled columns added.
    """
    for name in ('L', 'R', 'F', 'M', 'C'):
        column = data[name]
        lowest = column.min()
        span = column.max() - lowest
        if span == 0:
            # Constant column: avoid 0/0 so downstream clustering gets 0, not NaN.
            span = 1
        data[name + '标'] = (column - lowest) / span
    return data


if __name__ == "__main__":
    # Imported here so the helpers above can be imported without the
    # plotting / machine-learning stack; when run as a script these names
    # are still module-level globals.
    import matplotlib.pyplot as plt
    from sklearn.cluster import KMeans

    # 1. Missing-value and outlier handling
    data = pd.read_csv(r'air_data.csv', encoding='gb18030')
    print(data.shape)

    # Drop rows whose fare columns (SUM_YR_1 / SUM_YR_2) are missing.
    data.dropna(axis=0, how='any', subset=['SUM_YR_1', 'SUM_YR_2'], inplace=True)
    print(data.shape)

    # Drop rows where both fare columns are zero.
    mask = (data['SUM_YR_1'] == 0) & (data['SUM_YR_2'] == 0)
    data.drop(labels=data.index[mask], inplace=True, axis=0)
    print(data.shape)
    # Shapes recorded by the original author:
    #   (62988, 44) -> (62299, 44) -> (62044, 44)

    # 2. Extract the five features
    # (1) L: time from joining the programme to the end of the observation
    #     window, in 30-day units: L = LOAD_TIME - FFP_DATE.
    #     Author's sample: 90.2 days -> 90, 86.57 days -> 86, 87.17 days -> 87.
    data['L'] = months_between(data['FFP_DATE'], data['LOAD_TIME'])

    # (2) R: time from the last flight to the end of the observation window,
    #     in 30-day units: R = LAST_TO_END. Floor division is used on purpose:
    #     "/ 30" gave 0.033333, 0.233333, 0.366667, 3.233333 where "// 30"
    #     gives 0, 0, 0, 3.
    data['R'] = data['LAST_TO_END'] // 30

    # (3) F: number of flights within the observation window (FLIGHT_COUNT).
    data['F'] = data['FLIGHT_COUNT']

    # (4) M: total distance flown within the observation window (SEG_KM_SUM).
    data['M'] = data['SEG_KM_SUM']

    # (5) C: average discount rate within the observation window (avg_discount).
    data['C'] = data['avg_discount']

    # 3. Data standardisation (min-max scaling)
    data = data_scal(data)

    # 4. K-means clustering on the scaled features
    x = data[['L标', 'R标', 'F标', 'M标', 'C标']]
    # random_state makes the cluster numbering reproducible between runs;
    # n_init=10 is stated explicitly because newer scikit-learn releases
    # changed the default.
    kms = KMeans(n_clusters=N_CLUSTERS, n_init=10, random_state=0)
    y = kms.fit_predict(x)
def cluster_centers(data, feature_cols, label_col='index1'):
    """Return the per-cluster mean of *feature_cols*.

    Args:
        data: DataFrame holding the feature columns and the label column.
        feature_cols: list of column names to average.
        label_col: name of the column holding the cluster label of each row.

    Returns:
        DataFrame indexed by cluster label with one column per feature, plus
        a trailing copy of the first feature named ``<first feature>2``.
    """
    center = data[list(feature_cols) + [label_col]].groupby(by=label_col).mean()
    # Repeat the first feature as the last column (e.g. 'L标' -> 'L标2').
    center[feature_cols[0] + '2'] = center[feature_cols[0]]
    return center


def radar_angles(n_axes):
    """Return ``(tick_angles, closed_angles)`` for a radar chart.

    ``tick_angles`` holds *n_axes* evenly spaced angles in ``[0, 2*pi)``;
    ``closed_angles`` is the same array with the first angle appended so a
    polygon drawn along it ends where it started.
    """
    tick_angles = np.linspace(0, 2 * np.pi, n_axes, endpoint=False)
    closed_angles = np.concatenate((tick_angles, tick_angles[:1]))
    return tick_angles, closed_angles


def plot_cluster_radar(center, labels):
    """Draw one filled radar polygon per row of *center* and show the figure.

    Args:
        center: DataFrame of cluster centres; only the columns named in
            *labels* are plotted.
        labels: feature column names, used both to select the values and as
            the angular tick labels.
    """
    tick_angles, closed_angles = radar_angles(len(labels))
    plt.figure()
    # Iterate over the rows actually present instead of a fixed count, so the
    # chart follows whatever number of clusters was produced.
    for row in center[list(labels)].values:
        closed_row = np.concatenate((row, row[:1]))  # close the polygon
        plt.polar(closed_angles, closed_row)
        plt.fill(closed_angles, closed_row, alpha=0.25)  # translucent fill
    plt.xticks(tick_angles, labels)
    plt.show()


if __name__ == "__main__":
    # Attach the cluster label of every row.
    data['index1'] = y

    # Cluster centres: mean of each scaled feature within a cluster.
    labels = ['L标', 'R标', 'F标', 'M标', 'C标']
    center = cluster_centers(data, labels)
    print(center)
    # Example output recorded by the original author:
    #              L标        R标        F标        M标        C标       L标2
    # index1
    # 0       0.155161  0.615177  0.010573  0.011052  0.420774  0.155161
    # 1       0.775075  0.083877  0.078814  0.045059  0.450950  0.775075
    # 2       0.125223  0.106254  0.046397  0.028963  0.418198  0.125223
    # 3       0.440348  0.097372  0.060490  0.035447  0.435148  0.440348
    # 4       0.640012  0.598472  0.012530  0.011577  0.432559  0.640012

    # Radar chart of the cluster centres.
    # SimHei is selected so the Chinese tick labels can be rendered.
    plt.rcParams['font.sans-serif'] = 'SimHei'
    # Keep the minus sign displayable with that font.
    plt.rcParams['axes.unicode_minus'] = False
    plot_cluster_radar(center, labels)
# k-means algorithm (airline customer analysis)
# Blog page footer: latest recommended article published on 2023-06-12 09:49:16