import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the raw airline customer records.
# NOTE(review): 'ansi' is only registered as a codec alias on Windows builds of
# Python; on other platforms this call is expected to fail with LookupError.
# Confirm the file's real encoding before changing it.
data = pd.read_csv(r'air_data.csv', encoding='ansi', sep=',', engine='python')
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would also print "None").
data.info()

# I. Data cleaning
# 1. Drop records whose fare is missing in either year.
data = data.dropna(subset=['SUM_YR_1', 'SUM_YR_2'], axis=0, how='any')
print(data.shape)

# 2. Keep only records with a non-zero fare, a non-zero average discount rate
#    and a total flown distance greater than zero.
# (1) Fare is non-zero in at least one of the two years.
conditon1 = (data['SUM_YR_1'] != 0) | (data['SUM_YR_2'] != 0)
# (2) Average discount rate is non-zero.
conditon2 = data['avg_discount'] != 0
# (3) Total flown kilometres is greater than zero.
conditon3 = data['SEG_KM_SUM'] > 0
conditon = conditon1 & conditon2 & conditon3
# Actually apply the filter (the mask used to be built and then discarded).
# .copy() gives an independent frame so later column assignments do not
# trigger chained-assignment warnings.
data = data.loc[conditon, :].copy()
# II. Feature construction
# 1. LRFMC model
# L: number of months between the membership date (FFP_DATE) and the end of
#    the observation window (LOAD_TIME).
data['LOAD_TIME'] = pd.to_datetime(data['LOAD_TIME'])
data['FFP_DATE'] = pd.to_datetime(data['FFP_DATE'])
# Dividing by np.timedelta64(1, 'M') is rejected by recent pandas releases
# because a month is not a fixed-length duration. Convert to days first and
# divide by the average Gregorian month length, which is the value NumPy used
# for the 'M' unit, so the resulting numbers stay the same.
AVG_DAYS_PER_MONTH = 365.2425 / 12
membership_days = (data['LOAD_TIME'] - data['FFP_DATE']) / pd.Timedelta(days=1)
data['L'] = membership_days / AVG_DAYS_PER_MONTH

# 2. Rename the remaining source columns to their R / F / M / C model names.
data = data.rename(
    columns={
        'LAST_TO_END': 'R',
        'FLIGHT_COUNT': 'F',
        'SEG_KM_SUM': 'M',
        'avg_discount': 'C',
    }
)

# 3. Select the five model features.
features = data.loc[:, ['L', 'R', 'F', 'M', 'C']]
print(features.head())
# III. Standardization (z-score): rescale every feature column to zero mean
# and unit variance.
from sklearn.preprocessing import StandardScaler
# Instantiate the scaler, then fit and transform in one step. The result is a
# NumPy array, so DataFrame methods are no longer available on it.
scaler = StandardScaler()
fetaure_data = scaler.fit_transform(features.values)
# Preview the first five standardized rows.
print(fetaure_data[:5, :])
# IV. K-Means clustering
from sklearn.cluster import KMeans
# 1. Instantiate the model with five clusters.
#    random_state is fixed so repeated runs give the same clusters, and n_init
#    is set explicitly so the number of restarts does not depend on the
#    installed scikit-learn version's default.
clf = KMeans(n_clusters=5, n_init=10, random_state=0)
# 2. Fit on the standardized feature matrix.
clf.fit(fetaure_data)
# 3. Inspect the cluster centres (one row per cluster, one column per feature).
center = clf.cluster_centers_
print('最新的聚类中心点:\n', center)
# 4. Inspect the cluster label assigned to every record.
print(clf.labels_)
# Radar (polar) chart with one closed polygon per cluster centre.
plt.figure()
# One evenly spaced angle per feature column of `center`.
x = np.linspace(0, 2 * np.pi, center.shape[1], endpoint=False)
# Close the angle sequence by repeating the first angle at the end.
x_bihe = np.concatenate((x, x[:1]), axis=0)
# Close every polygon by appending its first value as an extra column.
center_bihe = np.concatenate((center, center[:, :1]), axis=1)
# Draw the outline and a translucent fill for each cluster centre.
for center_row in center_bihe:
    plt.polar(x_bihe, center_row, marker='o', linestyle='-.')
    plt.fill(x_bihe, center_row, alpha=0.2)
# Label the angular axis once, after the polar axes exist.
plt.xticks(x, ['L', 'R', 'F', 'M', 'C'])
plt.show()
# K-Means algorithm: airline customer data analysis
# (Source page note: latest recommended article published 2023-12-31 01:48:07)