K-Means算法对航空公司客户进行分类
1.数据预处理:
import numpy as np
import pandas as pd
# Load the raw customer records; the CSV is read with the GB18030 codec.
data = pd.read_csv("./air_data.csv", encoding='gb18030', engine='python')
print(data.columns)
print(data.shape)

# Step 1: keep only the rows whose two fare columns are both non-null.
mask1 = data["SUM_YR_1"].notnull() & data["SUM_YR_2"].notnull()
data1 = data.loc[mask1, :]
print(data1.shape)

# Step 2: keep the rows with a non-zero fare in at least one of the two fare
# columns, a non-zero average discount and a positive total flown distance.
# The masks are built from data1 (not the unfiltered data) so the boolean
# indexer shares data1's index instead of relying on pandas re-aligning a
# longer Series when it is passed to data1.loc.
mask2 = data1["SUM_YR_1"] != 0
mask3 = data1["SUM_YR_2"] != 0
mask4 = data1["avg_discount"] != 0
mask5 = data1["SEG_KM_SUM"] > 0
mask = mask4 & mask5 & (mask2 | mask3)
airline = data1.loc[mask, :]
print(airline.shape)

# Step 3: select the columns needed to build the clustering features.
airline_selection = airline[
    [
        "FFP_DATE",
        "LOAD_TIME",
        "FLIGHT_COUNT",
        "LAST_TO_END",
        "avg_discount",
        "SEG_KM_SUM",
    ]
]
# 构建L特征
L = pd.to_datetime(airline_selection["LOAD_TIME"]) - pd.to_d