目录
1.数据探索与分析
# Basic data exploration: for every column, report the number of missing
# values together with the maximum and minimum, and export the table as CSV.
import pandas as pd

datafile = './air_data.csv'
resultfile = './result/explore.csv'

# Load the raw data set, decoding it explicitly as UTF-8.
data = pd.read_csv(datafile, encoding='utf-8')

# Summarise every column (include='all') with an empty percentiles list, then
# transpose so that each source column becomes one row of the summary.
explore = data.describe(percentiles=[], include='all').transpose()

# describe() only counts non-null entries, so the number of nulls is derived
# as the total row count minus that count.
explore['null'] = data.shape[0] - explore['count']

# Keep only the three statistics of interest and rename the headers.
explore = explore.loc[:, ['null', 'max', 'min']]
explore.columns = [u'空值数', u'最大值', u'最小值']

# Export the result.
explore.to_csv(resultfile)
# Distribution analysis of the data.
from datetime import datetime

import pandas as pd
import matplotlib.pyplot as plt

datafile = './air_data.csv'
data = pd.read_csv(datafile, encoding='utf-8')

# --- Customer information ---
# Parse every FFP_DATE string (year/month/day) and keep only its year.
ffp = data['FFP_DATE'].apply(lambda text: datetime.strptime(text, '%Y/%m/%d'))
ffp_year = ffp.map(lambda joined: joined.year)

# Select the SimHei font for the Chinese labels and turn off the Unicode
# minus sign, both in a single rcParams update.
plt.rcParams.update({
    "font.sans-serif": ["SimHei"],
    "axes.unicode_minus": False,
})

# Histogram of the number of members who joined in each year.
fig = plt.figure(figsize=(8, 5))
plt.hist(ffp_year, bins='auto', color='#0504aa')
plt.title('各年份会员入会人数')
plt.xlabel('年份')
plt.ylabel('入会人数')
plt.show()
plt.close()
# Count the members of each gender. Series.value_counts() replaces the
# deprecated top-level pd.value_counts() helper, and the column is counted
# once instead of once per gender.
gender_counts = data['GENDER'].value_counts()
male = gender_counts['男']
female = gender_counts['女']

# Pie chart of the member gender ratio.
fig = plt.figure(figsize=(7, 4))
plt.pie(
    [male, female],
    labels=['男', '女'],
    colors=['lightskyblue', 'lightcoral'],
    autopct='%1.1f%%',  # show each share as a percentage with one decimal
)
plt.title('会员性别比例')
plt.show()
plt.close()
# Count the members in tiers 4, 5 and 6. Series.value_counts() replaces the
# deprecated top-level pd.value_counts() helper, and the column is counted
# once instead of once per tier.
tier_counts = data['FFP_TIER'].value_counts()
lv_four = tier_counts[4]
lv_five = tier_counts[5]
lv_six = tier_counts[6]

# Bar chart of the number of members per tier.
fig = plt.figure(figsize=(8, 5))
bar_positions = list(range(3))
plt.bar(
    x=bar_positions,
    height=[lv_four, lv_five, lv_six],
    width=0.4,
    alpha=0.8,
    color='skyblue',
)
# Label the three bar positions with the tier they represent.
plt.xticks(bar_positions, ['4', '5', '6'])
plt.xlabel('会员等级')
plt.ylabel('会员人数')
plt.title('会员各级别人数')
plt.show()
plt.close()
# Member ages: discard missing values, then cast to 64-bit integers.
age = data['AGE'].dropna().astype('int64')

# Box plot of the member age distribution.
fig = plt.figure(figsize=(5, 10))
plt.boxplot(
    age,
    patch_artist=True,
    labels=['会员年龄'],
    boxprops=dict(facecolor='lightblue'),
)
plt.title('会员年龄分布箱线图')
# Show grid lines for the y axis only.
plt.grid(axis='y')
plt.show()
plt.close()
2.数据预处理
import pandas as pd

datafile = './air_data.csv'
cleanedfile = './result/data_cleaned.csv'

# Load the raw records.
airline_data = pd.read_csv(datafile, encoding='utf-8')
print('原始数据的形状为:', airline_data.shape)

# Drop every record whose fare is missing in either of the two years.
airline_notnull = airline_data.dropna(subset=['SUM_YR_1', 'SUM_YR_2'])
print('删除缺失记录后数据的形状为:', airline_notnull.shape)

# A record is kept when at least one of the two yearly fares is non-zero AND
# its total flown distance is positive with a non-zero average discount.
index1 = airline_notnull['SUM_YR_1'].ne(0)
index2 = airline_notnull['SUM_YR_2'].ne(0)
index3 = airline_notnull['SEG_KM_SUM'].gt(0) & airline_notnull['avg_discount'].ne(0)
# Flag ages above 100 for removal. The flag is negated below (rather than
# testing AGE <= 100) so that rows with a missing AGE are not flagged.
index4 = airline_notnull['AGE'].gt(100)
airline = airline_notnull.loc[(index1 | index2) & index3 & ~index4]
print('数据清洗后数据的形状为:', airline.shape)

# Save the cleaned data.
airline.to_csv(cleanedfile)
3.数据变换
# Attribute selection, feature construction and data standardisation.
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the cleaned data.
cleanedfile = './result/data_cleaned.csv'
airline = pd.read_csv(cleanedfile, encoding='utf-8')

# Select the attributes the model needs.
airline_selection = airline[['FFP_DATE', 'LOAD_TIME', 'LAST_TO_END',
                             'FLIGHT_COUNT', 'SEG_KM_SUM', 'avg_discount']]
print('筛选后的属性前5行为:\n', airline_selection.head())

# Build attribute L: the number of whole days between FFP_DATE and LOAD_TIME,
# divided by 30. The day count is read with the .dt.days accessor instead of
# formatting the timedelta as text and parsing the first token back into an
# integer; it yields the same number for valid dates and gives NaN rather
# than raising when a date is missing.
membership_span = (pd.to_datetime(airline_selection['LOAD_TIME'])
                   - pd.to_datetime(airline_selection['FFP_DATE']))
L = membership_span.dt.days / 30

# Combine L with the remaining selected columns (everything after the two
# date columns) and name the five attributes.
airline_features = pd.concat([L, airline_selection.iloc[:, 2:]], axis=1)
airline_features.columns = ['L', 'R', 'F', 'M', 'C']
print('构建后LRFMC属性前5行为:', airline_features.head())

# Standardise the data and persist the resulting array.
data = StandardScaler().fit_transform(airline_features)
np.savez('./result/airline_scale.npz', data)
print('标准化后LRFMC五个属性为:\n', data[:5, :])
4.模型构建
# Customer clustering (K-means) followed by a radar chart of the centres.
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

if __name__ == '__main__':
    # Load the standardised data; the array was saved positionally, so it is
    # stored under the key 'arr_0'.
    airline_scale = np.load('./result/airline_scale.npz')['arr_0']

    # Number of cluster centres.
    k = 5

    # Build the model with the random seed fixed at 123. The former n_jobs
    # argument is not passed because current scikit-learn releases no longer
    # accept it; n_init is pinned to 10 so the number of initialisations does
    # not depend on the installed version's default.
    kmeans_model = KMeans(n_clusters=k, n_init=10, random_state=123)

    # Train the model.
    fit_kmeans = kmeans_model.fit(airline_scale)

    # Cluster centres.
    kmeans_cc = kmeans_model.cluster_centers_
    print('各类聚类中心为:\n', kmeans_cc)

    # Cluster label of every sample.
    kmeans_labels = kmeans_model.labels_
    print('各样本的类别标签为:\n', kmeans_labels)

    # Number of samples in each cluster.
    r1 = pd.Series(kmeans_labels).value_counts()
    print('最终每个类别的数目为:\n', r1)

    # Put the centres in a data frame indexed by cluster label. Row i of
    # cluster_centers_ is the centre of label i, so the index is 0..k-1;
    # using the order in which labels first appear among the samples would
    # attach the wrong label to each centre.
    labels = ['ZL', 'ZR', 'ZF', 'ZM', 'ZC']
    cluster_center = pd.DataFrame(kmeans_cc, columns=labels)
    cluster_center.index = range(k)
    print(cluster_center)

    # --- Radar chart of the customer groups ---
    # Group names used as the legend entries (1-based for display).
    legen = ['客户群' + str(i + 1) for i in cluster_center.index]
    lstype = ['-', '--', (0, (3, 5, 1, 5, 1, 5)), ':', '-.']

    # A radar polygon must be closed, so the first column is appended again
    # at the end before converting to an ndarray.
    cluster_center = pd.concat([cluster_center, cluster_center[['ZL']]], axis=1)
    centers = np.array(cluster_center)

    # Split the circle into one direction per attribute and close the loop
    # by repeating the first angle.
    n = len(labels)
    angle = np.linspace(0, 2 * np.pi, n, endpoint=False)
    angle = np.concatenate((angle, [angle[0]]))

    # Font settings for the Chinese text.
    plt.rcParams["font.sans-serif"] = ["SimHei"]
    plt.rcParams["axes.unicode_minus"] = False

    # Draw on polar axes.
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111, polar=True)

    # One closed line per cluster, labelled with its group name. Line styles
    # are reused cyclically should k ever exceed the number of styles.
    for i, center in enumerate(centers):
        ax.plot(angle, center, linestyle=lstype[i % len(lstype)],
                linewidth=2, label=legen[i])

    # Attribute labels on the n distinct directions (the repeated closing
    # angle is skipped so the first direction is not labelled twice).
    ax.set_thetagrids(angle[:-1] * 180 / np.pi, labels)
    plt.title("客户特征分析雷达图")
    plt.legend()
    plt.savefig("./result/航空公司客户群特征分布图.jpg")
    plt.show()
    plt.close()