航空客运信息挖掘

--star

于 2023-05-23 00:00:00 发布

阅读量262

点赞数

分类专栏：商务智能 | 文章标签：python matplotlib 数据分析

本文链接：https://blog.csdn.net/m0_67240604/article/details/130750319

版权

商务智能专栏收录该内容

4 篇文章 0 订阅

订阅专栏

1.数据探索与分析

# Basic exploration of the raw data set:
# report, for every column, the number of missing values and the max / min.

import pandas as pd

datafile = './air_data.csv'
resultfile = './result/explore.csv'

# Load the raw data, decoding the file as UTF-8.
data = pd.read_csv(datafile, encoding='utf-8')

# describe() is called with an empty percentile list and include='all', then
# transposed so that each row of the summary corresponds to one data column.
summary = data.describe(percentiles=[], include='all').T

# describe() only counts non-null entries, so the number of nulls has to be
# derived by hand: total number of rows minus the non-null count.
summary = summary.assign(null=len(data) - summary['count'])

# Keep the three statistics of interest and rename them to the report headers.
explore = summary[['null', 'max', 'min']].rename(
    columns={'null': '空值数', 'max': '最大值', 'min': '最小值'}
)

# Export the summary table.
explore.to_csv(resultfile)

# Distribution analysis of the customer attributes: enrolment year, gender,
# membership tier and age, each shown in its own figure.
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime  # no longer used below; kept so the name stays in scope

datafile = './air_data.csv'
data = pd.read_csv(datafile, encoding='utf-8')

# Select the SimHei font and disable the Unicode minus sign before any
# figure is created, so that all four figures share the same text settings
# (the labels and titles below are Chinese).
plt.rcParams["font.sans-serif"] = ["SimHei"]
plt.rcParams["axes.unicode_minus"] = False

# --- Enrolment year -------------------------------------------------------
# Parse the enrolment date in one vectorised call (instead of strptime per
# row) and keep only its year component.
ffp = pd.to_datetime(data['FFP_DATE'], format='%Y/%m/%d')
ffp_year = ffp.dt.year

# Histogram of the number of members enrolled per year.
plt.figure(figsize=(8, 5))
plt.hist(ffp_year, bins='auto', color='#0504aa')
plt.xlabel('年份')
plt.ylabel('入会人数')
plt.title('各年份会员入会人数')
plt.show()
plt.close()

# --- Gender ---------------------------------------------------------------
# Count each gender once with Series.value_counts (the top-level
# pd.value_counts helper is deprecated); a missing category counts as 0
# instead of raising KeyError.
gender_counts = data['GENDER'].value_counts()
male = gender_counts.get('男', 0)
female = gender_counts.get('女', 0)

# Pie chart of the gender ratio.
plt.figure(figsize=(7, 4))
plt.pie([male, female], labels=['男', '女'], colors=['lightskyblue', 'lightcoral'], autopct='%1.1f%%')
plt.title('会员性别比例')
plt.show()
plt.close()

# --- Membership tier ------------------------------------------------------
# Same approach for the tiers 4, 5 and 6: one count table, three lookups.
tier_counts = data['FFP_TIER'].value_counts()
lv_four = tier_counts.get(4, 0)
lv_five = tier_counts.get(5, 0)
lv_six = tier_counts.get(6, 0)

# Bar chart of the number of members per tier.
plt.figure(figsize=(8, 5))
plt.bar(x=range(3), height=[lv_four, lv_five, lv_six], width=0.4, alpha=0.8, color='skyblue')
plt.xticks(range(3), ['4', '5', '6'])
plt.xlabel('会员等级')
plt.ylabel('会员人数')
plt.title('会员各级别人数')
plt.show()
plt.close()

# --- Age ------------------------------------------------------------------
# Drop missing ages before casting to integers (NaN cannot be cast).
age = data['AGE'].dropna()
age = age.astype('int64')

# Box plot of the age distribution.
plt.figure(figsize=(5, 10))
plt.boxplot(age, patch_artist=True, labels=['会员年龄'], boxprops={'facecolor': 'lightblue'})
plt.title('会员年龄分布箱线图')
# Draw horizontal grid lines along the y axis.
plt.grid(axis='y')
plt.show()
plt.close()

2.数据预处理

import pandas as pd

datafile = './air_data.csv'
cleanedfile = './result/data_cleaned.csv'

# Load the raw data.
airline_data = pd.read_csv(datafile, encoding='utf-8')
print('原始数据的形状为：', airline_data.shape)

# Drop every record whose fare is missing in either of the two years.
fare_present = airline_data['SUM_YR_1'].notnull() & airline_data['SUM_YR_2'].notnull()
airline_notnull = airline_data.loc[fare_present, :]
print('删除缺失记录后数据的形状为：', airline_notnull.shape)

# A record is kept when all of the following hold:
#   * the fare of at least one of the two years is non-zero,
#   * the total flown kilometres are positive and the average discount is
#     non-zero,
#   * the age is not greater than 100 (a missing age does not compare as
#     greater than 100, so such records are kept).
has_fare = (airline_notnull['SUM_YR_1'] != 0) | (airline_notnull['SUM_YR_2'] != 0)
has_flown = (airline_notnull['SEG_KM_SUM'] > 0) & (airline_notnull['avg_discount'] != 0)
over_age = airline_notnull['AGE'] > 100
airline = airline_notnull[has_fare & has_flown & ~over_age]
print('数据清洗后数据的形状为：', airline.shape)

# Persist the cleaned data.
airline.to_csv(cleanedfile)

3.数据变换

# Attribute selection, feature construction and standardisation.

import pandas as pd
import numpy as np

# Load the cleaned data.
cleanedfile = './result/data_cleaned.csv'
airline = pd.read_csv(cleanedfile, encoding='utf-8')

# Select the attributes needed for the LRFMC features.
airline_selection = airline[['FFP_DATE', 'LOAD_TIME', 'LAST_TO_END', 'FLIGHT_COUNT', 'SEG_KM_SUM', 'avg_discount']]
print('筛选后的属性前5行为：\n', airline_selection.head())

# Build attribute L: the time between enrolment (FFP_DATE) and LOAD_TIME,
# expressed in units of 30 days.  The whole-day count is read directly from
# the timedelta via .dt.days instead of formatting it as text and parsing
# the leading number back, which depended on the string representation.
membership_span = pd.to_datetime(airline_selection['LOAD_TIME']) - pd.to_datetime(airline_selection['FFP_DATE'])
L = membership_span.dt.days / 30

# Combine L with the remaining four selected attributes (columns 2 onwards).
airline_features = pd.concat([L, airline_selection.iloc[:, 2:]], axis=1)
airline_features.columns = ['L', 'R', 'F', 'M', 'C']
print('构建后LRFMC属性前5行为：', airline_features.head())

# Standardise the five features and store the result for the clustering step.
from sklearn.preprocessing import StandardScaler
data = StandardScaler().fit_transform(airline_features)
np.savez('./result/airline_scale.npz', data)
print('标准化后LRFMC五个属性为：\n', data[:5, :])

4.模型构建

#客户聚类（K-means聚类）
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans

# Column names of the standardised L, R, F, M, C features.
FEATURE_LABELS = ['ZL', 'ZR', 'ZF', 'ZM', 'ZC']


def build_cluster_center_frame(centers, columns=None):
    """Wrap the cluster centres in a DataFrame indexed by cluster label.

    Row ``i`` of ``centers`` is the centre of cluster ``i``, so the index is
    simply ``0..k-1``.  (Indexing the rows by the order in which labels first
    appear in the samples would attach centres to the wrong clusters.)

    Args:
        centers: 2-D array-like with one row per cluster.
        columns: Column names; defaults to ``FEATURE_LABELS``.

    Returns:
        A DataFrame with one row per cluster and one column per feature.
    """
    centers = np.asarray(centers)
    if columns is None:
        columns = FEATURE_LABELS
    return pd.DataFrame(centers, columns=list(columns), index=range(len(centers)))


def close_radar_polygon(centers):
    """Return the angles and values needed to draw closed radar polygons.

    The angles split the full circle evenly, one per feature.  Both the
    angles and every row of values get their first element appended again
    so that each plotted line ends where it started.

    Args:
        centers: 2-D array-like of shape (clusters, features).

    Returns:
        A tuple ``(angle, closed)`` where ``angle`` has ``features + 1``
        entries and ``closed`` has shape ``(clusters, features + 1)``.
    """
    centers = np.asarray(centers, dtype=float)
    n_axes = centers.shape[1]
    angle = np.linspace(0, 2 * np.pi, n_axes, endpoint=False)
    angle = np.concatenate((angle, angle[:1]))
    closed = np.concatenate((centers, centers[:, :1]), axis=1)
    return angle, closed


def plot_cluster_radar(cluster_center, savepath="./result/航空公司客户群特征分布图.jpg"):
    """Draw, save and show the radar chart of the cluster centres.

    Args:
        cluster_center: DataFrame as returned by ``build_cluster_center_frame``.
        savepath: File the figure is written to.
    """
    import matplotlib.pyplot as plt

    # Font settings go first so all text in the figure uses them.
    plt.rcParams["font.sans-serif"] = ["SimHei"]
    plt.rcParams["axes.unicode_minus"] = False

    labels = list(cluster_center.columns)
    # Customer-group names, used as the legend of the radar chart.
    legen = ['客户群' + str(i + 1) for i in cluster_center.index]
    lstype = ['-', '--', (0, (3, 5, 1, 5, 1, 5)), ':', '-.']
    angle, centers = close_radar_polygon(cluster_center.values)

    fig = plt.figure(figsize=(8, 6))
    # Polar axes for the radar chart.
    ax = fig.add_subplot(111, polar=True)

    # One closed line per cluster; line styles repeat if there are more
    # clusters than styles.
    for i, (row, name) in enumerate(zip(centers, legen)):
        ax.plot(angle, row, linestyle=lstype[i % len(lstype)], linewidth=2, label=name)

    # Label each axis once; the closing angle duplicates the first one and
    # therefore needs no tick of its own.
    ax.set_thetagrids(angle[:-1] * 180 / np.pi, labels)
    plt.title("客户特征分析雷达图")
    plt.legend()
    plt.savefig(savepath)
    plt.show()
    plt.close()


def main():
    """Cluster the standardised LRFMC data with K-means and plot the result."""
    # Load the standardised data.
    airline_scale = np.load('./result/airline_scale.npz')['arr_0']
    # Number of clusters.
    k = 5

    # Build the model with a fixed random seed of 123.  n_jobs is not passed:
    # scikit-learn deprecated it for KMeans in 0.23 and removed it in 1.0,
    # where passing it raises TypeError.
    kmeans_model = KMeans(n_clusters=k, random_state=123)
    # Train the model.
    kmeans_model.fit(airline_scale)

    # Cluster centres.
    kmeans_cc = kmeans_model.cluster_centers_
    print('各类聚类中心为：\n', kmeans_cc)
    # Cluster label of every sample.
    kmeans_labels = kmeans_model.labels_
    print('各样本的类别标签为：\n', kmeans_labels)
    # Number of samples per cluster.
    r1 = pd.Series(kmeans_labels).value_counts()
    print('最终每个类别的数目为：\n', r1)

    # Cluster centres as a table, indexed by cluster label.
    cluster_center = build_cluster_center_frame(kmeans_cc)
    print(cluster_center)

    # Radar chart of the customer groups.
    plot_cluster_radar(cluster_center)


if __name__ == '__main__':
    main()