数据预处理——数据挖掘1

tick-tick

已于 2022-02-13 17:31:32 修改

阅读量1.2k

点赞数 3

文章标签：数据挖掘 python 数据分析

于 2022-02-13 00:52:55 首次发布

本文链接：https://blog.csdn.net/weixin_46443403/article/details/122903989

版权

将数据中“？”标志的缺失数据补齐。
采用“均值替换”的方法补齐缺失数据，使用每列数据的均值替换该列数据的缺失值。

# Import pandas to work with the spreadsheet data
import pandas as pd

# Load the raw data set. The task statement says missing entries are marked
# with "?", so tell pandas to parse that marker as NaN; otherwise the cells
# stay as strings and fillna() below has nothing to fill.
df = pd.read_excel('作业1_数据预处理数据集.xls', na_values='?')
# Replace each missing value with the mean of its own column.
# numeric_only=True keeps the mean computation to numeric columns so a
# text column cannot make df.mean() fail.
df.fillna(value=df.mean(numeric_only=True), inplace=True)
# Export the completed data set to a separate Excel file
df.to_excel("作业1_缺失值已填充.xlsx", index=False)

计算每个数字维度的四分位数，并做出盒图。
计算每个数字维度的四分位数：
方法一：

# Load the data set whose missing values have already been filled
df = pd.read_excel('作业1_缺失值已填充.xlsx')
# (column position, heading) for every numeric dimension, in print order.
# Headings after the first start with "\n" to separate the groups.
quartile_columns = [
    (2, "观测窗口总基本积分"),
    (3, "\n第二年总票价"),
    (4, "\n观测窗口总飞行公里数"),
    (5, "\n观测窗口总加权飞行公里数（Σ舱位折扣×航段距离）"),
    (6, "\n观测窗口季度平均基本积分累积"),
]
# Print the quartiles of each numeric dimension
for position, heading in quartile_columns:
    column = df.iloc[:, position]
    print(heading)
    # The upper quartile (上四分位数) is Q3, i.e. the 0.75 quantile, and the
    # lower quartile (下四分位数) is Q1, i.e. the 0.25 quantile.
    print("上四分位数为：" + str(column.quantile(0.75)))
    print("下四分位数为：" + str(column.quantile(0.25)))

方法二：

# Second approach: let describe() produce the summary table in one call
# instead of querying each column's quantiles by hand.
df = pd.read_excel('作业1_缺失值已填充.xlsx')
summary = df.describe()
print(summary)

盒图：

盒图采用python相关绘图包matplotlib.pyplot绘制
结果分析：数据主要集中在[0，25000]区间中，数据分布不均匀，存在个别离群点。

import matplotlib.pyplot as plt
from pylab import *
# Select SimHei as the sans-serif font (presumably so the Chinese title
# renders correctly -- confirm the font is installed on the target machine).
mpl.rcParams['font.sans-serif'] = ['SimHei']

# Draw one box per column of the filled data set.
df = pd.read_excel('作业1_缺失值已填充.xlsx')
df.plot.box(title="航空公司客户数据")
# Faint dashed grid on the current axes
grid_style = {"linestyle": "--", "alpha": 0.3}
plt.grid(**grid_style)
plt.show()

在这里插入图片描述

做出每个数字维度的直方图、分位数图、散布图。
直方图：
结果分析：数据主要集中在0-50000之间，数据分布不均匀，各属性数据区间分布大体相似。

df = pd.read_excel('作业1_缺失值已填充.xlsx')
# Histogram of the column at position 6 over explicit bin edges
# (the last bin, 150000-250000, is wider than the others).
bin_edges = [0, 50000, 100000, 150000, 250000]
quarterly_points = df.iloc[:, 6]
plt.hist(quarterly_points, bins=bin_edges)
plt.xlabel("数值区间")  # x-axis label
plt.ylabel("次数")  # y-axis label
plt.title("观测窗口季度平均基本积分累积直方图")  # figure title
plt.grid(True, linestyle='--', alpha=0.5)
plt.show()

在这里插入图片描述

分位数图：
结果分析：数据主要集中在0-20000，分布不均匀

df = pd.read_excel('作业1_缺失值已填充.xlsx')
# Column at position 2, sorted ascending once and reused for the plot.
points = df.iloc[:, 2]
ordered = points.sort_values().to_numpy()
# The f-value of the i-th smallest observation is i / n. Take n from the
# data instead of hard-coding the row count, so the x and y arrays always
# have matching lengths.
count = len(ordered)
f_values = (np.arange(count) + 1) / count
plt.scatter(f_values, ordered, s=0.5)
# Major x ticks every 0.25 so the quartile positions are marked
x_major_locator = MultipleLocator(0.25)
ax = plt.gca()
ax.xaxis.set_major_locator(x_major_locator)
plt.xlim(0, 1)
# Place the Q1 / median / Q3 labels at the actual quantile values.
# Indexing the sorted Series with [24], [49], [74] would look rows up by
# their original index label, which is unrelated to the quartiles.
plt.text(0.25, points.quantile(0.25), "Q1", color="r")
plt.text(0.50, points.quantile(0.50), "中位数", color="r")
plt.text(0.75, points.quantile(0.75), "Q3", color="r")
# Figure title
plt.title("观测窗口总基本积分累积分位数图")
plt.xlabel("f-值")  # x-axis name
plt.ylabel("数据")  # y-axis name
plt.show()

在这里插入图片描述

散布图：
可见数据集中在0-50000之间，存在小部分离群点

df = pd.read_excel('作业1_缺失值已填充.xlsx')
# Plot each row's value against its row position. The row count comes from
# the data, so the x and y arrays always have the same length.
row_count = len(df)
# NOTE(review): this plots the column at position 2, while the y-label and
# title below name the quantity that the quartile section associates with
# position 6 -- confirm which column is intended.
plt.scatter(np.arange(row_count), df.iloc[:, 2], edgecolor='blue', s=2)
# y-axis label
plt.ylabel("观测窗口季度平均基本积分累积")
# Figure title
plt.title("观测窗口季度平均基本积分累积散布图")
# x spans all rows; y is capped at 300000
plt.axis([0, row_count, 0, 300000])
plt.grid(True, linestyle='--', alpha=0.5)
plt.show()

在这里插入图片描述

按各个属性对数据进行最小-最大规范化和z-score规范化。
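使用python语言，按以下公式编写代码，对数据进行最小-最大规范化和z-score规范化：最小-最大规范化为 $v' = \frac{v - \min_A}{\max_A - \min_A}$，z-score规范化为 $v' = \frac{v - \bar{A}}{\sigma_A}$，其中 $\min_A$、$\max_A$、$\bar{A}$、$\sigma_A$ 分别为属性 $A$ 的最小值、最大值、均值和标准差。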
最小-最大规范化：

# Min-max normalization
def _min_max_scale(column):
    """Return (column - min) / (max - min) for one column of values."""
    lowest = column.min()
    highest = column.max()
    return (column - lowest) / (highest - lowest)


# Apply the scaling to the columns at positions 2..6 of df and print each
# result under its own heading.
a1 = _min_max_scale(df.iloc[:, 2])
print("总基本积分最小-最大规范化：" + str(a1))
a2 = _min_max_scale(df.iloc[:, 3])
print("第二年总票价最小-最大规范化：" + str(a2))
a3 = _min_max_scale(df.iloc[:, 4])
print("总飞行公里数最小-最大规范化：" + str(a3))
a4 = _min_max_scale(df.iloc[:, 5])
print("总加权飞行公里数最小-最大规范化：" + str(a4))
a5 = _min_max_scale(df.iloc[:, 6])
print("季度平均基本积分累积最小-最大规范化：" + str(a5))

由于数据太多，省略显示如下：
在这里插入图片描述
z-score规范化：

# Zero-mean (z-score) normalization
def _z_score(column):
    """Return (column - mean) / std for one column of values."""
    center = column.mean()
    spread = column.std()
    return (column - center) / spread


# Apply the standardization to the columns at positions 2..6 of df and
# print each result under its own heading.
b1 = _z_score(df.iloc[:, 2])
print("总基本积分z-score规范化：" + str(b1))
b2 = _z_score(df.iloc[:, 3])
print("第二年总票价z-score规范化：" + str(b2))
b3 = _z_score(df.iloc[:, 4])
print("总飞行公里数z-score规范化：" + str(b3))
b4 = _z_score(df.iloc[:, 5])
print("总加权飞行公里数z-score规范化：" + str(b4))
b5 = _z_score(df.iloc[:, 6])
print("季度平均基本积分累积z-score规范化：" + str(b5))