数据基本信息的探索
第一步:掌握数据的实际情况(数据的空值数量,最大值,最小值,四分位数等等)
# Basic exploration of the data set.
import pandas as pd

# Input CSV; its first row holds the column labels.
data_file = '读取文件地址'
# Destination of the exploration summary table.
result_file = '.结果存入地址'

# The file must already be UTF-8 encoded (convert it with a text editor if needed).
data = pd.read_csv(data_file, encoding='utf-8')

# describe() produces the per-column summary; `percentiles` selects which
# quantiles are reported, and .T transposes so each data column becomes a row.
summary = data.describe(percentiles=[], include='all').T
# describe() only reports the non-null count, so derive the null count from it.
summary['null'] = len(data) - summary['count']

# Keep the null count, maximum and minimum. The header uses English names
# because Chinese header names caused problems in the output.
explore = summary[['null', 'max', 'min']].rename(columns={'null': 'null_num'})

# Export the result.
explore.to_csv(result_file)
上面的代码只导出了缺失值数量、最大值和最小值,要了解数据还需要进一步观察。
通常用画图的方式更为直观,其次在文章的最后补上各个颜色的名字,便于配色。
第二步:绘制各个属性的图片,更为直观的观察数据
第一类:直方图
# Histogram of the number of members who joined in each year.
# `datetime` parses the join dates and `plt` is the pyplot interface used by
# the plotting code; neither was imported before this point.
from datetime import datetime

import matplotlib.pyplot as plt

data_file = '数据文件位置'
# Load the data (the file is read as UTF-8).
data = pd.read_csv(data_file, encoding='utf-8')
# Parse the join-date column; values are formatted as year/month/day.
ffp = data['FFP_DATE'].apply(lambda x: datetime.strptime(x, '%Y/%m/%d'))
# Keep only the year of each join date.
ffp_year = ffp.map(lambda x: x.year)
# Set the canvas size.
fig = plt.figure(figsize=(8, 5))
# Font and character display settings: use the SimHei font for the Chinese
# labels and turn off the Unicode minus sign.
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False
plt.hist(ffp_year, bins='auto', color='#0504aa')
plt.xlabel('年份')
plt.ylabel('入会人数')
plt.title('各年份会员入会人数')
plt.show()
# close() has to be called; a bare `plt.close` only references the function
# and leaves the figure open.
plt.close()
第二类:饼图
# Gender ratio of the members.
# Count every gender once with Series.value_counts(); the top-level
# pd.value_counts() is deprecated and was being evaluated once per label.
gender_counts = data['GENDER'].value_counts()
male = gender_counts['男']
female = gender_counts['女']
# Draw the pie chart of the member gender ratio.
fig = plt.figure(figsize=(7, 4))
plt.pie(
    [male, female],
    labels=['男', '女'],
    colors=['lightskyblue', 'lightpink'],
    autopct='%1.1f%%',  # show each share as a percentage with one decimal
)
plt.title('标题')
plt.show()
plt.close()
第三类:条形图
# Number of members at each membership tier.
# Count all tiers once with Series.value_counts(); the top-level
# pd.value_counts() is deprecated and was being evaluated once per tier.
tiers = [4, 5, 6]
tier_counts = data['FFP_TIER'].value_counts()
lv_four, lv_five, lv_six = (tier_counts[tier] for tier in tiers)
# Draw the bar chart of member counts per tier.
fig = plt.figure(figsize=(8, 5))
# One bar per tier; the same positions are reused for the tick labels so the
# labels always line up with the bars.
positions = list(range(len(tiers)))
plt.bar(x=positions, height=[lv_four, lv_five, lv_six], width=0.4, alpha=0.8, color='hotpink')
plt.xticks(positions, tiers)
plt.xlabel('会员等级')
plt.ylabel('会员人数')
plt.title('会员各级别人数')
plt.show()
plt.close()
第四类:箱型图
# Box plot of the member age distribution.
print(data['AGE'])
# dropna() removes the missing ages (it is commonly used to clean out null or
# abnormal values); the remaining values are then cast to int64.
age = data['AGE'].dropna().astype('int64')
# Draw the box plot of member ages.
fig = plt.figure(figsize=(5, 10))
# Fill colour of the box.
box_style = {'facecolor': 'lightblue'}
plt.boxplot(age, patch_artist=True, labels=['会员年龄'], boxprops=box_style)
plt.title('会员年龄分布箱型图')
# Show grid lines for the y axis.
plt.grid(axis='y')
plt.show()
plt.close()
第三步:观察数据的相关性(个人认为可以放在异常数据清洗后面)
本部分采用的是热力图来查看数据之间的相关性。
1.找出所需要观察的属性整合成一个新的组合
2.计算相关性矩阵
(相关性矩阵分析:每个位置的取值范围为[-1, 1];对角线为A与A本身的关系均为1;数值大于0时,为正相关也就是A会与B同方向变动;数值小于0时,为负相关,也就是A与B反方向变动;数值等于0时,则表示不相关)
3.绘制热力图
# Correlation analysis.
# Gather the attributes of interest into a new data set. The explicit .copy()
# makes data_corr independent of `data`, so adding columns to it below does
# not trigger pandas' SettingWithCopyWarning.
data_corr = data[
    ['FFP_TIER', 'FLIGHT_COUNT', 'LAST_TO_END', 'SEG_KM_SUM', 'EXCHANGE_COUNT', 'Points_Sum']
].copy()
# fillna(0) replaces missing values with 0 (a dict of fill values also works).
# With inplace=True, fillna modifies the object itself; otherwise it returns
# a modified copy.
age1 = data['AGE'].fillna(0)
age_int = age1.astype('int64')
print(data_corr)
print(age_int[2])
# Add AGE as a column. Assigning through .loc['AGE'] would append a *row*
# labelled 'AGE' instead, leaving AGE out of the correlation matrix.
data_corr['AGE'] = age_int
# Add the join year (taken from the processed series, not the raw data).
data_corr['ffp_year'] = ffp_year
# Compute the Pearson correlation matrix.
dt_corr = data_corr.corr(method='pearson')
print('相关性矩阵为:\n{0}'.format(dt_corr))
# Draw the heat map.
import seaborn as sns
# Both axes carry the same labels, so use a square canvas.
plt.subplots(figsize=(9, 9))
sns.heatmap(dt_corr, annot=True, vmax=1, square=True, cmap='Blues')
plt.show()
plt.close()
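注:在添加新列的过程中有报warning,emmmm试了各样的方法还没有解决掉QAQ,但是结果还是正确的,希望有高手可以告知~蟹蟹(该warning通常是pandas的SettingWithCopyWarning:data_corr是从data中切出来的子集,直接给它添加新列时就会触发;提取列时在末尾加上.copy()得到独立的副本即可消除。)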
注释:数据以及项目背景均来自书《Python数据分析与挖掘实战(第二版)》
附录一:plt.rcParams属性总结
plt.rcParams['font.sans-serif'] = 'SimHei' # 设置字体
plt.rcParams['axes.unicode_minus'] = False # 字符显示
plt.rcParams['lines.linewidth'] = 3 # 线条宽度
plt.rcParams['lines.linestyle'] = '-.' # 线条样式
plt.rcParams['lines.color'] = 'blue' # 线条颜色
plt.rcParams['lines.marker'] = None # 默认标记
plt.rcParams['lines.markersize'] = 6 # 标记大小
#x轴,y轴的字体大小
plt.rcParams['xtick.labelsize']
plt.rcParams['ytick.labelsize']
#x轴,y轴的主刻度长度
plt.rcParams['xtick.major.size']
plt.rcParams['ytick.major.size']
plt.rcParams['axes.titlesize'] # 子图的标题大小
plt.rcParams['axes.labelsize'] # 子图的标签大小
plt.rcParams['figure.dpi'] # 图像分辨率
plt.rcParams['figure.figsize'] # 图像显示大小
plt.rcParams['savefig.dpi'] # 保存图片的分辨率
附录二:颜色目录
颜色名称 | 十六进制值 |
---|---|
‘aliceblue’ | ‘#F0F8FF’, |
‘antiquewhite’ | ‘#FAEBD7’, |
‘aqua’ | ‘#00FFFF’, |
‘aquamarine’ | ‘#7FFFD4’, |
‘azure’ | ‘#F0FFFF’, |
‘beige’ | ‘#F5F5DC’, |
‘bisque’ | ‘#FFE4C4’, |
‘black’ | ‘#000000’, |
‘blanchedalmond’ | ‘#FFEBCD’, |
‘blue’ | ‘#0000FF’, |
‘blueviolet’ | ‘#8A2BE2’, |
‘brown’ | ‘#A52A2A’, |
‘burlywood’ | ‘#DEB887’, |
‘cadetblue’ | ‘#5F9EA0’, |
‘chartreuse’ | ‘#7FFF00’, |
‘chocolate’ | ‘#D2691E’, |
‘coral’ | ‘#FF7F50’, |
‘cornflowerblue’ | ‘#6495ED’, |
‘cornsilk’ | ‘#FFF8DC’, |
‘crimson’ | ‘#DC143C’, |
‘cyan’ | ‘#00FFFF’, |
‘darkblue’ | ‘#00008B’, |
‘darkcyan’ | ‘#008B8B’, |
‘darkgoldenrod’ | ‘#B8860B’, |
‘darkgray’ | ‘#A9A9A9’, |
‘darkgreen’ | ‘#006400’, |
‘darkkhaki’ | ‘#BDB76B’, |
‘darkmagenta’ | ‘#8B008B’, |
‘darkolivegreen’ | ‘#556B2F’, |
‘darkorange’ | ‘#FF8C00’, |
‘darkorchid’ | ‘#9932CC’, |
‘darkred’ | ‘#8B0000’, |
‘darksalmon’ | ‘#E9967A’, |
‘darkseagreen’ | ‘#8FBC8F’, |
‘darkslateblue’ | ‘#483D8B’, |
‘darkslategray’ | ‘#2F4F4F’, |
‘darkturquoise’ | ‘#00CED1’, |
‘darkviolet’ | ‘#9400D3’, |
‘deeppink’ | ‘#FF1493’, |
‘deepskyblue’ | ‘#00BFFF’, |
‘dimgray’ | ‘#696969’, |
‘dodgerblue’ | ‘#1E90FF’, |
‘firebrick’ | ‘#B22222’, |
‘floralwhite’ | ‘#FFFAF0’, |
‘forestgreen’ | ‘#228B22’, |
‘fuchsia’ | ‘#FF00FF’, |
‘gainsboro’ | ‘#DCDCDC’, |
‘ghostwhite’ | ‘#F8F8FF’, |
‘gold’ | ‘#FFD700’, |
‘goldenrod’ | ‘#DAA520’, |
‘gray’ | ‘#808080’, |
‘green’ | ‘#008000’, |
‘greenyellow’ | ‘#ADFF2F’, |
‘honeydew’ | ‘#F0FFF0’, |
‘hotpink’ | ‘#FF69B4’, |
‘indianred’ | ‘#CD5C5C’, |
‘indigo’ | ‘#4B0082’, |
‘ivory’ | ‘#FFFFF0’, |
‘khaki’ | ‘#F0E68C’, |
‘lavender’ | ‘#E6E6FA’, |
‘lavenderblush’ | ‘#FFF0F5’, |
‘lawngreen’ | ‘#7CFC00’, |
‘lemonchiffon’ | ‘#FFFACD’, |
‘lightblue’ | ‘#ADD8E6’, |
‘lightcoral’ | ‘#F08080’, |
‘lightcyan’ | ‘#E0FFFF’, |
‘lightgoldenrodyellow’ | ‘#FAFAD2’, |
‘lightgreen’ | ‘#90EE90’, |
‘lightgray’ | ‘#D3D3D3’, |
‘lightpink’ | ‘#FFB6C1’, |
‘lightsalmon’ | ‘#FFA07A’, |
‘lightseagreen’ | ‘#20B2AA’, |
‘lightskyblue’ | ‘#87CEFA’, |
‘lightslategray’ | ‘#778899’, |
‘lightsteelblue’ | ‘#B0C4DE’, |
‘lightyellow’ | ‘#FFFFE0’, |
‘lime’ | ‘#00FF00’, |
‘limegreen’ | ‘#32CD32’, |
‘linen’ | ‘#FAF0E6’, |
‘magenta’ | ‘#FF00FF’, |
‘maroon’ | ‘#800000’, |
‘mediumaquamarine’ | ‘#66CDAA’, |
‘mediumblue’ | ‘#0000CD’, |
‘mediumorchid’ | ‘#BA55D3’, |
‘mediumpurple’ | ‘#9370DB’, |
‘mediumseagreen’ | ‘#3CB371’, |
‘mediumslateblue’ | ‘#7B68EE’, |
‘mediumspringgreen’ | ‘#00FA9A’, |
‘mediumturquoise’ | ‘#48D1CC’, |
‘mediumvioletred’ | ‘#C71585’, |
‘midnightblue’ | ‘#191970’, |
‘mintcream’ | ‘#F5FFFA’, |
‘mistyrose’ | ‘#FFE4E1’, |
‘moccasin’ | ‘#FFE4B5’, |
‘navajowhite’ | ‘#FFDEAD’, |
‘navy’ | ‘#000080’, |
‘oldlace’ | ‘#FDF5E6’, |
‘olive’ | ‘#808000’, |
‘olivedrab’ | ‘#6B8E23’, |
‘orange’ | ‘#FFA500’, |
‘orangered’ | ‘#FF4500’, |
‘orchid’ | ‘#DA70D6’, |
‘palegoldenrod’ | ‘#EEE8AA’, |
‘palegreen’ | ‘#98FB98’, |
‘paleturquoise’ | ‘#AFEEEE’, |
‘palevioletred’ | ‘#DB7093’, |
‘papayawhip’ | ‘#FFEFD5’, |
‘peachpuff’ | ‘#FFDAB9’, |
‘peru’ | ‘#CD853F’, |
‘pink’ | ‘#FFC0CB’, |
‘plum’ | ‘#DDA0DD’, |
‘powderblue’ | ‘#B0E0E6’, |
‘purple’ | ‘#800080’, |
‘red’ | ‘#FF0000’, |
‘rosybrown’ | ‘#BC8F8F’, |
‘royalblue’ | ‘#4169E1’, |
‘saddlebrown’ | ‘#8B4513’, |
‘salmon’ | ‘#FA8072’, |
‘sandybrown’ | ‘#F4A460’, |
‘seagreen’ | ‘#2E8B57’, |
‘seashell’ | ‘#FFF5EE’, |
‘sienna’ | ‘#A0522D’, |
‘silver’ | ‘#C0C0C0’, |
‘skyblue’ | ‘#87CEEB’, |
‘slateblue’ | ‘#6A5ACD’, |
‘slategray’ | ‘#708090’, |
‘snow’ | ‘#FFFAFA’, |
‘springgreen’ | ‘#00FF7F’, |
‘steelblue’ | ‘#4682B4’, |
‘tan’ | ‘#D2B48C’, |
‘teal’ | ‘#008080’, |
‘thistle’ | ‘#D8BFD8’, |
‘tomato’ | ‘#FF6347’, |
‘turquoise’ | ‘#40E0D0’, |
‘violet’ | ‘#EE82EE’, |
‘wheat’ | ‘#F5DEB3’, |
‘white’ | ‘#FFFFFF’, |
‘whitesmoke’ | ‘#F5F5F5’, |
‘yellow’ | ‘#FFFF00’, |
‘yellowgreen’ | ‘#9ACD32’ |