Python数据分析基础【图表篇】

最新推荐文章于 2024-08-27 14:11:53 发布

ai-jack

最新推荐文章于 2024-08-27 14:11:53 发布

阅读量1.1k

点赞数 2

文章标签： python 数据分析数据可视化

本文链接：https://blog.csdn.net/ajc_zh/article/details/105694202

版权

参考《Python数据分析》一书

图表篇目录

matplotlib画图

如果使用的是中文标签，就会发现中文标签无法正常显示。这是由于Matplotlib的默认字体是英文字体所致，解决它的办法是在作图之前手动指定默认字体为中文字体，如黑体(SimHei)。

plt.rcParams['font.sans-serif'] = ['SimHei'] # 这两句用来正常显示中文标签

另外，保存作图图像时，负号有可能显示不正常，可以通过以下代码解决:

plt. rcParams['axes.unicode_ minus'] = False # 解决保存图像是负号'-'显示为方块的问题

1、条形图

条形图表示一组分类数值，比如计数值。

import matplotlib.pyplot as plt #惯常的import 语句
plt.style.use('ggplot') #使用ggplot 样式表来模拟ggplot2 风格的图形，ggplot2 是一个常用的R 语言绘图包
customers = ['ABC', 'DEF', 'GHI', 'JKL', 'MNO'] #创建了一个客户索引列表，因为xticks 函数在设置标签时要求索引位置和标签值
customers_index = range(len(customers))
sale_amounts = [127, 90, 201, 111, 232]
fig = plt.figure()	# 使用matplotlib 绘图时，首先要创建一个基础图
ax1 = fig.add_subplot(1,1,1) #在基础图中创建一个或多个子图
ax1.bar(customers_index, sale_amounts, align='center', color='darkblue')
#创建条形图。customer_index 设置条形左侧在x 轴上的坐标。
#sale_amounts 设置条形的高度。align='center' 设置条形与标签中间对齐。color='darkblue' 设置条形的颜色
ax1.xaxis.set_ticks_position('bottom')
ax1.yaxis.set_ticks_position('left')
# 设置刻度线位置在x 轴底部和y 轴左侧，使图形的上部和右侧不显示刻度线

# 将条形的刻度线标签由客户索引值更改为实际的客户名称
plt.xticks(customers_index, customers, rotation=0, fontsize='small')
# rotation=0 表示刻度标签应该是水平的，而不是倾斜一个角度。fontsize='small' 将刻度标签的字体设为小字体
# 向图中添加x 轴标签、y 轴标签和图形标题
plt.xlabel('Customer Name')
plt.ylabel('Sale Amount')
plt.title('Sale Amount per Customer')
# 将统计图保存在当前文件夹中，文件名为bar_plot.png
# dpi=400 设置图形分辨率 [ 每英寸（1 英寸=2.54 厘米）的点数）]，bbox_inches='tight' 表示在保存图形时，将图形四周的空白部分去掉。
plt.savefig('bar_plot.png', dpi=400, bbox_inches='tight')
plt.show() # 在一个新窗口中显示统计图

在这里插入图片描述

2、直方图

直方图用来表示数值分布。常用的直方图包括频率分布、频率密度分布、概率分布和概率密度分布。

#!/usr/bin/env python3
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
mu1, mu2, sigma = 100, 130, 15
# 创建两个正态分布变量x1 和x2
# x1 的均值是100，x2 的均值是130，所以两个分布会有一些重叠，但不会是一个覆盖掉另一个
x1 = mu1 + sigma*np.random.randn(10000)
x2 = mu2 + sigma*np.random.randn(10000)
fig = plt.figure()
ax1 = fig.add_subplot(1,1,1)
# 为这两个变量创建两个柱形图，或称频率分布图
# bins=50 表示每个变量的值应该被分成50 份。normed=False 表示直方图显示的是频率分布，而不是概率密度。第一个直方图是暗绿色，第二个直方图是橙色。alpha=0.5 表示第二个直方图应该是透明的，这样我们就可以看到两个直方图重叠部分的暗绿色图。
n, bins, patches = ax1.hist(x1, bins=50, normed=False, color='darkgreen')
n, bins, patches = ax1.hist(x2, bins=50, normed=False, color='orange', alpha=0.5)
ax1.xaxis.set_ticks_position('bottom')
ax1.yaxis.set_ticks_position('left')
plt.xlabel('Bins')
plt.ylabel('Number of Values in Bin')
# 为基础图添加一个居中的标题，字体大小为14，粗体
fig.suptitle('Histograms', fontsize=14, fontweight='bold')
# 为子图添加一个居中的标题，位于基础图标题下面
ax1.set_title('Two Frequency Distributions')
plt.savefig('histogram.png', dpi=400, bbox_inches='tight')
plt.show()

在这里插入图片描述

3、折线图

折线图中的数值点在一条折线上。它通常用来表示数据随着时间发生的变化。

from numpy.random import randn
import matplotlib.pyplot as plt
plt.style.use('ggplot')
# 使用randn 创建绘图所用的随机数据
plot_data1 = randn(50).cumsum()
plot_data2 = randn(50).cumsum()
plot_data3 = randn(50).cumsum()
plot_data4 = randn(50).cumsum()
fig = plt.figure()
ax1 = fig.add_subplot(1,1,1)
# 创建4条折线
ax1.plot(plot_data1, marker=r'o', color=u'blue', linestyle='-',label='Blue Solid')
ax1.plot(plot_data2, marker=r'+', color=u'red', linestyle='--',label='Red Dashed')
ax1.plot(plot_data3, marker=r'*', color=u'green', linestyle='-.',label='Green Dash Dot')
ax1.plot(plot_data4, marker=r's', color=u'orange', linestyle=':',label='Orange Dotted')
ax1.xaxis.set_ticks_position('bottom')
ax1.yaxis.set_ticks_position('left')
ax1.set_title('Line Plots: Markers, Colors, and Linestyles')
plt.xlabel('Draw')
plt.ylabel('Random Number')
# 为统计图创建图例,loc='best' 指示matplotlib 根据图中的空白部分将图例放在最合适的位置
plt.legend(loc='best')
plt.savefig('line_plot.png', dpi=400, bbox_inches='tight')
plt.show()

在这里插入图片描述

4、散点图

散点图表示两个数值变量之间的相对关系，这两个变量分别位于两个数轴上。例如，身高与体重，或者供给与需求。散点图有助于识别出变量之间是否具有正相关或负相关。

import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
x = np.arange(start=1., stop=15., step=1.)
# 通过随机数使数据与一条直线和一条二次曲线稍稍偏离
y_linear = x + 5. * np.random.randn(14)
y_quadratic = x**2 + 10. * np.random.randn(14)
# 使用numpy 的polyfit 函数通过两组数据点(x, y_linear) 和(x, y_quadratic) 拟合出一条直线和一条二次曲线，
# 再使用poly1d 函数根据直线和二次曲线的参数生成一个线形方程和二次方程
fn_linear = np.poly1d(np.polyfit(x, y_linear, deg=1))
print(np.polyfit(x, y_linear, deg=1)) # [ 1.20914815 -0.39711886]
print(fn_linear) # 1.209 x - 0.3971
fn_quadratic = np.poly1d(np.polyfit(x, y_quadratic, deg=2))
print(np.polyfit(x, y_quadratic, deg=2))# [  0.61083443   6.11542735 -18.38511783]
print(fn_quadratic) # 0.6108 x^2 + 6.115 x - 18.39
fig = plt.figure()
ax1 = fig.add_subplot(1,1,1)
# 创建带有两条回归曲线的散点图,'bo' 表示(x, y_linear）点是蓝色圆圈，'go' 表示(x, y_quadratic) 点是绿色圆圈
# 'b-' 表示(x, y_linear) 点之间的线是一条蓝色实线，'g-' 表示(x, y_quadratic) 点之间的线是一条绿色实线。linewidth可以设置线的宽度
ax1.plot(x, y_linear, 'bo', x, y_quadratic, 'go', x, fn_linear(x), 'b-', x, fn_quadratic(x), 'g-', linewidth=2.)
ax1.xaxis.set_ticks_position('bottom')
ax1.yaxis.set_ticks_position('left')
ax1.set_title('Scatter Plots Regression Lines')
plt.xlabel('x')
plt.ylabel('f(x)')
# 设置了x 轴和y 轴的范围
plt.xlim((min(x)-1., max(x)+1.))
plt.ylim((min(y_quadratic)-10., max(y_quadratic)+10.))
plt.savefig('scatter_plot.png', dpi=400, bbox_inches='tight')
plt.show()

在这里插入图片描述

5、箱线图

箱线图可以表示出数据的最小值、第一四分位数、中位数、第三四分位数和最大值。箱体的下部和上部边缘线分别表示第一四分位数和第三四分位数，箱体中间的直线表示中位数。箱体上下两端延伸出去的直线（whisker，亦称为“须”）表示非离群点的最小值和最大值，在直线（须）之外的点表示离群点。

import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
N = 500
#  生成正态分布的概率密度随机数
normal = np.random.normal(loc=0.0, scale=1.0, size=N)
# loc：float
#     此概率分布的均值（对应着整个分布的中心centre）
# scale：float
#     此概率分布的标准差（对应于分布的宽度，scale越大越矮胖，scale越小，越瘦高）
# size：int or tuple of ints
#     输出的shape，默认为None，只输出一个值
# 例如：nd1 = np.random.normal(loc=1,scale=2,size=2)结果为：array([-0.46982446, -1.28956852])

# 对数正态分布
lognormal = np.random.lognormal(mean=0.0, sigma=1.0, size=N)
index_value = np.random.random_integers(low=0, high=N-1, size=N) #int类型的随机整数，介于low和high之间
normal_sample = normal[index_value]
lognormal_sample = lognormal[index_value]
box_plot_data = [normal,normal_sample,lognormal,lognormal_sample]
fig = plt.figure()
ax1 = fig.add_subplot(1,1,1)
box_labels = ['normal','normal_sample','lognormal','lognormal_sample'] # 保存每个箱线图的标签,下一行代码的boxplot函数中使用这个列表
# 使用boxplot 函数创建4 个箱线图
# notch=False 表示箱体是矩形，而不是在中间收缩
# ym='.' 表示离群点（就是直线之外的点）使用圆点，而不是默认的+符号
# vert=True 表示箱体是垂直的，不是水平的
# whis=1.5 设定了直线从第一四分位数和第三四分位数延伸出的范围,（例：Q3+whis*IQR，IQR就是四分位距，等于Q3-Q1)
# showmeans=True 表示箱体在显示中位数的同时也显示均值
# labels=box_labels 表示使用box_labels 中的值来标记箱线图
ax1.boxplot(box_plot_data, notch=False, sym='.', vert=True, whis=1.5, \
  showmeans=True, labels=box_labels)
ax1.xaxis.set_ticks_position('bottom')
ax1.yaxis.set_ticks_position('left')
ax1.set_title('Box Plots: Resampling of Two Distributions')
ax1.set_xlabel('Distribution')
ax1.set_ylabel('Value')
plt.savefig('box_plot.png', dpi=400, bbox_inches='tight')
plt.show()

在这里插入图片描述

pandas画图

pandas 通过提供一个可以作用于序列和数据框的函数plot，简化了基于序列和数据框中的数据创建图表的过程。plot 函数默认创建折线图，还可以通过设置参数kind 创建其他类型的图表。
比如六边箱图（hexagonal bin plot）、矩阵散点图、密度图、Andrews 曲线图、平行坐标图、延迟图、自相关图和自助抽样图。

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
# 创建了一个基础图和两个并排放置的子图
fig, axes = plt.subplots(nrows=1, ncols=2)
# 使用ravel 函数将两个子图分别赋给两个变量ax1 和ax2，就不必使用行和列的索引（例axes[0,0]和axes[0,1]）来引用子图
ax1, ax2 = axes.ravel()
data_frame = pd.DataFrame(np.random.rand(5, 3),
index=['Customer 1', 'Customer 2', 'Customer 3', 'Customer 4', 'Customer 5'],
columns=pd.Index(['Metric 1', 'Metric 2', 'Metric 3'], name='Metrics'))
# print(data_frame)
# Metrics     Metric 1  Metric 2  Metric 3
# Customer 1  0.322053  0.232865  0.041368
# Customer 2  0.008982  0.741461  0.515989
# Customer 3  0.509157  0.063601  0.915268
# Customer 4  0.965545  0.959176  0.408061
# Customer 5  0.572430  0.730670  0.833869

# 在左侧子图中创建了一个条形图
data_frame.plot(kind='bar', ax=ax1, alpha=0.75, title='Bar Plot')
# 使用matplotlib的函数来设置x轴和y轴标签的旋转角度和字体大小
plt.setp(ax1.get_xticklabels(), rotation=45, fontsize=10)
plt.setp(ax1.get_yticklabels(), rotation=0, fontsize=10)
ax1.set_xlabel('Customer')
ax1.set_ylabel('Value')
ax1.xaxis.set_ticks_position('bottom')
ax1.yaxis.set_ticks_position('left')
# 为箱线图单独创建了一个颜色字典
colors = dict(boxes='DarkBlue', whiskers='Gray', medians='Red', caps='Black')
# print(colors)
# {'boxes': 'DarkBlue', 'whiskers': 'Gray', 'medians': 'Red', 'caps': 'Black'}

# 在右侧子图中创建了一个箱线图，使用colors 变量为箱线图各部分着色，并将离群点的形状设置为红色圆点
data_frame.plot(kind='box', color=colors, sym='r.', ax=ax2, title='Box Plot')
plt.setp(ax2.get_xticklabels(), rotation=45, fontsize=10)
plt.setp(ax2.get_yticklabels(), rotation=0, fontsize=10)
ax2.set_xlabel('Metric')
ax2.set_ylabel('Value')
ax2.xaxis.set_ticks_position('bottom')
ax2.yaxis.set_ticks_position('left')
plt.savefig('pandas_plots.png', dpi=400, bbox_inches='tight')
plt.show()

在这里插入图片描述

ggplot

ggplot 是一个Python 绘图包，它基于R 语言的ggplot2 包和图形语法。ggplot 与其他绘图包的关键区别是它的语法将数据与实际绘图明确地分离开来。

#!/usr/bin/env python3
from ggplot import *
print(mtcars.head())
plt1 = ggplot(aes(x='mpg'), data=mtcars) +\
geom_histogram(fill='darkblue', binwidth=2) +\
xlim(10, 35) + ylim(0, 10) +\
xlab("MPG") + ylab("Frequency") +\
ggtitle("Histogram of MPG") +\
theme_matplotlib()
print(plt1)
print(meat.head())
plt2 = ggplot(aes(x='date', y='beef'), data=meat) +\
geom_line(color='purple', size=1.5, alpha=0.75) +\
stat_smooth(colour='blue', size=2.0, span=0.15) +\
xlab("Year") + ylab("Head of Cattle Slaughtered") +\
ggtitle("Beef Consumption Over Time") +\
theme_seaborn()
print(plt2)
print(diamonds.head())
plt3 = ggplot(diamonds, aes(x='carat', y='price', colour='cut')) +\
geom_point(alpha=0.5) +\
scale_color_gradient(low='#05D9F6', high='#5011D1') +\
xlim(0, 6) + ylim(0, 20000) +\
xlab("Carat") + ylab("Price") +\
ggtitle("Diamond Price by Carat and Cut") +\
theme_gray()
print(plt3)
ggsave(plt3, "ggplot_plots.png")

在这里插入图片描述

seaborn画图

seaborn 简化了在Python 中创建信息丰富的统计图表的过程。它是在matplotlib 基础上开发的，支持numpy 和pandas 中的数据结构，并集成了scipy 和statsmodels 中的统计程序。
eaborn 可以创建标准统计图，包括直方图、密度图、条形图、箱线图和散点图。它可以对成对变量之间的相关性、线性与非线性回归模型以及统计估计的不确定性进行可视化。它可以用来在评估变量时检查变量之间的关系，并可以建立统计图矩阵来显示复杂的关系。

import seaborn as sns
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pylab import savefig
sns.set(color_codes=True)
# 直方图
x = np.random.normal(size=100)
# 设置包括数据封箱个数、是否显示高斯核密度估计（kde）、在支撑轴上显示地毯图、创建坐标轴标签和标题，以及为图例生成的标签
sns.distplot(x, bins=20, kde=False, rug=True, label="Histogram wo Density")
sns.utils.axlabel("Value", "Frequency")
plt.title("Histogram of a Random Sample from a Normal Distribution")
plt.legend()

# 带有回归直线的散点图与单变量直方图
mean, cov = [5, 10], [(1, .5), (.5, 1)]
data = np.random.multivariate_normal(mean, cov, 200)
data_frame = pd.DataFrame(data, columns=["x", "y"])
# 使用jointplot 函数显示两个变量的一张散点图
# 其中带有一条回归直线，并为每个变量生成一张直方图
sns.jointplot(x="x", y="y", data=data_frame, kind="reg").set_axis_labels("x", "y")
plt.suptitle("Joint Plot of Two Variables with Bivariate and Univariate Graphs")
plt.show()

# 成对变量之间的散点图与单变量直方图
iris = sns.load_dataset("iris")
# 使用pairplot 函数生成数据集中每两个变量之间的双变量散点图，并为每个变量生成一张直方图
sns.pairplot(iris)

# 按照某几个变量生成的箱线图
tips = sns.load_dataset("tips")
# 使用factorplot 函数生成的箱线图，表示两个变量之间的关系
sns.factorplot(x="time", y="total_bill", hue="smoker",\
    col="day", data=tips, kind="box", size=4, aspect=.5)

# 带有bootstrap置信区间的线性回归模型
sns.lmplot(x="total_bill", y="tip", data=tips)

# 带有bootstrap置信区间的逻辑斯蒂回归模型
#seaborn
tips["big_tip"] = (tips.tip / tips.total_bill) > .15
sns.lmplot(x="total_bill", y="big_tip", data=tips, logistic=True, y_jitter=.03)\
    .set_axis_labels("Total Bill", "Big Tip")
plt.title("Logistic Regression of Big Tip vs. Total Bill")
plt.show()
savefig("seaborn_plots.png")