2分钟学会python数据分析与机器学习知识点(三)
第四节、Matplotlib
1、matplotlib基本操作
1.1 pycharm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#首先画一个简单的图
#在notebook里先执行 %matplotlib inline,图像会直接内嵌显示,不需要再调用show方法
#常用颜色
#蓝色b 绿色g 红色r 青色c 品红m 黄色y 黑色k 白色w
#常用折线图形状
#- 实线 --虚线 -.虚点线 .点线 x乘号线 d菱形线 D实心菱形 +加号线 p五角线
#2上叉线 s正方点 h六边形点1 H六边形点2 o圆点 *星形 ,像素点 :点线
# plt.plot([1,2,3,4,5],[1,16,81,256,512],'r-.')
# plt.xlabel('x')
# plt.ylabel('y')
# plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
# plt.title('最基本的一个图')
# plt.xlabel('round',fontsize=24)
# plt.ylabel('value',fontsize=24)
#
# plt.show()
#
# numpy=np.arange(0,10,0.5)
# plt.plot(numpy,numpy,'r-')
# plt.plot(numpy,numpy**2,'c-')
# plt.plot(numpy,numpy**3,'gD')
# plt.show()
# Sample the sine curve on [-10, 10]; 50 points is linspace's default count,
# spelled out here so the resolution of the curve is visible.
x = np.linspace(-10, 10, num=50)
y = np.sin(x)
# plt.plot(x,y,'g-',linewidth=3.0,marker='o',markerfacecolor='r',markersize=10,alpha=1)
# plt.show()
#实现子图
#subplot的三位数参数依次是 行数、列数、子图序号:211表示两行一列布局中的第1个子图(其他例子:121 211 322)
# plt.subplot(211)
# plt.plot(x,y,'r')
# plt.subplot(212)
# plt.plot(x,y,'b')
# plt.show()
#
# Font setup so the Chinese title/labels and the minus sign display
# correctly: SimHei as the sans-serif font, unicode minus disabled.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})

# Draw y against x as a solid red line on the current axes, then label it.
axes = plt.gca()
axes.plot(x, y, 'r-')
axes.set_title('绘图')
axes.set_xlabel('x:横坐标', fontsize=14)
axes.set_ylabel('y:纵坐标', fontsize=14)
# Turn on the background grid.
axes.grid(True)
plt.show()
2、风格设置和条形图
2.1 pycharm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Show the names of all plot styles that matplotlib can apply.
available_styles = plt.style.available
print(available_styles)
#构造
# x=np.linspace(-10,10)
# y=np.sin(x)
# plt.style.use('ggplot')
# plt.plot(x,y,'g-',linewidth=3.0,marker='o',markerfacecolor='r',markersize=10,alpha=1)
# plt.show()
3、条形图
3.1 pycharm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Fix the random seed so the bar heights below are reproducible.
np.random.seed(0)
# Five bar positions (0..4) and five integer heights drawn from [-5, 5).
x = np.arange(0, 5)
y = np.random.randint(low=-5, high=5, size=5)
print(y)
#ncols=2表示一行放2个子图(2列)
#第一个图
# fig,axes = plt.subplots(ncols = 2)
# v_bars = axes[0].bar(x,y,color='red')
# h_bars = axes[1].barh(x,y,color='red')
#
# axes[0].axhline(0,color='grey',linewidth=2)
# axes[1].axvline(0,color='grey',linewidth=2)
# plt.show()
#第二个图
# fig,ax = plt.subplots()
# v_bars = ax.bar(x,y,color='lightblue')
# for bar,height in zip(v_bars,y):
# if height < 0:
# bar.set(edgecolor = 'darkred',color = 'green',linewidth = 3)
# plt.show()
#第三个图
# x = np.random.randn(100).cumsum()
# y = np.linspace(0,10,100)
#
# fig,ax = plt.subplots()
# ax.fill_between(x,y,color='lightblue')
# plt.show()
#第四个图
# x = np.linspace(0,10,200)
# y1 = 2*x +1
# y2 = 3*x +1.2
# y_mean = 0.5*x*np.cos(2*x) + 2.5*x +1.1
# fig,ax = plt.subplots()
# ax.fill_between(x,y1,y2,color='red')
# ax.plot(x,y_mean,color='black')
# plt.show()
#第五个图
# mean_values = [1,2,3]
# variance = [0.2,0.4,0.5]
# bar_label = ['bar1','bar2','bar3']
#
# x_pos = list(range(len(bar_label)))
# plt.bar(x_pos,mean_values,yerr=variance,alpha=0.3)
# max_y = max(zip(mean_values,variance))
# plt.ylim([0,(max_y[0]+max_y[1])*1.2])
# plt.ylabel('variable y')
# plt.xticks(x_pos,bar_label)
# plt.show()
#第六个图
# x1 = np.array([1,2,3])
# x2 = np.array([2,2,3])
#
# bar_labels = ['bat1','bar2','bar3']
# fig = plt.figure(figsize = (8,6))
# y_pos = np.arange(len(x1))
# y_pos = [x for x in y_pos]
#
# plt.barh(y_pos,x1,color='g',alpha = 0.5)
# plt.barh(y_pos,-x2,color='b',alpha = 0.5)
#
# plt.xlim(-max(x2)-1,max(x1)+1)
# plt.ylim(-1,len(x1)+1)
# plt.show()
#第七个图
# green_data = [1, 2, 3]
# blue_data = [3, 2, 1]
# red_data = [2, 3, 3]
# labels = ['group 1', 'group 2', 'group 3']
#
# pos = list(range(len(green_data)))
# width = 0.2
# fig, ax = plt.subplots(figsize=(8,6))
#
# plt.bar(pos,green_data,width,alpha=0.5,color='g',label=labels[0])
# plt.bar([p+width for p in pos],blue_data,width,alpha=0.5,color='b',label=labels[1])
# plt.bar([p+width*2 for p in pos],red_data,width,alpha=0.5,color='r',label=labels[2])
# plt.show()
#第八个图
# data = range(200, 225, 5)
#
# bar_labels = ['a', 'b', 'c', 'd', 'e']
#
# fig = plt.figure(figsize=(10,8))
#
# y_pos = np.arange(len(data))
#
# plt.yticks(y_pos, bar_labels, fontsize=16)
#
# bars = plt.barh(y_pos,data,alpha = 0.5,color='g')
#
# plt.vlines(min(data),-1,len(data)+0.5,linestyle = 'dashed')
# for b,d in zip(bars,data):
# plt.text(b.get_width()+b.get_width()*0.05,b.get_y()+b.get_height()/2,'{0:.2%}'.format(d/min(data)))
# plt.show()
#第九个图
# mean_values = range(10,18)
# x_pos = range(len(mean_values))
#
# import matplotlib.colors as col
# import matplotlib.cm as cm
#
# cmap1 = cm.ScalarMappable(col.Normalize(min(mean_values),max(mean_values),cm.hot))
# cmap2 = cm.ScalarMappable(col.Normalize(0,20,cm.hot))
#
# plt.subplot(121)
# plt.bar(x_pos,mean_values,color = cmap1.to_rgba(mean_values))
#
# plt.subplot(122)
# plt.bar(x_pos,mean_values,color = cmap2.to_rgba(mean_values))
#
# plt.show()
# Plot 10: a bar chart in which every bar is filled with a different hatch.
patterns = ('-', '+', 'x', '\\', '*', 'o', 'O', '.')
# One bar per hatch pattern, with heights 1..len(patterns).
mean_value = range(1, len(patterns) + 1)
x_pos = list(range(len(mean_value)))
# plt.gca() returns the current Axes (not a Figure), so name it accordingly
# and draw through it.
ax = plt.gca()
bars = ax.bar(x_pos, mean_value, color='white')
# Pair each bar with its hatch pattern; the loop body must be indented.
for bar, pattern in zip(bars, patterns):
    bar.set_hatch(pattern)
plt.show()
4、盒图
4.1 pycharm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#经常会用到 箱线图
#关心的是 中位数在哪里 均值在哪里
# tang_data = [np.random.normal(0,std,100) for std in range(1,4)]
# fig = plt.figure(figsize = (8,6))
# #第一个参数数据 vert是横竖显示
# bplot = plt.boxplot(tang_data,notch=False,sym='rs',vert=True,patch_artist=True)
# #对x轴进行操作
# plt.xticks([y+1 for y in range(len(tang_data))],['x1','x2','x3'])
# plt.xlabel('x')
# plt.title('box plot')
#
# plt.rcParams['font.sans-serif']=['SimHei'] #用来正常显示中文标签
# plt.rcParams['axes.unicode_minus']=False #纵坐标乱码更新
# plt.title('箱线图')
#
# colors = ['pink','lightblue','lightgreen']
# for pathch,color in zip(bplot['boxes'],colors):
# pathch.set_facecolor(color)
#
# plt.show()
# Violin plot and box plot of the same samples, side by side.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
# Four groups of 100 normal draws with mean 0; the second argument of
# np.random.normal is the standard deviation (here 6, 7, 8, 9), not the
# variance.
tang_data = [np.random.normal(0, std, 100) for std in range(6, 10)]
axes[0].violinplot(tang_data, showmeans=False, showmedians=True)
axes[0].set_title('violin plot')
axes[1].boxplot(tang_data)
axes[1].set_title('box plot')
# Add horizontal grid lines to both panels.
for ax in axes:
    ax.yaxis.grid(True)
# Put one tick per group at positions 1..4 on both panels and label them.
tick_positions = [i + 1 for i in range(len(tang_data))]
plt.setp(axes, xticks=tick_positions, xticklabels=['x1', 'x2', 'x3', 'x4'])
plt.show()
5、直方图与散点图
5.1 pycharm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
#第一个图直方图
# data = np.random.normal(0,20,1000)
# bins = np.arange(-100,100,5)
#
# plt.hist(data,bins=bins)
# plt.xlim([min(data)-5,max(data)+5])
# plt.show()
#第二个图直方图
# data1 = [random.gauss(15,10) for i in range(500)]
# data2 = [random.gauss(5,5) for i in range(500)]
# bins = np.arange(-50,50,2.5)
#
# plt.hist(data1,bins=bins,label='class 1',alpha = 0.3)
# plt.hist(data2,bins=bins,label='class 2',alpha = 0.3)
# plt.legend(loc='best')
# plt.show()
#第一个散点图
# mu_vec1 = np.array([0,0])
# cov_mat1 = np.array([[2,0],[0,2]])
#
# x1_samples = np.random.multivariate_normal(mu_vec1, cov_mat1, 100)
# x2_samples = np.random.multivariate_normal(mu_vec1+0.2, cov_mat1+0.2, 100)
# x3_samples = np.random.multivariate_normal(mu_vec1+0.4, cov_mat1+0.4, 100)
#
# plt.figure(figsize = (8,6))
# plt.scatter(x1_samples[:,0],x1_samples[:,1],marker ='x',color='blue',alpha=0.6,label='x1')
# plt.scatter(x2_samples[:,0],x2_samples[:,1],marker ='o',color='red',alpha=0.6,label='x2')
# plt.scatter(x3_samples[:,0],x3_samples[:,1],marker ='^',color='green',alpha=0.6,label='x3')
# plt.legend(loc='best')
# plt.show()
#第二个散点图
# x_coords = [0.13, 0.22, 0.39, 0.59, 0.68, 0.74, 0.93]
# y_coords = [0.75, 0.34, 0.44, 0.52, 0.80, 0.25, 0.55]
#
# plt.figure(figsize = (8,6))
# plt.scatter(x_coords,y_coords,marker='s',s=50)
#
# for x,y in zip(x_coords,y_coords):
# plt.annotate('(%s,%s)'%(x,y),xy=(x,y),xytext=(0,-15),textcoords = 'offset points',ha='center')
# plt.show()
# Scatter plot 3: 500 points from a 2-D normal with zero mean and identity
# covariance, each marker sized by its squared distance from the origin.
mu_vec1 = np.zeros(2, dtype=int)
cov_mat1 = np.eye(2, dtype=int)
X = np.random.multivariate_normal(mu_vec1, cov_mat1, 500)

# Per-point x**2 + y**2, used below to scale the marker area.
R = np.square(X)
R_sum = np.sum(R, axis=1)

fig = plt.figure(figsize=(8, 6))
x_coord, y_coord = X[:, 0], X[:, 1]
plt.scatter(x_coord, y_coord, color='grey', marker='o', s=20 * R_sum, alpha=0.5)
plt.show()
6、3D图
6.1 pycharm
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
#
# fig = plt.figure()
# ax = Axes3D(fig)
#
# x = np.arange(-4,4,0.25)
# y = np.arange(-4,4,0.25)
#
# X,Y = np.meshgrid(x,y)
#
# Z = np.sin(np.sqrt(X**2+Y**2))
# ax.plot_surface(X,Y,Z,rstride = 1,cstride = 1,cmap='rainbow')
# ax.contour(X,Y,Z,zdim='z',offset = -2 ,cmap='rainbow')
#
# ax.set_zlim(-2,2)
# plt.show()
#第二个3D
# np.random.seed(1)
# def randrange(n,vmin,vmax):
# return (vmax-vmin)*np.random.rand(n)+vmin
#
#
# fig = plt.figure()
# ax = fig.add_subplot(111,projection = '3d')
# n = 100
# for c,m,zlow,zhigh in [('r','o',-50,-25),('b','x','-30','-5')]:
# xs = randrange(n,23,32)
# ys = randrange(n,0,100)
# zs = randrange(n,int(zlow),int(zhigh))
# ax.scatter(xs,ys,zs,c=c,marker=m)
# plt.show()
# 3D bar chart: four coloured series of 20 random-height bars, each series
# drawn with zdir='y' at its own zs offset (30, 20, 10, 0).
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
# The bar positions are the same for every series, so build them once.
xs = np.arange(20)
for c, z in zip(['r', 'g', 'b', 'y'], [30, 20, 10, 0]):
    ys = np.random.rand(20)
    # One colour entry per bar in this series.
    cs = [c] * len(xs)
    ax.bar(xs, ys, zs=z, zdir='y', color=cs, alpha=0.5)
plt.show()
7、pie图
7.1 pycharm
import matplotlib.pyplot as plt
import numpy as np
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
#自定义图案标记
from matplotlib.patches import Circle, Wedge, Polygon, Ellipse
from matplotlib.collections import PatchCollection
# Pie chart of the male/female shares of two counts.
m = 51212.
f = 40742.
m_perc = m / (m + f)
f_perc = f / (m + f)
colors = ['navy', 'lightcoral']
labels = ["Male", "Female"]
plt.figure(figsize=(8, 8))
# With autopct set, plt.pie returns the wedge patches, the label texts and
# the percentage texts; the second slice is pulled out slightly (explode).
wedges, texts, autotexts = plt.pie(
    [m_perc, f_perc],
    labels=labels,
    autopct='%1.1f%%',
    explode=[0, 0.05],
    colors=colors,
)
# Enlarge every text, then make the percentage texts white.
for text in texts + autotexts:
    text.set_fontsize(20)
for text in autotexts:
    text.set_color('white')
plt.show()
# Subplot layout: five axes of different spans placed on a shared 3x3 grid.
grid_shape = (3, 3)
ax1 = plt.subplot2grid(shape=grid_shape, loc=(0, 0))
ax2 = plt.subplot2grid(shape=grid_shape, loc=(1, 0))
ax3 = plt.subplot2grid(shape=grid_shape, loc=(0, 2), rowspan=3)
ax4 = plt.subplot2grid(shape=grid_shape, loc=(2, 0), colspan=2)
ax5 = plt.subplot2grid(shape=grid_shape, loc=(0, 1), rowspan=2)
#嵌套图
# x = np.linspace(0,10,1000)
# y2 = np.sin(x**2)
# y1 = x**2
#
# fig,ax1 = plt.subplots()
#
# left,bottom,width,height = [0.22,0.45,0.3,0.35]
# ax2 = fig.add_axes([left,bottom,width,height])
#
# ax1.plot(x,y1)
# ax2.plot(x,y2)
# plt.show()
# def autolabel(rects):
# for rect in rects:
# height = rect.get_height()
# ax1.text(rect.get_x() + rect.get_width() / 2., 1.02 * height,
# "{:,}".format(float(height)),
# ha='center', va='bottom', fontsize=18)
#
#
# top10_arrivals_countries = ['CANADA', 'MEXICO', 'UNITED\nKINGDOM', \
# 'JAPAN', 'CHINA', 'GERMANY', 'SOUTH\nKOREA', \
# 'FRANCE', 'BRAZIL', 'AUSTRALIA']
# top10_arrivals_values = [16.625687, 15.378026, 3.934508, 2.999718, \
# 2.618737, 1.769498, 1.628563, 1.419409, \
# 1.393710, 1.136974]
# arrivals_countries = ['WESTERN\nEUROPE', 'ASIA', 'SOUTH\nAMERICA', \
# 'OCEANIA', 'CARIBBEAN', 'MIDDLE\nEAST', \
# 'CENTRAL\nAMERICA', 'EASTERN\nEUROPE', 'AFRICA']
# arrivals_percent = [36.9, 30.4, 13.8, 4.4, 4.0, 3.6, 2.9, 2.6, 1.5]
#
# fig, ax1 = plt.subplots(figsize=(20, 12))
# tang = ax1.bar(range(10), top10_arrivals_values, color='blue')
# plt.xticks(range(10), top10_arrivals_countries, fontsize=18)
# ax2 = inset_axes(ax1, width=6, height=6, loc=5)
# explode = (0.08, 0.08, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05)
# patches, texts, autotexts = ax2.pie(arrivals_percent, labels=arrivals_countries, autopct='%1.1f%%', explode=explode)
#
# for text in texts + autotexts:
# text.set_fontsize(16)
# for spine in ax1.spines.values():
# spine.set_visible(False)
#
# autolabel(tang)
# plt.show()
# Draw a cat out of matplotlib patches (wedges, circles, polygons, ellipse).
fig, ax = plt.subplots()
patches = []

# Wedge((x, y), r, deg1, deg2) draws a full sector; passing width draws a
# ring sector instead.
leftstripe = Wedge((.46, .5), .15, 90, 100)
midstripe = Wedge((.5, .5), .15, 85, 95)
rightstripe = Wedge((.54, .5), .15, 80, 90)
lefteye = Wedge((.36, .46), .06, 0, 360, width=0.03)
righteye = Wedge((.63, .46), .06, 0, 360, width=0.03)
nose = Wedge((.5, .32), .08, 75, 105, width=0.03)
mouthleft = Wedge((.44, .4), .08, 240, 320, width=0.01)
mouthright = Wedge((.56, .4), .08, 220, 300, width=0.01)
patches += [leftstripe, midstripe, rightstripe, lefteye, righteye,
            nose, mouthleft, mouthright]

# Circles for the irises, centred on the eye rings.
leftiris = Circle((.36, .46), 0.04)
rightiris = Circle((.63, .46), 0.04)
patches += [leftiris, rightiris]

# Polygons are built from their vertex coordinates. `closed` is passed by
# keyword because recent Matplotlib releases reject it as a positional
# argument.
leftear = Polygon([[.2, .6], [.3, .8], [.4, .64]], closed=True)
rightear = Polygon([[.6, .64], [.7, .8], [.8, .6]], closed=True)
topleftwhisker = Polygon([[.01, .4], [.18, .38], [.17, .42]], closed=True)
bottomleftwhisker = Polygon([[.01, .3], [.18, .32], [.2, .28]], closed=True)
toprightwhisker = Polygon([[.99, .41], [.82, .39], [.82, .43]], closed=True)
bottomrightwhisker = Polygon([[.99, .31], [.82, .33], [.81, .29]], closed=True)
patches += [leftear, rightear, topleftwhisker, bottomleftwhisker,
            toprightwhisker, bottomrightwhisker]

# Ellipse((x, y), width, height) for the body.
body = Ellipse((0.5, -0.18), 0.6, 0.8)
patches.append(body)

# Give every patch a random scalar; the collection maps it to a colour.
colors = 100 * np.random.rand(len(patches))
p = PatchCollection(patches, alpha=0.4)
p.set_array(colors)
ax.add_collection(p)

# Show the figure
plt.show()
8、Pandas与sklearn结合实例
8.1 pycharm
import pandas as pd
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from mpl_toolkits.mplot3d import Axes3D
# np.random.seed(0)
# df = pd.DataFrame({'Condition 1': np.random.rand(20),
# 'Condition 2': np.random.rand(20)*0.9,
# 'Condition 3': np.random.rand(20)*1.1})
# print(df.head())
#
# fig,ax = plt.subplots()
# #要不要进行堆叠
# df.plot.bar(ax=ax,stacked=True)
# plt.show()
#
#
# #占据百分比情况
#
# from matplotlib.ticker import FuncFormatter
#
# df_ratio = df.div(df.sum(axis=1),axis=0)
# fig,ax = plt.subplots()
# df_ratio.plot.bar(ax=ax,stacked=True)
# ax.yaxis.set_major_formatter(FuncFormatter(lambda y,_:'{:.0%}'.format(y)))
# plt.show()
# Load the data set from an absolute path; '?' entries are read as NaN.
path = r'G:\nodebookPython3\lesson\data_file\risk_factors_cervical_cancer.csv'
df = pd.read_csv(path, na_values='?')
# Widen pandas' display limits so printed frames show all columns.
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 200)
print(df.head())
# DataFrame.info() prints its report itself and returns None, so wrapping
# it in print() would only add a stray "None" line.
df.info()
# Fill the missing values (SimpleImputer's default strategy is the column
# mean) and rebuild a DataFrame carrying the original column names and index.
impute = pd.DataFrame(
    SimpleImputer().fit_transform(df),
    columns=df.columns,
    index=df.index,
)
print(impute.head())
# Split the imputed table into the feature columns and the 'Dx:Cancer'
# column, which is used only to colour the points.
features = impute.drop('Dx:Cancer', axis=1)
y = impute["Dx:Cancer"]
# Project the features onto three principal components.
pca = PCA(n_components=3)
X_r = pca.fit_transform(features)
# Report the share of variance explained by each of the three principal
# components (these are components, not original features).
print("Explained variance:\nPC1 {:.2%}\nPC2 {:.2%}\nPC3 {:.2%}"
      .format(pca.explained_variance_ratio_[0],
              pca.explained_variance_ratio_[1],
              pca.explained_variance_ratio_[2]))
fig = plt.figure()
# Create the 3D axes through the figure so they are actually attached to
# it; a bare Axes3D(fig) is not added to the figure by recent Matplotlib
# releases and leaves the window blank.
ax = fig.add_subplot(111, projection='3d')
ax.scatter(X_r[:, 0], X_r[:, 1], X_r[:, 2], c=y, cmap=plt.cm.coolwarm)
# Label the axes
ax.set_xlabel('PC1')
ax.set_ylabel('PC2')
ax.set_zlabel('PC3')
plt.show()