NumPy,Pandas,Matplotlib随记

再睡灬五分钟

已于 2022-07-06 09:50:11 修改

阅读量1.5k

点赞数 1

文章标签： python 数据分析 pandas numpy matplotlib

于 2021-11-10 11:30:45 首次发布

本文链接：https://blog.csdn.net/qq_40696594/article/details/121244960

版权

Numpy

axis=0表示跨行，axis=1表示跨列
numpy—>ravel()函数获取一个展平数组

np.apply_along_axis() 函数，将函数应用于数组指定轴

np.random.seed(100)
a = np.random.randint(1,10, [5,3])
a
array([[9, 9, 4],
       [8, 8, 1],
       [5, 3, 6],
       [3, 3, 3],
       [2, 1, 9]])
np.apply_along_axis(lambda x: np.min(x)/np.max(x), arr=a, axis=1)
array([0.44444444, 0.125     , 0.5       , 1.        , 0.11111111])

numpy-- 连续使用两次argsort()函数可得到数组中每个元素的排名（即该元素在升序排序后所处的位置索引）；

a=np.array([ 9,  4, 15,  0, 17, 16, 17,  8,  9,  0])  
a.argsort().argsort()
array([4, 2, 6, 0, 8, 7, 9, 3, 5, 1], dtype=int64)

花式索引根据索引整型数组的值作为目标数组的某个轴的下标来取值,所以需要这些整数数组的元素个数要相等，这样才能够将整数数组映射成下标

arr2d = np.array([[1,3,5,7,9],[2,4,6,8,10],[12,18,22,23,37],
[123,55,17,88,103]])
arr2d
array([[  1,   3,   5,   7,   9],
   [  2,   4,   6,   8,  10],
   [ 12,  18,  22,  23,  37],
   [123,  55,  17,  88, 103]])
-- -----------------------------------------------
arr2d[[1,3,3,3],[1,1,1,1]]  -- >array([ 4, 55, 55, 55])
arr2d[[1,3,3,3],[1,1,1]] 
-->shape mismatch: indexing arrays could not be broadcast together with shapes (4,) (3,)

使用np.lexsort()函数根据2列或更多列对 numpy 数组进行排序(将第一需求列放在右侧)

	np.random.seed(100)
	sort_tem=np.random.randint(1,20,size=(8,4))
	sort_tem
	array([[ 9,  4,  8, 16],
       [17, 11,  3,  3],
       [ 3, 15,  3, 18],
       [17, 16,  5, 12],
       [17, 10,  3, 13],
       [ 5,  2, 14,  5],
       [ 5,  4,  8, 18],
       [16,  2, 15,  8]])
    sort_index=np.lexsort((sort_tem[:,1],sort_tem[:,0]))
    sort_tem[sort_index]
    array([[ 3, 15,  3, 18],
       [ 5,  2, 14,  5],
       [ 5,  4,  8, 18],
       [ 9,  4,  8, 16],
       [16,  2, 15,  8],
       [17, 10,  3, 13],
       [17, 11,  3,  3],
       [17, 16,  5, 12]])

np.take()函数，沿指定轴从array中获取元素

pd.Series(np.take(list('abc'),np.random.randint(3,size=10)))
0    b
1    b
2    c
3    b
4    c
5    a
6    a
7    c
8    c
9    a

Pandas

Numpy和Pandas的区别
Pandas模块主要处理表格数据, 而NumPy模块处理数字数据。
Pandas提供了一些强大的工具集, 例如DataFrame和Series, 主要用于分析数据, 而NumPy模块提供了一个强大的对象, 称为Array。

pandas删除、修改数据库表中数据时会报错"ResourceClosedError"（语句不返回结果集），此时可进行异常捕获

from sqlalchemy import create_engine  # 数据库引擎，构建和数据库的连接
from sqlalchemy import exc    # 用于捕捉sqlalchemy的异常

engine = create_engine('mysql+pymysql://root:root@localhost/lgtest?charset=utf8')
try:
	pd.read_sql_query('delete from salary where 计算机=85',con=engine)
except exc.ResourceClosedError:
	print("捕获ResourceClosedError异常，删除成功")
---------------------------------------------------------------------------------------
# 若不想使用异常捕获可追加chunksize=100
 pd.read_sql_query('delete from salary where 计算机=81',con=engine,chunksize=100)

Pandas删除重复列

df=pd.DataFrame(data={'color':['red','blue','red','green','blue',None,'red'],
                 	  'price':[10,20,10,15,20,0,np.NaN]})
df=pd.concat([df,df],axis=1)
# 删除重复列
df.loc[:,~df.columns.duplicated()]
# 删除重复列
df.T.drop_duplicates().T

Series.str.extract(pat, flags=0, expand=True)：将 regex pat 中的捕获组提取为 DataFrame 中的列
返回一个DataFrame,每个主题字符串为一行，每个组为一列

Pandas将Series转化成DataFrame，使用to_frame()函数

type(ser)   -- pandas.core.series.Series
type(ser.to_frame())  -- pandas.core.frame.DataFrame

判断数据在Series A 中而不在Series B中，使用isin()方法（isin()返回是否存在于B中，取反后如 ser1[~ser1.isin(ser2)] 即得到只在A中的数据）

ser1 = pd.Series([1, 2, 3, 4, 5])
ser2 = pd.Series([4, 5, 6, 7, 8])
ser1.isin(ser2)
0    False
1    False
2    False
3     True
4     True
dtype: bool

Pandas 根据两列数据排序

df=pd.DataFrame(data=np.random.randint(0,20,size=(10,2)),
             index=list('ABCDEFGHIJ'),
             columns=['Python','Chinese'])

df.sort_values(by=['Python','Chinese'],ascending=[False,True])
Python	Chinese
F	17	17
I	16	12
C	12	12
G	12	16
J	10	4
H	9	8
D	7	8
B	7	16
A	3	13
E	1	1

Matplotlib

matplotlib知识点
图形样式控制

   plt.plot(x,y)	# 绘制线
   plt.grid(ls='--',c='green',alpha=0.5)		# 网格线		
   plt.xlim([-1,10])   # 坐标轴范围
   plt.ylim([-1.5,1.5])
   plt.yticks([-1,0,1],['min',0,'max'],fontsize=18)  # 坐标轴的刻度
   plt.ylabel('y=sin(x)',			# 坐标轴标签
          rotation=50,  			# 标签翻转角度
          fontsize=18,
          ha='right')				# 标签位置（居左居右等）
          
   plt.legend(['Sin','Cos'],        #图例设置
   			fontsize=18,
          		loc='center',   # 图例位置
          		ncol=2,   # 图例显示列数，默认为 1
         	    bbox_to_anchor=(0.5,1.1))  # 移动图例位置（此时表现为正上方居中）

   # 移动轴脊、spines 轴脊——记录数据区边界的线。
   axes=plt.gca()   #  get current axes的缩写 轴面，子视图
   axes.spines['top'].set_color('white')   
   axes.spines['right'].set_color('white')
   axes.spines['left'].set_position(('data',0))   # data 表示数据
   axes.spines['bottom'].set_position(('data',0))

   # 保存图片
   plt.savefig('./image.png',  
               dpi=100,    # 分辨率（每英寸点数），决定输出图片的像素尺寸
               facecolor='red',   # 轴面与大图片之间的颜色
               edgecolor='lightgreen', # 边界的颜色
               bbox_inches='tight') # 设置紧凑显示，保存整张图片
               
   -- 多图布局
   # 子视图，此处表现为 两行，上面一行两列，下面一行一列
   axes=plt.subplot(2,2,1)    # 子视图，轴面，左上角位置
   axes=plt.subplot(222)   # 数字连起来写
   axes=plt.subplot(212)
   
   # 均匀分布，3*3
   fig,((ax11,ax12,ax13),(ax21,ax22,ax23),(ax31,ax32,ax33))=plt.subplots(3,3)
   plt.tight_layout()  # 紧凑布局
   
   # 不均匀布局
   from matplotlib import gridspec
   gs=gridspec.GridSpec(3,3)  # 3行3列
   ax=plt.subplot(gs[0,:])  # 第一行全部
   ax=plt.subplot(gs[1,0:2]) # 第二行和第1 2 列
   ax=plt.subplot(gs[1:,-1])	# 第2行到最后一行的最后一列
   ax=plt.subplot(gs[-1,0])	# 最后一行，第1列
   ax=plt.subplot(gs[-1,1])	# 最后一行，第2列
   --
   # 双轴，两个子视图公用一个X轴，两个Y轴
   ax1=plt.subplot(111)
   ax2=ax1.twinx()

   # 图片中插入文本、标题
   plt.text(3,0.5,  # 此处的数字为X轴Y轴的位置
   		r'$exp(-x)*cos(2πx)$',fontdict={'color':'red','fontsize':'18'})
   # 标题		
   plt.title('Exp decay',fontdict={'color':'green','fontsize':'18','rotation':'30','alpha':0.5})	
   plt.suptitle('指数衰减',fontfamily='Kaiti',fontsize=20)

   # 注释（比如给图形最值添加注释）
   plt.annotate('max',  # 文本内容
            xy=(np.pi/2,1), # 文本指向的坐标点位置
            xytext=(3,1.5),  # 文本位置
            arrowprops={'width':2,'headwidth':6,'headlength':15,'shrink':0})   # 箭头样式

各类图形

   plt.plot()   #折线图
   plt.bar()	#柱状图
   # 极坐标折线图、柱状图
   ax = plt.subplot(111,projection = 'polar',facecolor = 'lightgreen') # 极坐标
   						或
   ax = plt.subplot(111,polar=True,facecolor = 'lightgreen') # 极坐标
   ax.plot(x,y,color = 'red') # 折线图，
   ax.bar(x,y,width = np.pi/4,color = np.random.rand(8,3)) # 柱状图
   
   # 直方图
   # n是直方图单元的值，bins表示每个宽度的范围
   x=np.random.randn(10000)
   n,bins, fig=plt.hist(x,
                bins=100, # bins如果是整数，则表示定义范围内等宽bin的数量
                color='red',
                density=True)  # 如果为真，此时返回的n是概率密度: density = counts/(sum(counts) * np.diff(bins))
                               # 例如第一个  3/(10000*(3.40454794-3.31973812))=0.00353733
                               # 默认为False,此时返回的n表示每个bin内数值的个数
    
    # 箱式图
    plt.boxplot(x,labels=labels,
         sym = 'y',  # 异常值颜色
         notch =True)  # 默认为false,绘制矩形方框图，为true是绘制有缺口的方框图（中位数位置）
     
     #散点图
     x=np.random.randn(100)
   	plt.scatter(x,x + np.random.randn(100)*0.2,
   	            color=np.random.rand(100,3),  # 颜色
   	            s=np.random.randint(100,300,size=100),  # 每个点的大小
   	            alpha=0.5)

   # 饼图
   	p=np.random.randint(10,100,size=5)
   	labels=['一星','二星','三星','四星','五星']
   	plt.figure(figsize=(9,9))
   	_=plt.pie(x=p,labels=labels,
                textprops={'family':'Kaiti','fontsize':18},  # 设置字体样式
                autopct='%0.2f%%',   # 显示百分比
                explode=[0,0,0,0,0.1], # 突出一部分
                shadow=True)   # 增加阴影

   # 嵌套饼图
   	plt.rcParams['font.family']='Kaiti'  # 全局配置
   	plt.rcParams['font.size']=18
   	plt.figure(figsize=(9,9))
   	
   	p1=np.random.randint(30,50,size=3)  # 外圈
   	_=plt.pie(x=p1,labels=['Cat','Dog','Bird'],
   	             autopct='%0.2f%%',
   	             pctdistance=0.85,
   	             radius=1,
   	              # 传递给pie的参数类型
   	             wedgeprops ={'linewidth':5,    # 线宽（饼图外围和切片之间的宽度）
   	                          'width':0.3,      # 饼图的宽度
   	                          'edgecolor':'white'})  # 线宽的颜色
   	p2=np.random.randint(10,80,size=6)
   	_=plt.pie(x=p2,
   	             autopct='%0.2f%%',
   	             radius=0.7,
   	             wedgeprops ={'linewidth':5,'width':0.7,'edgecolor':'white'})

   # 热力图，根据指定数据绘制图片
   	data=np.random.randn(7,7)*5
   	data=data.round(1)
   	plt.figure(figsize=(9,9))
   	plt.imshow(data,cmap=plt.cm.RdYlGn_r)    # 根据data数据绘制图片,设置图片颜色倾向 RdYlGn_r
   	# 将数字写入到每个数格中
   	for i in range(7):
   	    for j in range(7):
   	        plt.text(x=j,y=i,s=data[i,j],ha='center')

   # 面积图
   	plt.stackplot()
   	
   	days=np.arange(1,6)
   	working=np.array([8,9,7,8,11])
   	sleeping=np.array([6,7,5,8,7])
   	eating=np.array([3,2,3,1,3])
   	playing=np.array([7,6,9,7,3])
   	plt.figure(figsize=(9,6))
   	plt.stackplot(days,working,sleeping,eating,playing)
   
   #---- 蜘蛛图---------
   labels=np.array(['EQ','IQ','个人能力','团队意识','持续学习','解决问题能力'])
   plt.rcParams['font.family']='Kaiti'
   angles=np.arange(0,2*np.pi,np.pi/3)
   stats=np.random.randint(50,120,size=6)
   # 首尾相连，与原来相比多了一个数据
   angles=np.concatenate([angles,angles[[0]]])
   stats=np.concatenate([stats,stats[[0]]])
   # 绘图
   plt.figure(figsize=(9,9))
   axes=plt.subplot(111,polar=True)
   axes.plot(angles,stats,marker='o',lw=2)  # 绘制折线图
   axes.fill(angles,stats,alpha=0.2)   # 添加阴影
   # 添加标签
   _=axes.set_thetagrids(angles[:-1]*180/np.pi,labels=labels,fontsize=18)