title: Numpy 回顾及应用
category: 数据分析基础
Numpy的应用
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['STFangsong']
plt.rcParams['axes.unicode_minus'] = False
%config InlineBackend.figure_format = 'svg'
创建数组
# 方式1:np.array() 通过array函数将list转换成ndarray
array1 = np.array([1,2,3,4,5],dtype = 'i8')
array1
array([1, 2, 3, 4, 5], dtype=int64)
# 数组类型
type(array1)
numpy.ndarray
# 数组元素个数
array1.size
5
# 单个数组元素所占内存大小(字节)
array1.itemsize
8
# 数组全部元素所占内存空间(字节),即 size * itemsize
array1.nbytes
40
# 数组维度
array1.ndim
1
# 通过arange函数指定取值范围创建ndarray
array2 = np.arange(1,100,2)
array2
array([ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33,
35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67,
69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99])
# 通过linspace(下限,上限,元素个数)函数构造等差数列
array3 = np.linspace(-5,5,101)
array3
array([-5. , -4.9, -4.8, -4.7, -4.6, -4.5, -4.4, -4.3, -4.2, -4.1, -4. ,
-3.9, -3.8, -3.7, -3.6, -3.5, -3.4, -3.3, -3.2, -3.1, -3. , -2.9,
-2.8, -2.7, -2.6, -2.5, -2.4, -2.3, -2.2, -2.1, -2. , -1.9, -1.8,
-1.7, -1.6, -1.5, -1.4, -1.3, -1.2, -1.1, -1. , -0.9, -0.8, -0.7,
-0.6, -0.5, -0.4, -0.3, -0.2, -0.1, 0. , 0.1, 0.2, 0.3, 0.4,
0.5, 0.6, 0.7, 0.8, 0.9, 1. , 1.1, 1.2, 1.3, 1.4, 1.5,
1.6, 1.7, 1.8, 1.9, 2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6,
2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7,
3.8, 3.9, 4. , 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8,
4.9, 5. ])
%timeit list(range(1000000))
27.5 ms ± 935 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit np.arange(1000000)
1.74 ms ± 21 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
# 通过随机的方式创建ndarray对象
array4 = np.random.randint(60,101,15)
array5 = np.random.random(10)
array4,array5
(array([67, 85, 65, 88, 66, 64, 99, 90, 74, 83, 99, 74, 71, 77, 94]),
array([0.22794581, 0.3156536 , 0.06871486, 0.34238374, 0.20595198,
0.73242699, 0.55180204, 0.06225053, 0.94129138, 0.73184603]))
# 创建二维数组
array6 = np.array([[1,2,3],[2,3,4],[3,4,5]])
array6
array([[1, 2, 3],
[2, 3, 4],
[3, 4, 5]])
array6.ndim
2
array6.shape
(3, 3)
# 随机创建二维数组
array7 = np.random.randint(60,101,(5,3))
array7
array([[72, 98, 94],
[96, 90, 92],
[81, 60, 69],
[84, 74, 81],
[74, 81, 92]])
# 0数组
array8 = np.zeros((5,5),dtype = 'i8')
array8
array([[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0],
[0, 0, 0, 0, 0]], dtype=int64)
# 1数组
array9 = np.ones((5,5),dtype = 'i8')
array9
array([[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1],
[1, 1, 1, 1, 1]], dtype=int64)
# np.full(数组形状,填充值)
np.full((5,5),100)
array([[100, 100, 100, 100, 100],
[100, 100, 100, 100, 100],
[100, 100, 100, 100, 100],
[100, 100, 100, 100, 100],
[100, 100, 100, 100, 100]])
# 单位矩阵
array10 = np.eye(10,dtype = 'i8')
array10
array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1]], dtype=int64)
# 一维变二维 reshape
array11 = array4.reshape((5,3))
array11
array([[67, 85, 65],
[88, 66, 64],
[99, 90, 74],
[83, 99, 74],
[71, 77, 94]])
# 扁平化 二维变一维
array12 = array11.flatten()
array12
array([67, 85, 65, 88, 66, 64, 99, 90, 74, 83, 99, 74, 71, 77, 94])
type(array12.tolist())
list
数组索引
# 花式索引 同索引可取多次
array12[[1,2,1,3]]
array([85, 65, 85, 88])
# 布尔索引
array12[array12 >=80]
array([85, 88, 99, 90, 83, 99, 94])
array12[array12 % 2 != 0]
array([67, 85, 65, 99, 83, 99, 71, 77])
# 布尔索引:多个条件用 &(与)组合,每个条件需加括号
array12[(array12>=80) & (array12 % 2 != 0)]
array([85, 99, 83, 99])
# 布尔索引:多个条件用 |(或)组合,每个条件需加括号
array12[(array12>=80) | (array12 % 2 != 0)]
array([67, 85, 65, 88, 99, 90, 83, 99, 71, 77, 94])
# 切片
array12[3:8],array12[3:8:2],array12[::-1]
(array([88, 66, 64, 99, 90]),
array([88, 64, 90]),
array([94, 77, 71, 74, 99, 83, 74, 90, 99, 64, 66, 88, 65, 85, 67]))
二维数组索引&切片
# 二维数组索引
array6[2,1]
4
# 二维数组取多个元素 - 二维数组花式索引 array[[0轴],[1轴]]
array6[[0,0,2,2],[0,2,0,2]]
array([1, 3, 3, 5])
# 二维数组布尔索引
array6[(array6 > 2) & (array6 % 2 == 0)]
array([4, 4])
array6
array([[1, 2, 3],
[2, 3, 4],
[3, 4, 5]])
# 二维数组切片,先切0轴(行)再切1轴(列),逗号隔开
array6[1:,2:]
array([[4],
[5]])
# 左上
array6[:2,:2]
array([[1, 2],
[2, 3]])
# 右下
array6[1:,1:]
array([[3, 4],
[4, 5]])
array6[::2,::2]
array([[1, 3],
[3, 5]])
三维数组(图片)
# 图片
array13 = plt.imread('URL')
array13.shape # (1006, 1440, 3) - 高,宽,三原色
(1006, 1440, 3)
# 水平翻转(以下切片均以自己导入图片尺寸为准)
plt.imshow(array13[:,::-1,:])
# 垂直翻转
plt.imshow(array13[::-1,:,:])
# 抠图
plt.imshow(array13[:600,400:1100,:])
# 三原色翻转
plt.imshow(array13[:,:,::-1])
获取描述性统计信息
- 均值(mean)
$$\bar{x} = \frac{1}{N}\sum_{i=1}^{N}{x_i}$$
- 中位数(median)
将数据按照升序或降序排列后位于中间的。
- 众数
数据集合中出现次数最多的数据,代表数据的一般水平。
- 最大值/最小值(max/min)
- 极差(ptp)
- 方差/标准差(var/std)
总体方差:
$$\sigma^2 = \frac{1}{N}\sum_{i=1}^{N}(x_i - \mu)^2$$
总体标准差:
$$\sigma = \sqrt{\frac{1}{N}\sum_{i=1}^{N}(x_i - \mu)^2}$$
样本方差:
$$S^2 = \frac{1}{N-1}\sum_{i=1}^{N}(x_i - \bar{x})^2$$
样本标准差:
$$S = \sqrt{\frac{1}{N-1}\sum_{i=1}^{N}(x_i - \bar{x})^2}$$
- 四分位距离(IQR)
$$IQR = Q_3 - Q_1$$
array14 = np.random.randint(20,51,10)
array14
array([43, 23, 31, 27, 47, 44, 29, 30, 49, 38])
# 均值
array14.mean()
36.1
# 排序
np.sort(array14)
array([23, 27, 29, 30, 31, 38, 43, 44, 47, 49])
# 中位数
np.median(array14)
34.5
# 极差 np.max()-np.min() / np.ptp
array14.max() - array14.min(),np.ptp(array14)
(26, 26)
# 标准差
np.std(array14)
8.757282683572571
# 方差
np.var(array14)
76.69000000000001
# 四分位数
# 下四分位数
q1 = np.quantile(array14,0.25)
# 中位数 - 百分之50分位数
q2 = np.quantile(array14,0.5)
# 上四分位数
q3 = np.quantile(array14,0.75)
q1,q2,q3
(29.25, 34.5, 43.75)
# 四分位距离
IQR = q3 - q1
IQR
# 极端值判断
# >Q3 + 1.5*IQR or < Q1 - 1.5*IQR
14.5
数组的其他方法
array14
array([43, 23, 31, 27, 47, 44, 29, 30, 49, 38])
# 累计和
array14.cumsum()
array([ 43, 66, 97, 124, 171, 215, 244, 274, 323, 361], dtype=int32)
# 最大/最小值索引
np.argmax(array14),np.argmin(array14)
(8, 1)
# 保存数组到文件中
# 将数组对象以pickle协议进行序列化(把对象变成了bytes最后写到文件中)
# 存
# array14.dump('a')
# 读
# array15 = np.load('a',allow_pickle=True)
# array14,array15
array16 = np.array([0,1,0,1,0,0,2,5,0])
# 非零元素下标
array16.nonzero()
(array([1, 3, 6, 7], dtype=int64),)
# 将nonzero()作为花式索引获取非零元素
array16[array16.nonzero()]
array([1, 1, 2, 5])
# 交换指定的轴
plt.imshow(array13.swapaxes(0,1)) # 将0,1轴互换
# 转置
array11.transpose()
array([[67, 88, 99, 83, 71],
[85, 66, 90, 99, 77],
[65, 64, 74, 74, 94]])
arraya = np.array([42,45,62,56,35,79,67,74,30,28,54])
arrayb = np.array([65,36,123,25,45,32,26,78,57,51,34])
arrayc = np.array([82,36,21,23,25,43,52,59,60,76,95])
suma = np.sum(arraya)
sumb = np.sum(arrayb)
sumc = np.sum(arrayc)
suma,sumb,sumc
(572, 572, 572)
avga = np.mean(arraya)
avgb = np.mean(arrayb)
avgc = np.mean(arrayc)
avga,avgb,avgc
(52.0, 52.0, 52.0)
stda = np.std(arraya)
stdb = np.std(arrayb)
stdc = np.std(arrayc)
np.round(stda,2),np.round(stdb,2),np.round(stdc,2)
(16.67, 27.55, 23.94)
media = np.median(arraya)
medib = np.median(arrayb)
medic = np.median(arrayc)
media,medib,medic
(54.0, 45.0, 52.0)
np.var(arraya)
277.8181818181818
数组与数组之间的运算
# +、-、*、/ 、** 指定位置元素做运算
arraya + arrayb
array([107, 81, 185, 81, 80, 111, 93, 152, 87, 79, 88])
array11.transpose()
array([[67, 88, 99, 83, 71],
[85, 66, 90, 99, 77],
[65, 64, 74, 74, 94]])
# 广播机制
arrayd = np.array([5,5,5,5,5])
array11.transpose() + arrayd
array([[ 72, 93, 104, 88, 76],
[ 90, 71, 95, 104, 82],
[ 70, 69, 79, 79, 99]])
arraye = np.array([[5],[5],[5]])
arraye + array11.transpose()
array([[ 72, 93, 104, 88, 76],
[ 90, 71, 95, 104, 82],
[ 70, 69, 79, 79, 99]])
当两个数组形状不一致时,如果两个数组的后缘维度(shape属性从后往前看)相同或者其中一个的后缘维度为1,那么这个时候可以通过广播机制让两个数组的形状趋于一致,这种情况是可以进行运算的;如果不能应用广播机制,那么两个数组没有办法进行运算。
arrayn = np.array([1,2,np.nan,3,4,np.nan,5,np.inf])
arrayn
array([ 1., 2., nan, 3., 4., nan, 5., inf])
# 取反 True -> False
arrayn[~np.isnan(arrayn)]
array([ 1., 2., 3., 4., 5., inf])
np.isinf(arrayn)
array([False, False, False, False, False, False, False, True])
# 双曲余弦函数 cosh()
x = np.linspace(-4*np.pi, 4*np.pi, 1000)
plt.plot(x, np.cosh(x))
plt.show()
函数
arr1 = np.array(np.random.randint(1,10,(4,4)))
arr2 = np.array(np.random.randint(1,10,(4,4)))
arr1,arr2
(array([[4, 2, 2, 9],
[4, 1, 8, 8],
[8, 9, 4, 6],
[9, 7, 9, 7]]),
array([[8, 3, 5, 3],
[6, 7, 2, 8],
[1, 2, 3, 3],
[3, 8, 8, 7]]))
# 水平分割数组
np.hsplit(arr1,4)
# 垂直分割
np.vsplit(arr1,4)
[array([[4, 2, 2, 9]]),
array([[4, 1, 8, 8]]),
array([[8, 9, 4, 6]]),
array([[9, 7, 9, 7]])]
# 水平堆叠(垂直堆叠:vstack())
np.hstack((arr1,arr2))
array([[4, 2, 2, 9, 8, 3, 5, 3],
[4, 1, 8, 8, 6, 7, 2, 8],
[8, 9, 4, 6, 1, 2, 3, 3],
[9, 7, 9, 7, 3, 8, 8, 7]])
# 按条件选取元素:np.select(条件列表, 取值列表),所有条件都不满足的位置取默认值0
np.select([(arr1<3),(arr1>5)],[arr1,arr1 ** 2])
array([[ 0, 2, 2, 81],
[ 0, 1, 64, 64],
[64, 81, 0, 36],
[81, 49, 81, 49]])
np.where(arr1<5,arr1,arr1 ** 2)
array([[ 4, 2, 2, 81],
[ 4, 1, 64, 64],
[64, 81, 4, 36],
[81, 49, 81, 49]])
np.extract(arr1 % 2 == 0,arr1)
array([4, 2, 2, 4, 8, 8, 8, 4, 6])
# 沿指定的轴翻转数组元素
# 按0轴翻转(行)
np.flip(arr1,axis = 0)
array([[9, 7, 9, 7],
[8, 9, 4, 6],
[4, 1, 8, 8],
[4, 2, 2, 9]])
# 按1轴翻转(列)
np.flip(arr1,axis = 1)
array([[9, 2, 2, 4],
[8, 8, 1, 4],
[6, 4, 9, 8],
[7, 9, 7, 9]])
# 默认(不指定axis)时,0轴和1轴均翻转
np.flip(arr1)
array([[7, 9, 7, 9],
[6, 4, 9, 8],
[8, 8, 1, 4],
[9, 2, 2, 4]])
# 重复数组
arr3 = np.repeat([3,5],10)
arr3
array([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5])
# 按指定轴重复
arr4 = np.array([[1,3],[2,4]])
np.repeat(arr4,2,axis = 1)
array([[1, 1, 3, 3],
[2, 2, 4, 4]])
# 指定重复次数
np.repeat(arr4,[2,3],axis = 1)
array([[1, 1, 3, 3, 3],
[2, 2, 4, 4, 4]])
# 滚动数组
np.roll(arr3,5)
array([5, 5, 5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 5, 5, 5, 5])
# 改变数组形状 可改变大小
np.resize(arr1,(5,5))
array([[4, 2, 2, 9, 4],
[1, 8, 8, 8, 9],
[4, 6, 9, 7, 9],
[7, 4, 2, 2, 9],
[4, 1, 8, 8, 8]])
np.place(arr1,arr1<3,999)
arr1
array([[ 4, 999, 999, 9],
[ 4, 999, 8, 8],
[ 8, 9, 4, 6],
[ 9, 7, 9, 7]])
arr5 = arraya + arrayb
# 按第k个位置划分数组:该位置之前的元素都不大于它,之后的元素都不小于它,但两侧各自仍然无序
np.partition(arr5,7)
array([ 87, 88, 79, 81, 80, 81, 93, 107, 152, 185, 111])
# 矩阵的转置
arr2.T
array([[8, 6, 1, 3],
[3, 7, 2, 8],
[5, 2, 3, 8],
[3, 8, 3, 7]])
# 矩阵的乘法(以二维数组表示矩阵)
arr2 @ arr1
array([[ 111, 11055, 8063, 147],
[ 140, 13061, 6130, 178],
[ 63, 3045, 1054, 64],
[ 171, 11110, 3156, 188]])
# 逆矩阵
np.linalg.inv(arr2)
array([[ 0.13184584, 0.03448276, -0.09127789, -0.05679513],
[-0.04462475, 0.03448276, -0.73833671, 0.29614604],
[ 0.06085193, -0.13793103, 0.18864097, 0.05070994],
[-0.07505071, 0.10344828, 0.6673428 , -0.22920892]])
# 创建矩阵
mat1 = np.matrix('1,2,3;4,5,6')
mat1
matrix([[1, 2, 3],
[4, 5, 6]])
mat2 = np.asmatrix(np.array([[1,4],[2,5],[3,6]]))
mat2
matrix([[1, 4],
[2, 5],
[3, 6]])
# 矩阵相乘
mat3 = mat1 * mat2
mat3
matrix([[14, 32],
[32, 77]])
# 逆矩阵
mat3 ** -1
matrix([[ 1.42592593, -0.59259259],
[-0.59259259, 0.25925926]])
矩阵计算
mat3 = np.arange(1,10).reshape((3,3))
# 计算行列式
np.linalg.det(mat3)
0.0
# 矩阵的秩
np.linalg.matrix_rank(mat3)
2
# 逆矩阵(奇异矩阵没有逆矩阵(不满秩))
# np.linalg.inv(mat3)
# 计算最小二乘解
x1 = np.array([0, 1, 2, 3])
y1 = np.array([-1, 0.2, 0.9, 2.1])
A = np.vstack([x1, np.ones(len(x1))]).T
m, c = np.linalg.lstsq(A, y1, rcond=None)[0]
m,c
(0.9999999999999999, -0.9499999999999997)
# 求解线性方程组 ax = b
a = np.array([[3,1], [1,2]])
b = np.array([9,8])
x = np.linalg.solve(a, b)
x
array([2., 3.])