1 numpy的优势
1.1 介绍
开源的python科学计算库,用于快速处理任意维度的数组
是一个快速而灵活的大数据容器
1.2 ndarray介绍
n维数组类型
- ndarray存储数据
import numpy as np
score = np.array([[80, 89, 86, 67, 79],
[78, 97, 89, 67, 81],
[90, 94, 78, 67, 74],
[91, 91, 90, 67, 69],
[76, 87, 75, 67, 86],
[70, 79, 84, 67, 84],
[94, 92, 93, 67, 64],
[86, 85, 83, 67, 80]])
score
array([[80, 89, 86, 67, 79],
[78, 97, 89, 67, 81],
[90, 94, 78, 67, 74],
[91, 91, 90, 67, 69],
[76, 87, 75, 67, 86],
[70, 79, 84, 67, 84],
[94, 92, 93, 67, 64],
[86, 85, 83, 67, 80]])
1.3 ndarray与python原生list运算效率对比
import random
import time
import numpy as np
a = []
for i in range(10000000):
a.append(random.random())
%time sum1 = sum(a)
b = np.array(a)
%time sum2 = np.sum(b)
CPU times: user 199 ms, sys: 396 ms, total: 595 ms
Wall time: 667 ms
CPU times: user 16.7 ms, sys: 736 µs, total: 17.4 ms
Wall time: 16.5 ms
1.4 ndarray的优势
- 内存块风格
ndarray在存储数据的时候,所有元素类型相同,数据存放在一块连续的内存中
而python原生list存储的是各元素对象的引用,只能通过寻址方式找到下一个元素
- ndarray支持并行化运算
- 底层使用C语言,内部解除了GIL
2 N维数组-ndarray
2.1 ndarray的属性
a = np.array([[1,2,3],[4,5,6]])
b = np.array([1,2,3,4])
c = np.array([[[1,2,3],[4,5,6]],[[1,2,3],[4,5,6]]])
- 数组的形状(各维度大小组成的元组)
print(a.shape)
print(b.shape)
print(c.shape)
(2, 3)
(4,)
(2, 2, 3)
- 数组的维数
print(a.ndim)
print(b.ndim)
print(c.ndim)
2
1
3
- 数组中的元素数量
print(a.size)
print(b.size)
print(c.size)
6
4
12
- 数组元素的长度(字节)
print(a.itemsize)
print(b.itemsize)
print(c.itemsize)
8
8
8
- 数组元素的类型
print(a.dtype)
print(b.dtype)
print(c.dtype)
int64
int64
int64
2.2 ndarray的类型
np.bool
np.int8-64
np.uint8-64
np.float16-64
np.complex64-128
np.object_
np.bytes_ (旧名np.string_,NumPy 2.0起已移除)
np.str_ (旧名np.unicode_,NumPy 2.0起已移除)
# 创建数组时指定类型
a = np.array([[1,2,3], [4,5,6]], dtype=np.float32)
print(a)
a.dtype
[[1. 2. 3.]
[4. 5. 6.]]
dtype('float32')
3 基本操作
3.1 生成0和1的数组
zero = np.zeros([3,4])
zero
array([[0., 0., 0., 0.],
[0., 0., 0., 0.],
[0., 0., 0., 0.]])
ones = np.ones_like(zero)
ones
array([[1., 1., 1., 1.],
[1., 1., 1., 1.],
[1., 1., 1., 1.]])
3.2 从现有数组生成
a = np.array([[1,2,3],[4,5,6]])
# 深拷贝
a1 = np.array(a)
# 不拷贝: a已经是ndarray时, asarray直接返回同一个对象(与a共享数据)
a2 = np.asarray(a)
a
array([[1, 2, 3],
[4, 5, 6]])
a[1] = 0
a
array([[1, 2, 3],
[0, 0, 0]])
a1
array([[1, 2, 3],
[4, 5, 6]])
a2
array([[1, 2, 3],
[0, 0, 0]])
a = np.array([[1,2,3],[4,5,6]])
# 深拷贝
a1 = np.array(a)
# 不拷贝: a已经是ndarray时, asarray直接返回同一个对象(与a共享数据)
a2 = np.asarray(a)
a[1][0] = 1
a
array([[1, 2, 3],
[1, 5, 6]])
a1
array([[1, 2, 3],
[4, 5, 6]])
a2
array([[1, 2, 3],
[1, 5, 6]])
3.3 生成固定范围的数组
# 生成等间隔的序列
np.linspace(0, 100, 11)
array([ 0., 10., 20., 30., 40., 50., 60., 70., 80., 90., 100.])
# 类似range
np.arange(10, 50, 2)
array([10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42,
44, 46, 48])
# 生成10^x
np.logspace(0, 2, 3)
array([ 1., 10., 100.])
3.4 生成随机数组
# 均匀分布
x1 = np.random.uniform(-1, 1, 1000000)
x1
array([-0.64396627, 0.45418466, 0.18690437, ..., 0.24564599,
-0.60393121, 0.42554518])
import matplotlib.pyplot as plt
plt.figure()
plt.hist(x=x1, bins=1000)
plt.show()
# 正态分布
x2 = np.random.normal(1.75, 1, 100000000)
plt.figure()
plt.hist(x2, 1000)
plt.show()
# 随机生成8只股票2周的交易日涨幅数据
stock_change = np.random.normal(0, 1, (8, 10))
stock_change
array([[-0.39505242, 0.90826077, -0.21782191, -2.579644 , -1.80536267,
0.0761092 , 0.22959365, -0.27668752, -0.61130237, -1.32768396],
[ 0.55786366, -0.52183062, -0.58764324, -0.83781052, 1.44944257,
0.30971043, -1.70501328, 0.11655112, -0.78857542, 0.15397265],
[-0.87916493, 0.16126835, -0.37054848, 0.20157712, 0.05141507,
0.84445257, 0.19210461, -0.12467897, 0.15308413, -0.45694432],
[ 1.25422764, -0.50528106, -1.74411687, -0.45714352, 0.19273938,
-0.27121733, 1.41678988, 1.77644812, 0.34032522, -0.29910002],
[-0.79216497, -0.98200844, 0.53838896, -1.53033656, -0.9257245 ,
-1.72211499, 1.38017619, 0.41968168, -1.7403562 , -0.72412437],
[-0.28304912, -1.64657941, -0.95650248, 0.64653616, -1.07762432,
-1.17113291, -0.71637017, 0.28357741, 0.40984885, -0.20729397],
[-2.13329804, -0.46327641, -0.66551884, 0.37347056, 0.95767147,
0.29573325, 1.72222028, 0.89640279, 0.66437657, 0.09138068],
[ 2.10734775, -0.67210195, 1.43512674, 2.28827819, -0.2605319 ,
-2.26863744, -0.84161926, -0.07581769, -1.65233412, 0.03896541]])
3.5 数组的索引、切片
# 二维的数组
stock_change[0, 0:3]
array([-0.39505242, 0.90826077, -0.21782191])
stock_change[3:, 3:]
array([[-0.45714352, 0.19273938, -0.27121733, 1.41678988, 1.77644812,
0.34032522, -0.29910002],
[-1.53033656, -0.9257245 , -1.72211499, 1.38017619, 0.41968168,
-1.7403562 , -0.72412437],
[ 0.64653616, -1.07762432, -1.17113291, -0.71637017, 0.28357741,
0.40984885, -0.20729397],
[ 0.37347056, 0.95767147, 0.29573325, 1.72222028, 0.89640279,
0.66437657, 0.09138068],
[ 2.28827819, -0.2605319 , -2.26863744, -0.84161926, -0.07581769,
-1.65233412, 0.03896541]])
stock_change[:-3, :-3]
array([[-0.39505242, 0.90826077, -0.21782191, -2.579644 , -1.80536267,
0.0761092 , 0.22959365],
[ 0.55786366, -0.52183062, -0.58764324, -0.83781052, 1.44944257,
0.30971043, -1.70501328],
[-0.87916493, 0.16126835, -0.37054848, 0.20157712, 0.05141507,
0.84445257, 0.19210461],
[ 1.25422764, -0.50528106, -1.74411687, -0.45714352, 0.19273938,
-0.27121733, 1.41678988],
[-0.79216497, -0.98200844, 0.53838896, -1.53033656, -0.9257245 ,
-1.72211499, 1.38017619]])
3.6 形状修改
stock_change.reshape([10, 8])
array([[-0.39505242, 0.90826077, -0.21782191, -2.579644 , -1.80536267,
0.0761092 , 0.22959365, -0.27668752],
[-0.61130237, -1.32768396, 0.55786366, -0.52183062, -0.58764324,
-0.83781052, 1.44944257, 0.30971043],
[-1.70501328, 0.11655112, -0.78857542, 0.15397265, -0.87916493,
0.16126835, -0.37054848, 0.20157712],
[ 0.05141507, 0.84445257, 0.19210461, -0.12467897, 0.15308413,
-0.45694432, 1.25422764, -0.50528106],
[-1.74411687, -0.45714352, 0.19273938, -0.27121733, 1.41678988,
1.77644812, 0.34032522, -0.29910002],
[-0.79216497, -0.98200844, 0.53838896, -1.53033656, -0.9257245 ,
-1.72211499, 1.38017619, 0.41968168],
[-1.7403562 , -0.72412437, -0.28304912, -1.64657941, -0.95650248,
0.64653616, -1.07762432, -1.17113291],
[-0.71637017, 0.28357741, 0.40984885, -0.20729397, -2.13329804,
-0.46327641, -0.66551884, 0.37347056],
[ 0.95767147, 0.29573325, 1.72222028, 0.89640279, 0.66437657,
0.09138068, 2.10734775, -0.67210195],
[ 1.43512674, 2.28827819, -0.2605319 , -2.26863744, -0.84161926,
-0.07581769, -1.65233412, 0.03896541]])
# -1表示该维度的大小由numpy根据元素总数自动推算
stock_change.reshape([-1, 20])
array([[-0.39505242, 0.90826077, -0.21782191, -2.579644 , -1.80536267,
0.0761092 , 0.22959365, -0.27668752, -0.61130237, -1.32768396,
0.55786366, -0.52183062, -0.58764324, -0.83781052, 1.44944257,
0.30971043, -1.70501328, 0.11655112, -0.78857542, 0.15397265],
[-0.87916493, 0.16126835, -0.37054848, 0.20157712, 0.05141507,
0.84445257, 0.19210461, -0.12467897, 0.15308413, -0.45694432,
1.25422764, -0.50528106, -1.74411687, -0.45714352, 0.19273938,
-0.27121733, 1.41678988, 1.77644812, 0.34032522, -0.29910002],
[-0.79216497, -0.98200844, 0.53838896, -1.53033656, -0.9257245 ,
-1.72211499, 1.38017619, 0.41968168, -1.7403562 , -0.72412437,
-0.28304912, -1.64657941, -0.95650248, 0.64653616, -1.07762432,
-1.17113291, -0.71637017, 0.28357741, 0.40984885, -0.20729397],
[-2.13329804, -0.46327641, -0.66551884, 0.37347056, 0.95767147,
0.29573325, 1.72222028, 0.89640279, 0.66437657, 0.09138068,
2.10734775, -0.67210195, 1.43512674, 2.28827819, -0.2605319 ,
-2.26863744, -0.84161926, -0.07581769, -1.65233412, 0.03896541]])
# resize会就地修改原数组的形状(reshape不改变原数组, 而是返回新的数组对象)
stock_change.resize([10, 8])
stock_change
array([[-0.39505242, 0.90826077, -0.21782191, -2.579644 , -1.80536267,
0.0761092 , 0.22959365, -0.27668752],
[-0.61130237, -1.32768396, 0.55786366, -0.52183062, -0.58764324,
-0.83781052, 1.44944257, 0.30971043],
[-1.70501328, 0.11655112, -0.78857542, 0.15397265, -0.87916493,
0.16126835, -0.37054848, 0.20157712],
[ 0.05141507, 0.84445257, 0.19210461, -0.12467897, 0.15308413,
-0.45694432, 1.25422764, -0.50528106],
[-1.74411687, -0.45714352, 0.19273938, -0.27121733, 1.41678988,
1.77644812, 0.34032522, -0.29910002],
[-0.79216497, -0.98200844, 0.53838896, -1.53033656, -0.9257245 ,
-1.72211499, 1.38017619, 0.41968168],
[-1.7403562 , -0.72412437, -0.28304912, -1.64657941, -0.95650248,
0.64653616, -1.07762432, -1.17113291],
[-0.71637017, 0.28357741, 0.40984885, -0.20729397, -2.13329804,
-0.46327641, -0.66551884, 0.37347056],
[ 0.95767147, 0.29573325, 1.72222028, 0.89640279, 0.66437657,
0.09138068, 2.10734775, -0.67210195],
[ 1.43512674, 2.28827819, -0.2605319 , -2.26863744, -0.84161926,
-0.07581769, -1.65233412, 0.03896541]])
# 转置
stock_change.shape
(10, 8)
stock_change.T.shape
(8, 10)
3.7 类型修改
stock_change.astype(np.int32)
array([[ 0, 0, 0, -2, -1, 0, 0, 0],
[ 0, -1, 0, 0, 0, 0, 1, 0],
[-1, 0, 0, 0, 0, 0, 0, 0],
[ 0, 0, 0, 0, 0, 0, 1, 0],
[-1, 0, 0, 0, 1, 1, 0, 0],
[ 0, 0, 0, -1, 0, -1, 1, 0],
[-1, 0, 0, -1, 0, 0, -1, -1],
[ 0, 0, 0, 0, -2, 0, 0, 0],
[ 0, 0, 1, 0, 0, 0, 2, 0],
[ 1, 2, 0, -2, 0, 0, -1, 0]], dtype=int32)
# 转换为bytes(tostring自NumPy 1.19起已弃用, 较新版本中已移除, 新代码请使用arr.tobytes())
arr = np.array([[[1, 2, 3], [4, 5, 6]], [[12, 3, 34], [5, 6, 7]]])
arr.tostring()
b'\x01\x00\x00\x00\x00\x00\x00\x00\x02\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00\x04\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\x00\x00\x00\x00\x0c\x00\x00\x00\x00\x00\x00\x00\x03\x00\x00\x00\x00\x00\x00\x00"\x00\x00\x00\x00\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x06\x00\x00\x00\x00\x00\x00\x00\x07\x00\x00\x00\x00\x00\x00\x00'
3.8 数组的去重
temp = np.array([[1, 2, 3, 4], [3, 4, 5, 6]])
np.unique(temp)
array([1, 2, 3, 4, 5, 6])
4 ndarray运算
4.1 逻辑运算
stock_change = np.random.normal(0, 1, (8, 10))
tmp = stock_change[0:5, 0:5]
tmp
array([[-0.09452151, 0.21494329, -0.88702803, 1.27200298, 1.2467966 ],
[ 0.22267303, 0.2302336 , 0.07200738, 0.03890387, 0.3672513 ],
[ 1.16974078, 0.61063169, 0.7513086 , -2.85986462, 0.96363815],
[-0.89594979, -0.31032916, 1.75412849, -0.89782976, -2.49554601],
[-0.11609947, -0.45015486, 0.86779026, 0.99905792, 1.14721481]])
tmp > 0
array([[False, True, False, True, True],
[ True, True, True, True, True],
[ True, True, True, False, True],
[False, False, True, False, False],
[False, False, True, True, True]])
tmp[tmp > 0.5] = 1
tmp
array([[-0.09452151, 0.21494329, -0.88702803, 1. , 1. ],
[ 0.22267303, 0.2302336 , 0.07200738, 0.03890387, 0.3672513 ],
[ 1. , 1. , 1. , -2.85986462, 1. ],
[-0.89594979, -0.31032916, 1. , -0.89782976, -2.49554601],
[-0.11609947, -0.45015486, 1. , 1. , 1. ]])
4.2 通用判断函数
# 是否全大于0
np.all(stock_change[0:2, 0:5] > 0)
False
# 判断是否有大于0的
np.any(stock_change[0:2, 0:5] > 0)
True
4.3 三元运算符
# 大于0的置1,否则为0
temp = stock_change[:4, :4]
np.where(temp > 0, 1, 0)
array([[0, 1, 0, 1],
[1, 1, 1, 1],
[1, 1, 1, 0],
[0, 0, 1, 0]])
# 逻辑与
np.where(np.logical_and(temp > 0, temp < 0.5), 1, 0)
array([[0, 1, 0, 0],
[1, 1, 1, 1],
[0, 0, 0, 0],
[0, 0, 0, 0]])
# 逻辑或
np.where(np.logical_or(temp > 0.5, temp < - 0.5), 1, 0)
array([[0, 0, 1, 1],
[0, 0, 0, 0],
[1, 1, 1, 1],
[1, 0, 1, 1]])
4.4 统计运算
temp
array([[-0.09452151, 0.21494329, -0.88702803, 1. ],
[ 0.22267303, 0.2302336 , 0.07200738, 0.03890387],
[ 1. , 1. , 1. , -2.85986462],
[-0.89594979, -0.31032916, 1. , -0.89782976]])
# 最大值, axis=1表示按行统计(每一行得到一个结果)
np.max(temp, axis=1)
array([1. , 0.2302336, 1. , 1. ])
# 最小值
np.min(temp, axis=1)
array([-0.88702803, 0.03890387, -2.85986462, -0.89782976])
# 中位数
np.median(temp, axis=1)
array([ 0.06021089, 0.1473402 , 1. , -0.60313948])
# 平均数
np.mean(temp, axis=1)
array([ 0.05834844, 0.14095447, 0.03503384, -0.27602718])
# 标准差
np.std(temp, axis=1)
array([0.67607978, 0.08633758, 1.67137041, 0.77465548])
# 方差
np.var(temp, axis=1)
array([0.45708387, 0.00745418, 2.79347904, 0.60009112])
# 返回最大值所在的索引
np.argmax(temp, axis=1)
array([3, 1, 0, 2])
# 返回最小值所在的索引
np.argmin(temp, axis=1)
array([2, 3, 3, 3])
5 数组间的运算
5.1 数组与数的运算
arr = np.array([[1,2,3,2,1,4], [5,6,1,2,3,1]])
arr
array([[1, 2, 3, 2, 1, 4],
[5, 6, 1, 2, 3, 1]])
arr + 1
array([[2, 3, 4, 3, 2, 5],
[6, 7, 2, 3, 4, 2]])
arr - 1
array([[0, 1, 2, 1, 0, 3],
[4, 5, 0, 1, 2, 0]])
arr * 2
array([[ 2, 4, 6, 4, 2, 8],
[10, 12, 2, 4, 6, 2]])
arr / 2
array([[0.5, 1. , 1.5, 1. , 0.5, 2. ],
[2.5, 3. , 0.5, 1. , 1.5, 0.5]])
5.2 数组与数组运算
# 形状为(1,)的数组(只有一个元素)可以通过广播与任意形状的数组运算
a = np.array([[1,2,3],[3,4,5]])
a
array([[1, 2, 3],
[3, 4, 5]])
b = np.array([2])
a + b
array([[3, 4, 5],
[5, 6, 7]])
print(a.shape)
print(b.shape)
(2, 3)
(1,)
# 从末尾维度开始对齐, 对应维度长度相等(或其中一个为1)时也可运算
c = np.array([2,2,2])
print(c.shape)
c
(3,)
array([2, 2, 2])
a + c
array([[3, 4, 5],
[5, 6, 7]])
d = np.array([[1], [2]])
print(d.shape)
(2, 1)
a + d
array([[2, 3, 4],
[5, 6, 7]])
e = np.array([[1], [2], [3]])
e.shape
(3, 1)
a + e
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-138-dba895c55d37> in <module>
----> 1 a + e
ValueError: operands could not be broadcast together with shapes (2,3) (3,1)
6 矩阵运算
6.1 矩阵乘法
a = np.array([[80, 86],
[82, 80],
[85, 78],
[90, 90],
[86, 82],
[82, 90],
[78, 80],
[92, 94]])
b = np.array(([0.7], [0.3]))
np.matmul(a, b)
array([[81.8],
[81.4],
[82.9],
[90. ],
[84.8],
[84.4],
[78.6],
[92.6]])
80 * 0.7 + 86 * 0.3
81.8
np.dot(a, b)
array([[81.8],
[81.4],
[82.9],
[90. ],
[84.8],
[84.4],
[78.6],
[92.6]])
# np.matmul不支持标量与数组相乘(np.dot支持)
np.matmul(a, 3)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-144-f843891049e9> in <module>
----> 1 np.matmul(a, 3)
ValueError: matmul: Input operand 1 does not have enough dimensions (has 0, gufunc core with signature (n?,k),(k,m?)->(n?,m?) requires 1)
np.dot(a, 3)
array([[240, 258],
[246, 240],
[255, 234],
[270, 270],
[258, 246],
[246, 270],
[234, 240],
[276, 282]])