Numpy基础(二)
文章目录
通用函数:快速的元素级数组函数
- NumPy提供一些通用函数(ufunc)对ndarray中的数据执行元素级运算,可以看作是简单函数的矢量化包装器
- 这些ufunc对数组进行计算后返回的是新的array,而不是视图
- Ufuncs可以接受一个out可选参数,这样就能在数组原地进行操作
import numpy as np
# Unary ufunc demo on the integers 0..9.
arr = np.arange(10)
print(arr)
print("\n")
print(np.sqrt(arr)) # element-wise square root; equivalent to arr**0.5
print("\n")
print(np.exp(arr)) # e raised to the power of each element (e**x)
# Write the square roots back into arr itself; casting='unsafe' allows the
# float results to be stored in the integer array, which truncates them.
print(np.sqrt(arr, out=arr,casting='unsafe'))
# np.info(np.sqrt)
[0 1 2 3 4 5 6 7 8 9]
[0. 1. 1.41421356 1.73205081 2. 2.23606798
2.44948974 2.64575131 2.82842712 3. ]
[1.00000000e+00 2.71828183e+00 7.38905610e+00 2.00855369e+01
5.45981500e+01 1.48413159e+02 4.03428793e+02 1.09663316e+03
2.98095799e+03 8.10308393e+03]
[0 1 1 1 2 2 2 2 2 3]
- 有一些ufunc接受2个数组,即为二元ufunc,并返回一个结果数组:
- 虽然不常见,但是有些ufunc的确可以返回多个数组,modf可以返回浮点数数组的小数和整数部分
# Binary ufunc demo: element-wise maximum of two equally long sequences.
# NOTE(review): the original right-hand side of `x` was lost; an all-zero
# integer array is used because it reproduces the recorded result
# array([0, 1, 2, 3, 4, 5, 6, 7]) -- confirm against the original notebook.
x = np.zeros(8, dtype=int)
y = range(8)
np.maximum(x, y)  # compares the two inputs position by position and keeps the larger value
array([0, 1, 2, 3, 4, 5, 6, 7])
# Seven samples drawn with np.random.randn, each scaled by 5.
arr = np.random.randn(7)*5
arr
array([-1.50098304, -4.45358491, -0.03869159, 5.58791479, -0.29705489,
-0.05445814, 2.35812865])
remainder, whole_part = np.modf(arr) # returns two arrays: the fractional parts and the integral parts
print(remainder)
print(whole_part)
# In-place square root written back into arr.
# NOTE(review): the output recorded below has 10 elements, so it appears to
# come from a different `arr` than the 7-element one defined above -- confirm.
print(np.sqrt(arr, out = arr, casting = 'unsafe'))
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[0. 1. 1. 1. 2. 2. 2. 2. 2. 3.]
[0 1 1 1 1 1 1 1 1 1]
- 一些一元ufunc和二元ufunc
利用数组进行数据处理
- NumPy中可以将许多种数据处理任务表述为简洁的数组表达式,不需要写循环,比等价的纯python的方式快上一两个数量级。
# Evaluate sqrt(x^2 + y^2) on a regular grid without writing any loops.
points = np.arange(-5,5,0.01)  # 1000 values from -5 up to (but excluding) 5
print(points.shape)
x, y = np.meshgrid(points,points) # np.meshgrid takes two 1-D arrays and produces two 2-D matrices
print(x)
print(y)
z = np.sqrt(x**2 + y**2) # the whole grid is computed by writing the expression once
print(z)
import matplotlib.pyplot as plt
# Show the grid as a grayscale image with a colour bar.
plt.imshow(z,cmap=plt.cm.gray)
plt.colorbar()
(1000,)
[[-5. -4.99 -4.98 ... 4.97 4.98 4.99]
[-5. -4.99 -4.98 ... 4.97 4.98 4.99]
[-5. -4.99 -4.98 ... 4.97 4.98 4.99]
...
[-5. -4.99 -4.98 ... 4.97 4.98 4.99]
[-5. -4.99 -4.98 ... 4.97 4.98 4.99]
[-5. -4.99 -4.98 ... 4.97 4.98 4.99]]
[[-5. -5. -5. ... -5. -5. -5. ]
[-4.99 -4.99 -4.99 ... -4.99 -4.99 -4.99]
[-4.98 -4.98 -4.98 ... -4.98 -4.98 -4.98]
...
[ 4.97 4.97 4.97 ... 4.97 4.97 4.97]
[ 4.98 4.98 4.98 ... 4.98 4.98 4.98]
[ 4.99 4.99 4.99 ... 4.99 4.99 4.99]]
[[7.07106781 7.06400028 7.05693985 ... 7.04988652 7.05693985 7.06400028]
[7.06400028 7.05692568 7.04985815 ... 7.04279774 7.04985815 7.05692568]
[7.05693985 7.04985815 7.04278354 ... 7.03571603 7.04278354 7.04985815]
...
[7.04988652 7.04279774 7.03571603 ... 7.0286414 7.03571603 7.04279774]
[7.05693985 7.04985815 7.04278354 ... 7.03571603 7.04278354 7.04985815]
[7.06400028 7.05692568 7.04985815 ... 7.04279774 7.04985815 7.05692568]]
<matplotlib.colorbar.Colorbar at 0x1ee84d497b8>
将条件逻辑表述为数组运算
- np.where函数是三元表达式x if condition else y的矢量化版本
# Two value arrays and a boolean mask of the same length, used by np.where below.
xarr = np.array([1,2,3,4,5,6,7])
yarr = np.array([7,6,5,4,3,2,1])
cond = np.array([True,False, True, False,True,False,True])
- 依据cond内容,True选xarr,False选yarr
- 使用np.where可以满足,速度也很快,不需要写循环。这个要记住,这个思路以后可能会经常用到
- where的第二个和第三个参数不必是数组,都可以是标量值。
- 返回新的数组,不是操作视图
# Take the element from xarr where cond is True, otherwise from yarr.
print(np.where(cond,xarr,yarr))
arr = np.random.randn(4,4)
print(arr)
print(np.where(arr>1,2,-2)) # 2 where the value is greater than 1, otherwise -2
print(np.where(arr>0,2,arr)) # replace every positive value with 2, keep the others
arr  # unchanged: np.where returned new arrays
[1 6 3 4 5 2 7]
[[ 0.19435846 0.28274232 0.32573511 0.28187589]
[-0.72749785 1.08337871 -0.8629885 -1.51879924]
[-1.77115061 0.46954324 1.21010986 -1.3469705 ]
[-0.22028672 0.4561393 -0.88862844 -0.08331455]]
[[-2 -2 -2 -2]
[-2 2 -2 -2]
[-2 -2 2 -2]
[-2 -2 -2 -2]]
[[ 2. 2. 2. 2. ]
[-0.72749785 2. -0.8629885 -1.51879924]
[-1.77115061 2. 2. -1.3469705 ]
[-0.22028672 2. -0.88862844 -0.08331455]]
array([[ 0.19435846, 0.28274232, 0.32573511, 0.28187589],
[-0.72749785, 1.08337871, -0.8629885 , -1.51879924],
[-1.77115061, 0.46954324, 1.21010986, -1.3469705 ],
[-0.22028672, 0.4561393 , -0.88862844, -0.08331455]])
数学和统计方法
- 通过数学函数对整个数组或某个轴向的数据进行统计计算
- mean、sum等聚合计算函数(又称约简),既可以当做数组的实例方法调用,也可以当做顶级Numpy函数使用
# Aggregations over the whole 5x4 array.
arr = np.random.randn(5, 4)
print(arr)
print(arr.mean()) # mean of all elements
print(arr.sum())  # sum of all elements
[[ 1.46210794 -2.06014071 -0.3224172 -0.38405435]
[ 1.13376944 -1.09989127 -0.17242821 -0.87785842]
[ 0.04221375 0.58281521 -1.10061918 1.14472371]
[ 0.90159072 0.50249434 0.90085595 -0.68372786]
[-0.12289023 -0.93576943 -0.26788808 0.53035547]]
-0.041337920580385025
-0.8267584116077005
- 像mean这类的函数可以接受一个axis参数,可以计算该轴向上的统计值
print(arr.mean(axis=1)) # mean of each row
print(arr.mean(axis=0)) # mean of each column
arr_3 = np.random.randn(5,4,3)
print(arr_3)
print('\n')
print(arr_3.mean(0)) # average over axis 0: one value per (row, column) position of the 4x3 sub-arrays
print('\n')
print(arr_3.mean(1)) # average over axis 1: column means of each 4x3 sub-array
print('\n')
print(arr_3.mean(2)) # average over axis 2: row means of each 4x3 sub-array
[-0.06994235 0.2412955 0.08147337 0.56730433 -1.00257953]
[-0.20815596 0.47114449 0.33731851 -0.746266 ]
[[[ 0.07830096 -1.96180286 0.89118881]
[-2.49571863 1.05513663 0.38961159]
[ 0.85803898 1.50682098 0.67111429]
[-1.35147504 -0.72720465 -2.21545383]]
[[ 0.25962478 -0.34768104 -0.67087216]
[-0.21714028 0.44035738 -0.00776045]
[-1.90504688 -0.52714388 0.78302239]
[ 0.19075313 0.54157796 -1.03345287]]
[[ 1.27195894 0.46014859 0.00887002]
[ 1.81348694 2.04876962 -0.10752802]
[ 0.34766676 -1.40041568 0.81914608]
[-0.70296663 -0.17881243 1.11952713]]
[[-0.94365226 0.65744187 -1.11665377]
[ 2.03427083 -0.757184 0.25056046]
[ 0.15697967 -1.80151228 -0.96551638]
[-1.22388607 -1.53884369 0.74597209]]
[[-1.67295701 -0.86748644 -0.31351573]
[-0.17234924 -0.78582872 -0.87690668]
[-1.69154251 0.11910277 -0.57007868]
[-0.32087573 -0.2288285 -1.52472551]]]
[[-0.20134492 -0.41187598 -0.24019657]
[ 0.19250992 0.40025018 -0.07040462]
[-0.4467808 -0.42062962 0.14753754]
[-0.68169007 -0.42642226 -0.5816266 ]]
[[-0.72771343 -0.03176248 -0.06588479]
[-0.41795231 0.0267776 -0.23226577]
[ 0.6825365 0.23242253 0.4600038 ]
[ 0.00592804 -0.86002452 -0.2714094 ]
[-0.96443112 -0.44076022 -0.82130665]]
[[-0.33077103 -0.35032347 1.01199142 -1.43137784]
[-0.25297614 0.07181888 -0.54972279 -0.10037393]
[ 0.58032585 1.25157618 -0.07786761 0.07924936]
[-0.46762139 0.50921576 -0.87001633 -0.67225256]
[-0.95131973 -0.61169488 -0.71417281 -0.69147658]]
- cumsum和cumprod之类的方法不聚合,而是产生一个由中间结果组成的数组
print(arr.cumsum(0)) # running sum down each column
print(arr.cumsum())  # no axis given: running sum over the flattened array
print(arr.cumprod(0)) # running product down each column
[[ 1.46210794 -2.06014071 -0.3224172 -0.38405435]
[ 2.59587738 -3.16003198 -0.49484541 -1.26191277]
[ 2.63809113 -2.57721676 -1.59546459 -0.11718906]
[ 3.53968185 -2.07472242 -0.69460864 -0.80091692]
[ 3.41679162 -3.01049186 -0.96249672 -0.27056146]]
[ 1.46210794 -0.59803277 -0.92044998 -1.30450433 -0.17073489 -1.27062616
-1.44305436 -2.32091278 -2.27869903 -1.69588382 -2.796503 -1.65177929
-0.75018857 -0.24769423 0.65316172 -0.03056614 -0.15345636 -1.0892258
-1.35711388 -0.82675841]
[[ 1.46210794 -2.06014071 -0.3224172 -0.38405435]
[ 1.6576933 2.26593078 0.05559382 0.33714535]
[ 0.06997745 1.32061893 -0.06118763 0.38593827]
[ 0.06309102 0.66360354 -0.05512124 -0.26387675]
[-0.00775327 -0.62097991 0.01476632 -0.13994848]]
- 基本数组统计方法列表
用于布尔型数组的方法
- 在上述方法中,数组中的布尔值会被强制转换为1(True)和0(False)
arr = np.random.randn(100)
print(arr)
print((arr>0).sum()) # number of positive values: each True in the mask counts as 1
[-1.1335026 0.72343403 0.3465952 1.14244279 -1.27073565 -0.17111867
-0.3021788 1.50579257 0.7399095 0.79580789 -0.36452434 -0.49880213
2.20379505 -0.72577507 -1.08674631 0.5665405 -1.61739984 -0.75306127
-0.61559106 1.5810625 1.23050147 -1.13727769 -0.89395162 0.72485717
-0.06958396 -1.99434711 0.94882271 1.20435365 1.29296901 0.73033161
0.77468725 1.47184524 -0.07643467 0.72767344 0.64711457 -0.05419534
-1.14154973 -0.65191275 1.95098023 -0.45654514 0.02017175 -0.73657685
-0.22851128 0.60023293 0.55969221 -0.56932745 -1.35320988 1.13538343
-0.56742239 0.5195742 0.64113521 -0.1158107 1.50748428 2.41724119
0.79483581 0.90776966 0.10871141 -1.45992909 -0.41465447 1.1783231
-0.85700521 -3.38753961 -1.731907 -0.17761624 -1.89504539 -0.46823973
0.24277589 -0.38161382 0.13523375 -1.19156221 0.3318844 -0.29036875
1.21432632 0.63703822 -1.63746259 -0.87735742 0.57171971 1.84615953
2.08850964 0.32234139 -1.12732638 -0.35804732 1.33035368 -0.87435126
-0.40368867 -0.50802595 0.53276008 1.80532111 1.7578126 1.18043273
0.53178291 -1.10813947 -0.54112591 -2.11646532 1.35736663 0.35937393
0.61861754 -0.5248042 -2.07384679 -0.43362903]
50
- any用于检查数组中是否存在一个或多个True
- all检查数组中所有值是否都是True
- 二者可用于非布尔型数组,非0元素都会被当做True
arr.any()  # True if at least one element is truthy (non-zero)
True
arr.all()  # True only if every element is truthy (non-zero)
True
排序
- sort方法就地排序
- np.sort返回数组已排序的副本
- 多维数组可以在任一轴向排序
# Sorting demo: np.sort returns a sorted copy, while the .sort() method sorts in place.
arr = [1, 2, 2, 1, 4, 5, 6]
print(np.sort(arr))  # sorted copy as an ndarray; arr itself is not modified by this call
# .sort() reorders arr in place and returns None, which is why this prints None
# (ndarray.sort follows the same convention).
print(arr.sort())
arr_2 = np.sort(arr)
arr_2[int(0.05 * len(arr))]  # 5% quantile: the element 5% of the way into the sorted data
[1 1 2 2 4 5 6]
None
1
唯一化和其他集合逻辑
- 一些针对于一维ndarray的基本集合运算
- np.unique找出数组中的唯一值并返回已排序结果
- np.in1d测试一个数组中的值在另一个数组中的成员资格(NumPy 2.0起np.in1d已弃用,推荐使用等价的np.isin)
ints = np.array([1,2,3,1,2,5,6,3,1])
np.unique(ints)  # distinct values of ints, returned in sorted order
array([1, 2, 3, 5, 6])
# Boolean mask, one entry per element of ints: True where the value is in [5, 6, 7].
# NOTE(review): np.in1d is reported as deprecated in NumPy 2.0 in favour of np.isin -- confirm for the installed version.
np.in1d(ints, [5,6,7])
array([False, False, False, False, False, True, True, False, False])
- 数组的集合运算方法
用于数组的文件输入输出
将数组以二进制形式保存到磁盘
- np.save和np.load分别为写读磁盘数组数据的两个主要函数
- 默认情况下,数组是以未压缩的原始二进制格式保存在扩展名为.npy的文件中
- np.savez可以将多个数组保存到一个未压缩的.npz归档文件中(需要压缩时使用np.savez_compressed),数组以关键字参数的形式传入即可,读取时会返回类似字典的对象
arr = np.random.randn(8,4)
np.save('test_save', arr) # write the array to a binary file; it is read back below as 'test_save.npy'
np.load('test_save.npy') # read the binary array back from disk
array([[-0.65598051, 0.2311896 , 0.09114272, 0.92067987],
[-0.93836684, -0.79460303, 0.3342222 , 0.36203406],
[-1.19931299, 0.73225071, 0.52817081, 1.65447819],
[-1.21049035, 0.32960405, 0.36710208, -1.09386798],
[ 1.53483597, -0.42580468, -0.52048361, -1.07995041],
[-0.73761528, -0.98686676, -1.99642732, 1.1810512 ],
[ 0.21913021, 1.64000551, 0.42291343, 1.58658158],
[-0.85473226, 0.40523509, -1.6332945 , -0.54984155]])
arr1 = np.random.randn(3,4)
np.savez("test_multiply", a=arr, b=arr1) # several arrays passed as keyword arguments; the keywords become the lookup keys
arch = np.load('test_multiply.npz')
arch['a'] # the loaded archive is dict-like: index it with the keyword used when saving
array([[-0.65598051, 0.2311896 , 0.09114272, 0.92067987],
[-0.93836684, -0.79460303, 0.3342222 , 0.36203406],
[-1.19931299, 0.73225071, 0.52817081, 1.65447819],
[-1.21049035, 0.32960405, 0.36710208, -1.09386798],
[ 1.53483597, -0.42580468, -0.52048361, -1.07995041],
[-0.73761528, -0.98686676, -1.99642732, 1.1810512 ],
[ 0.21913021, 1.64000551, 0.42291343, 1.58658158],
[-0.85473226, 0.40523509, -1.6332945 , -0.54984155]])
存取文本文件
- np.loadtxt加载文本文件
- np.savetxt保存文本文件,以某种分隔符隔开
- np.genfromtxt面向结构化数组和缺失数据处理,与loadtxt类似
np.savetxt('test_txt.txt',arr, delimiter=',') # write arr to a comma-separated text file
# Read it back using the same delimiter.
arr_read = np.loadtxt('test_txt.txt', delimiter=',')
print(arr_read)
[[-0.65598051 0.2311896 0.09114272 0.92067987]
[-0.93836684 -0.79460303 0.3342222 0.36203406]
[-1.19931299 0.73225071 0.52817081 1.65447819]
[-1.21049035 0.32960405 0.36710208 -1.09386798]
[ 1.53483597 -0.42580468 -0.52048361 -1.07995041]
[-0.73761528 -0.98686676 -1.99642732 1.1810512 ]
[ 0.21913021 1.64000551 0.42291343 1.58658158]
[-0.85473226 0.40523509 -1.6332945 -0.54984155]]
print(np.ones(3))  # 1-D array of three ones
[1. 1. 1.]
线性代数
- numpy.linalg提供了一些最常用的线性代数函数(矩阵乘法、矩阵分解、行列式以及其他方阵数学等),对于数组的操作非常重要
- 常见的numpy.linalg函数
# x is 2x3 and y is 3x2, so their product is 2x2.
x = np.array([[1., 2., 3.], [4., 5., 6.]])
y = np.array([[6., 23.], [-1, 7], [8, 9]])
print(x.dot(y)) # matrix product, method form
print(np.dot(x, np.ones(3)))  # 2-D times 1-D gives a 1-D result (here, the row sums of x)
[[ 28. 64.]
[ 67. 181.]]
[ 6. 15.]
from numpy.linalg import inv,qr
X = np.random.randn(5,5)
mat = X.T.dot(X)  # X^T X: a square 5x5 matrix
print(inv(mat)) # inverse of the square matrix
[[ 0.22853744 -0.07891849 0.12043086 -0.20498048 0.1283345 ]
[-0.07891849 0.52982086 -0.07830477 0.18856211 -0.20667386]
[ 0.12043086 -0.07830477 0.16472534 -0.21537327 0.14842295]
[-0.20498048 0.18856211 -0.21537327 0.77848939 -0.3861419 ]
[ 0.1283345 -0.20667386 0.14842295 -0.3861419 0.41114584]]
# QR decomposition of mat.
q,r = qr(mat)
print(q) # orthogonal matrix
print(r) # upper-triangular matrix
[[-0.85378122 0.00440576 -0.47243889 0.08123632 0.20307727]
[-0.0254611 -0.89114067 -0.04965526 0.30951247 -0.32704193]
[ 0.51379591 -0.04579852 -0.82344039 0.02646601 0.23486536]
[-0.07985704 -0.01935158 -0.24706033 -0.74756134 -0.61103322]
[-0.0067806 -0.45097329 0.18773113 -0.58142435 0.65059961]]
[[-8.64036943 -0.27127236 10.34474286 -0.07564281 -1.26133882]
[ 0. -2.63608143 0.24333525 -1.05577151 -3.50137691]
[ 0. 0. -9.29578139 -1.86856972 2.05743395]
[ 0. 0. 0. -3.1109436 -4.33590672]
[ 0. 0. 0. 0. 1.58240592]]
随机数生成
- numpy.random提供了一些高效的生成多种概率分布样本值的函数
- python内置的random模块则一次只能生成一个样本值
- 可以用NumPy的np.random.seed更改随机数生成种子
- numpy.random的数据生成函数使用了全局的随机种子。要避免全局状态,你可以使用numpy.random.RandomState,创建一个与其它隔离的随机数生成器
- 一些numpy.random函数:
from random import normalvariate
N = 1000000
%timeit samples = [normalvariate(0,1) for _ in range(N)]
1.18 s ± 42.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit np.random.normal(size=N)
38 ms ± 1.43 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
np.random.seed(1) # seed the global generator
print(np.random.randn(10))
rng = np.random.RandomState(12) # separate generator with its own seed
print(rng.randn(10))
[ 1.62434536 -0.61175641 -0.52817175 -1.07296862 0.86540763 -2.3015387
1.74481176 -0.7612069 0.3190391 -0.24937038]
[ 0.47298583 -0.68142588 0.2424395 -1.70073563 0.75314283 -1.53472134
0.00512708 -0.12022767 -0.80698188 2.87181939]
随机漫步
从0开始,步长出现1和-1的概率相等,以纯Python和numpy两种方式实现1000步的随机漫步
%matplotlib notebook
import random
import matplotlib.pyplot as plt
# Random walk in pure Python: start at 0 and move +1 or -1 with equal probability.
position = 0
walk = [position]  # walk[k] is the position after k steps, so it ends with steps + 1 entries
steps = 1000
for _ in range(steps):
    # random.randint(0, 1) yields 0 or 1; 1 is truthy and maps to +1, 0 maps to -1.
    step = 1 if random.randint(0, 1) else -1
    position += step
    walk.append(position)
plt.plot(walk[:100])
<IPython.core.display.Javascript object>
[<matplotlib.lines.Line2D at 0x20392336e48>]
- numpy实现随机漫步
# Vectorised random walk: draw every coin flip at once, then accumulate.
nsteps = 1000
draws = np.random.randint(0,2, size=nsteps)  # 0 or 1 for each step
steps = np.where(draws>0, 1, -1)  # map 1 -> +1 and 0 -> -1
walk = steps.cumsum()  # position after each step
print(np.abs(walk)>10) # boolean mask of the points whose absolute position exceeds 10
print((np.abs(walk)>10).argmax()) # index of the first True in the boolean mask
plt.plot(walk[:200])
60
<IPython.core.display.Javascript object>
[<matplotlib.lines.Line2D at 0x20393232c50>]
一次模拟多个随机漫步
# Simulate many random walks at once: one walk per row, one step per column.
nwalks, nsteps = 5000, 1000
draws = np.random.randint(0, 2, size=(nwalks, nsteps))  # coin flips: 0 or 1
print(draws)
steps = np.where(draws == 0, -1, 1)  # 0 -> -1, 1 -> +1
print(steps)
walks = np.cumsum(steps, axis=1)  # running position along each row
print(walks)
# First crossing time of +50 / -50 for every walk that gets that far.
'''
TODO:
1.判断是否5000个过程都到达了50
2.
'''
far_enough = np.abs(walks) >= 50  # mask of positions at distance 50 or more from the origin
hits50 = far_enough.any(axis=1)  # per walk: did it ever get that far?
print(hits50.shape)
print(hits50.sum())
print(walks[hits50].shape)  # only the walks that reached the threshold
crossing_times = far_enough[hits50].argmax(axis=1)  # first step index at which each such walk crossed
print(crossing_times)
print(crossing_times.mean())
[[0 1 1 ... 1 1 0]
[1 0 0 ... 1 0 1]
[1 0 1 ... 0 1 0]
...
[0 0 0 ... 1 1 0]
[1 0 1 ... 1 0 1]
[0 1 1 ... 1 0 1]]
[[-1 1 1 ... 1 1 -1]
[ 1 -1 -1 ... 1 -1 1]
[ 1 -1 1 ... -1 1 -1]
...
[-1 -1 -1 ... 1 1 -1]
[ 1 -1 1 ... 1 -1 1]
[-1 1 1 ... 1 -1 1]]
[[ -1 0 1 ... 28 29 28]
[ 1 0 -1 ... -14 -15 -14]
[ 1 0 1 ... 14 15 14]
...
[ -1 -2 -3 ... -28 -27 -28]
[ 1 0 1 ... -52 -53 -52]
[ -1 0 1 ... -54 -55 -54]]
(5000,)
1156
(1156, 1000)
[835 839 669 ... 507 459 763]
667.2820069204153