文章目录
GitHub: https://github.com/RealEmperor/Python-for-Data-Analysis
numpy
import numpy as np
from numpy.random import randn
# Universal functions (ufuncs): fast element-wise operations on ndarrays.
arr = np.arange(0, 10)  # the integers 0 through 9
# Element-wise square root; the value is only echoed when run interactively.
np.sqrt(arr)
array([ 0. , 1. , 1.41421356, 1.73205081, 2. ,
2.23606798, 2.44948974, 2.64575131, 2.82842712, 3. ])
# Element-wise exponential (e ** x) of every entry in arr; the result is not
# stored, it is only echoed when run interactively.
np.exp(arr)
array([ 1.00000000e+00, 2.71828183e+00, 7.38905610e+00,
2.00855369e+01, 5.45981500e+01, 1.48413159e+02,
4.03428793e+02, 1.09663316e+03, 2.98095799e+03,
8.10308393e+03])
np.maximum 元素级最大值
# Two independent samples of 8 standard-normal values each.
x, y = randn(8), randn(8)
print(x, y, sep="\n")
# Element-wise maximum: entry i of the result is the larger of x[i] and y[i].
np.maximum(x, y)
[-1.03760196 -1.0035245 -0.19109603 2.27398057 -0.51605815 -1.25481649
-1.95118717 -0.09423245]
[-1.26195712 -0.70857631 -0.18729477 2.58847014 2.46277713 -1.04523397
1.13501218 1.3499591 ]
array([-1.03760196, -0.70857631, -0.18729477, 2.58847014, 2.46277713,
-1.04523397, 1.13501218, 1.3499591 ])
np.modf 按元素返回数组的小数部分和整数部分
# Seven standard-normal draws scaled by 5.
arr = 5 * randn(7)
print(arr)
# np.modf splits each element into its fractional and integral parts and
# returns them as a (fractional, integral) pair of arrays.
np.modf(arr)
[ 8.01175821 3.46248512 -4.11785287 1.34226648 0.40194097 5.81213218
-0.40446832]
(array([ 0.01175821, 0.46248512, -0.11785287, 0.34226648, 0.40194097,
0.81213218, -0.40446832]), array([ 8., 3., -4., 1., 0., 5., -0.]))
np.meshgrid 从坐标向量返回坐标矩阵
# --- Data processing using arrays ---
# Vectorization
points = np.arange(-5.0, 5.0, 0.01)  # 1000 equally spaced points
# np.meshgrid builds coordinate matrices from the two coordinate vectors:
# every row of xs is `points`, every column of ys is `points`.
xs, ys = np.meshgrid(points, points)
print(ys)
[[-5. -5. -5. ..., -5. -5. -5. ]
[-4.99 -4.99 -4.99 ..., -4.99 -4.99 -4.99]
[-4.98 -4.98 -4.98 ..., -4.98 -4.98 -4.98]
...,
[ 4.97 4.97 4.97 ..., 4.97 4.97 4.97]
[ 4.98 4.98 4.98 ..., 4.98 4.98 4.98]
[ 4.99 4.99 4.99 ..., 4.99 4.99 4.99]]
import matplotlib.pyplot as plt

# Distance of every grid point from the origin; xs and ys are the coordinate
# matrices produced by np.meshgrid earlier in the file.
z = np.sqrt(xs ** 2 + ys ** 2)
print(z)
# Render the distance matrix as a grayscale image with a colour scale.
plt.imshow(z, cmap=plt.cm.gray)
plt.colorbar()
# Raw string: the title contains the LaTeX command \sqrt, and "\s" in a
# normal string literal is an invalid escape sequence that Python warns about.
# The text passed to matplotlib is unchanged.
plt.title(r"Image plot of $\sqrt{x^2 + y^2}$ for a grid of values")
plt.draw()
[[ 7.07106781 7.06400028 7.05693985 ..., 7.04988652 7.05693985
7.06400028]
[ 7.06400028 7.05692568 7.04985815 ..., 7.04279774 7.04985815
7.05692568]
[ 7.05693985 7.04985815 7.04278354 ..., 7.03571603 7.04278354
7.04985815]
...,
[ 7.04988652 7.04279774 7.03571603 ..., 7.0286414 7.03571603
7.04279774]
[ 7.05693985 7.04985815 7.04278354 ..., 7.03571603 7.04278354
7.04985815]
[ 7.06400028 7.05692568 7.04985815 ..., 7.04279774 7.04985815
7.05692568]]
np.where
# Express conditional logic as array operations.
xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
cond = np.array([True, False, True, True, False])
# Pure-Python baseline: pick from xarr where cond is true, otherwise from
# yarr, one element at a time.
result = [
    x_val if take_x else y_val
    for x_val, y_val, take_x in zip(xarr, yarr, cond)
]
print(result)
[1.1000000000000001, 2.2000000000000002, 1.3, 1.3999999999999999, 2.5]
# Vectorized form of the selection: take the element from xarr where cond is
# True and from yarr where it is False. `result` is now an ndarray.
result = np.where(cond, xarr, yarr)
print(result)
[ 1.1 2.2 1.3 1.4 2.5]
arr = randn(4, 4)
print(arr)
is_positive = arr > 0  # boolean mask of the strictly positive entries
# Replace positives with 2 and everything else with -2.
print(np.where(is_positive, 2, -2))
# Set only positive values to 2; other entries keep their original value.
print(np.where(is_positive, 2, arr))
[[-0.09677059 -0.78473401 -0.00841639 1.39892368]
[-1.14999224 0.33586593 -0.1844864 0.47664971]
[-0.67508722 0.56130304 -0.8018509 0.07338623]
[ 0.10375292 1.44174994 0.42788598 -0.66850794]]
[[-2 -2 -2 2]
[-2 2 -2 2]
[-2 2 -2 2]
[ 2 2 2 -2]]
[[-0.09677059 -0.78473401 -0.00841639 2. ]
[-1.14999224 2. -0.1844864 2. ]
[-0.67508722 2. -0.8018509 2. ]
[ 2. 2. 2. -0.66850794]]
# Illustrative, non-executed sample kept in a string literal: three equivalent
# ways to map two boolean conditions to the codes 0/1/2/3. The sample names
# (n, cond1, cond2) are not defined in this file. The arithmetic variant uses
# `~` for boolean negation (NumPy rejects unary `-` on boolean arrays) and
# mutually exclusive terms so that it yields 0 when both conditions hold,
# matching the loop and the nested np.where. The bare literal that follows the
# block is the interactive echo of this string.
"""
# 多条件一般表示方法
# Not to be executed
result = []
for i in range(n):
    if cond1[i] and cond2[i]:
        result.append(0)
    elif cond1[i]:
        result.append(1)
    elif cond2[i]:
        result.append(2)
    else:
        result.append(3)

# 多条件where表示方法
# Not to be executed
np.where(cond1 & cond2, 0,
         np.where(cond1, 1,
                  np.where(cond2, 2, 3)))

# Not to be executed
result = 1 * (cond1 & ~cond2) + 2 * (cond2 & ~cond1) + 3 * ~(cond1 | cond2)
"""
'\n# 多条件一般表示方法\n# Not to be executed\nresult = []\nfor i in range(n):\n    if cond1[i] and cond2[i]:\n        result.append(0)\n    elif cond1[i]:\n        result.append(1)\n    elif cond2[i]:\n        result.append(2)\n    else:\n        result.append(3)\n\n# 多条件where表示方法\n# Not to be executed\nnp.where(cond1 & cond2, 0,\n         np.where(cond1, 1,\n                  np.where(cond2, 2, 3)))\n\n# Not to be executed\nresult = 1 * (cond1 & ~cond2) + 2 * (cond2 & ~cond1) + 3 * ~(cond1 | cond2)\n'
数学与统计方法
randn 标准正态分布数据
# Mathematical and statistical methods
arr = np.random.randn(5, 4)  # standard-normal data
# Overall mean (method and function form) and overall sum, one per line.
print(arr.mean(), np.mean(arr), arr.sum(), sep="\n")
print(arr.mean(axis=1))  # mean of each row
print(arr.sum(axis=0))  # sum of each column

arr = np.array([
    [0, 1, 2],
    [3, 4, 5],
    [6, 7, 8],
])
print(arr.cumsum(axis=0))  # running sum down each column
print(arr.cumprod(axis=1))  # running product along each row
0.299738473867
0.299738473867
5.99476947734
[ 0.33172725 -0.49981575 0.35973217 0.39621625 0.91083245]
[ 4.58629248 2.22968175 -1.88744743 1.06624268]
[[ 0 1 2]
[ 3 5 7]
[ 9 12 15]]
[[ 0 0 0]
[ 3 12 60]
[ 6 42 336]]
用于布尔型数组的方法
# Methods for boolean arrays
arr = randn(100)
# Number of positive values: True counts as 1 when summed. The count is
# computed here but neither stored nor printed.
(arr > 0).sum()
bools = np.array([False, False, True, False])
# any(): at least one True; all(): every element True.
print(bools.any(), bools.all(), sep="\n")
True
False
排序
# Sorting
arr = randn(8)
print(arr)
arr.sort(axis=-1)  # in-place ascending sort of the 1-D array
print(arr)

arr = randn(5, 3)
print(arr)
arr.sort(axis=1)  # in-place sort within each row
print(arr)
[-0.17018254 1.29292169 1.87999871 -0.25529225 1.1058983 -0.27456269
-1.17911236 0.30155365]
[-1.17911236 -0.27456269 -0.25529225 -0.17018254 0.30155365 1.1058983
1.29292169 1.87999871]
[[-0.31552106 0.95227657 0.08006334]
[ 0.86493167 0.66028869 0.56929258]
[-1.30046025 -1.03020373 -0.80371581]
[-0.74412785 0.2413104 -0.81418268]
[-1.16001837 -0.70517682 -0.5816708 ]]
[[-0.31552106 0.08006334 0.95227657]
[ 0.56929258 0.66028869 0.86493167]
[-1.30046025 -1.03020373 -0.80371581]
[-0.81418268 -0.74412785 0.2413104 ]
[-1.16001837 -0.70517682 -0.5816708 ]]
5%分位数
large_arr = randn(1000)
large_arr.sort()
# 5% quantile, read straight off the sorted sample: the element at index
# int(0.05 * number of elements).
large_arr[int(0.05 * large_arr.size)]
-1.7061490455426676
np.unique 唯一化 以及其他的集合逻辑
# 唯一化以及其他的集合逻辑
names = np.array(['Bob', 'Joe', 'Will', 'Bob', 'Will', 'Joe', 'Joe'])
np.unique