14天数据分析与机器学习实践之Day01——科学计算库Numpy应用总结

本文链接：https://blog.csdn.net/qq_45817449/article/details/107321683

14天数据分析与机器学习实践之Day01——科学计算库Numpy

Numpy是Python中科学计算的基础包。它是一个Python库，提供多维数组对象、各种派生对象（如掩码数组和矩阵），以及用于数组快速操作的各种API，包括数学、逻辑、形状操作、排序、选择、输入输出、离散傅立叶变换、基本线性代数、基本统计运算和随机模拟等等。

numpy.genfromtxt

import numpy
titanic=numpy.genfromtxt("titanic.txt", delimiter=",",dtype=str,skip_header=1)
print(type(titanic))
print(titanic)

dtype指定读入数据的类型；不设置dtype时默认为float，无法转换为数字的字段会变成nan，因此这里设置dtype=str以保留文本内容
delimiter=','表示数据由逗号分隔
skip_header关键字可以设置为整数，表示跳过文件开头对应的行数

numpy.zeros
用于创建元素值为0的矩阵

np.zeros ((3,4)) 
#output
array([[ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.]])

numpy.ones
用于创建元素为1的矩阵

np.ones( (2,3,4), dtype=np.int32 )
#output
array([[[1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1]],

       [[1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1]]])

numpy.arange
创建数组(等差)

#To create sequences of numbers
np.arange( 10, 30, 5 )
#从10起始到30结束(小于30)，每个加5
#output
array([10, 15, 20, 25])

numpy.random
创建数组(随机)

np.random.random((2,3))
#output
array([[ 0.40130659,  0.45452825,  0.79776512],
       [ 0.63220592,  0.74591134,  0.64130737]])

numpy.linspace

from numpy import pi
np.linspace( 0, 2*pi, 100 )
#在0到2*pi之间(含两端)取100个等间距的值
#output
array([ 0.        ,  0.06346652,  0.12693304,  0.19039955,  0.25386607,
        0.31733259,  0.38079911,  0.44426563,  0.50773215,  0.57119866,
        0.63466518,  0.6981317 ,  0.76159822,  0.82506474,  0.88853126,
        0.95199777,  1.01546429,  1.07893081,  1.14239733,  1.20586385,
        1.26933037,  1.33279688,  1.3962634 ,  1.45972992,  1.52319644,
        1.58666296,  1.65012947,  1.71359599,  1.77706251,  1.84052903,
        1.90399555,  1.96746207,  2.03092858,  2.0943951 ,  2.15786162,
        2.22132814,  2.28479466,  2.34826118,  2.41172769,  2.47519421,
        2.53866073,  2.60212725,  2.66559377,  2.72906028,  2.7925268 ,
        2.85599332,  2.91945984,  2.98292636,  3.04639288,  3.10985939,
        3.17332591,  3.23679243,  3.30025895,  3.36372547,  3.42719199,
        3.4906585 ,  3.55412502,  3.61759154,  3.68105806,  3.74452458,
        3.8079911 ,  3.87145761,  3.93492413,  3.99839065,  4.06185717,
        4.12532369,  4.1887902 ,  4.25225672,  4.31572324,  4.37918976,
        4.44265628,  4.5061228 ,  4.56958931,  4.63305583,  4.69652235,
        4.75998887,  4.82345539,  4.88692191,  4.95038842,  5.01385494,
        5.07732146,  5.14078798,  5.2042545 ,  5.26772102,  5.33118753,
        5.39465405,  5.45812057,  5.52158709,  5.58505361,  5.64852012,
        5.71198664,  5.77545316,  5.83891968,  5.9023862 ,  5.96585272,
        6.02931923,  6.09278575,  6.15625227,  6.21971879,  6.28318531])

dot和*
*为对应位置直接相乘
dot为矩阵乘法

#The matrix product can be performed using the dot function or method
A = np.array( [[1,1],
               [0,1]] )
B = np.array( [[2,0],
               [3,4]] )
print A
print B
#print A*B
print A.dot(B)
print np.dot(A, B) 
#output
[[1 1]
 [0 1]]
 
[[2 0]
 [3 4]]
 
[[5 4]
 [3 4]]
 
[[5 4]
 [3 4]]

矩阵的指数运算(exp)与开方

import numpy as np
B = np.arange(3)
print B
print np.exp(B)#以e为底的指数运算(e的B次幂)，不是乘方
print np.sqrt(B)#开方

生成随机矩阵

#Return the floor of the input生成一个3*4随机矩阵
a = np.floor(10*np.random.random((3,4)))
#将矩阵拉平变为1维
print a.ravel()

矩阵拼接
hstack横着拼
vstack竖着拼

a = np.floor(10*np.random.random((2,2)))
b = np.floor(10*np.random.random((2,2)))
print a
print '---'
print b
print '---'
print np.hstack((a,b))
#output
[[ 5.  6.]
 [ 1.  5.]]
---
[[ 8.  6.]
 [ 9.  0.]]
---
[[ 5.  6.  8.  6.]
 [ 1.  5.  9.  0.]]

矩阵切分
hsplit横着切
vsplit竖着切

a = np.floor(10*np.random.random((2,12)))
#print a
#print np.hsplit(a,3)
#print np.hsplit(a,(3,4))   # Split a after the third and the fourth column
a = np.floor(10*np.random.random((12,2)))
print a
np.vsplit(a,3)
#output
[[ 5.  2.]
 [ 1.  3.]
 [ 9.  6.]
 [ 2.  2.]
 [ 7.  2.]
 [ 8.  2.]
 [ 1.  7.]
 [ 2.  8.]
 [ 4.  4.]
 [ 8.  5.]
 [ 4.  3.]
 [ 2.  3.]]

[array([[ 5.,  2.],
        [ 1.,  3.],
        [ 9.,  6.],
        [ 2.,  2.]]), array([[ 7.,  2.],
        [ 8.,  2.],
        [ 1.,  7.],
        [ 2.,  8.]]), array([[ 4.,  4.],
        [ 8.,  5.],
        [ 4.,  3.],
        [ 2.,  3.]])]

numpy.view浅复制
numpy.copy深复制

取列值最大值

import numpy as np
#data = np.sin(np.arange(20)).reshape(5,4)
#print data
#ind = data.argmax(axis=0)
#print ind
data_max = data[ind, xrange(data.shape[1])]#取列值最大值
print data_max
all(data_max == data.max(axis=0))#True

行列成倍增加

a = np.arange(0, 40, 10)
b = np.tile(a, (3, 5)) #行方向重复3次，列方向重复5次
print b
#output
[[ 0 10 20 30  0 10 20 30  0 10 20 30  0 10 20 30  0 10 20 30]
 [ 0 10 20 30  0 10 20 30  0 10 20 30  0 10 20 30  0 10 20 30]
 [ 0 10 20 30  0 10 20 30  0 10 20 30  0 10 20 30  0 10 20 30]]

排序

import numpy as np
a = np.array([[4, 3, 5], [1, 2, 1]])
print (a)

a.sort(axis=1)
print(a)
a = np.array([4, 3, 1, 2])
j = np.argsort(a)
print (j)
print (a[j])
#output
[[4 3 5]
 [1 2 1]]
[[3 4 5]
 [1 1 2]]
[2 3 1 0]
[1 2 3 4]

基础操作

import numpy as np
a = np.array(['a','s','d','f'])
b = np.array([1,2,3,4])
c = np.array([4,5,6,7])
d = np.eye(4)

shape(各维度长度)
shape返回一个元组，列出每个维度的数组长度。

a.shape
#output
(4,)
d.shape
#output
(4, 4)

ndim(维度)

a.ndim
#output
1
d.ndim
#output
2

dtype(类型）
可以通过dtype来查看numpy数组元素的数据类型。

a.dtype
#output
dtype('<U1')
d.dtype
#output
dtype('float64')

指定数据类型
由于numpy会强制数据类型，因此，如果想指定数据类型的话可以这样操作。

 arr = np.array([1, 2.2, 3, 4.9],dtype = 'int32')
# output
 array([1, 2, 3, 4])
# 如果遇到无法转换，则会报错
 arr = np.array([1. , 2.2, 3. , 'a'],dtype = 'int32')
 ValueError: invalid literal for int() with base 10: 'a'

修改数据类型
numpy数据类型转换需要调用方法astype()，不能直接修改dtype。调用astype返回数据类型修改后的数据，但是源数据的类型不会变。

 arr = np.array([1 , 2.2, 3, 4.9])
 a = arr.astype(int)
# output
 array([1, 2, 3, 4])
 
 a = arr.astype(np.int64)
# output
 array([1, 2, 3, 4], dtype=int64)
 
 a = arr.astype(np.str)
# output
 array(['1.0', '2.2', '3.0', '4.9'], dtype='<U32')

itemsize(每个元素占用的字节数)

a.itemsize
#output
4

b.itemsize
#output
4

nbytes(总元素字节数）

 a.nbytes
# output
 16
 b.nbytes
# output
 16

fill(填充)

a.fill('a')
#output
array(['a', 'a', 'a', 'a'], dtype='<U1')

reshape(重塑)
在不改变原数据的情况下，重新按指定形状生成数组

>>> a = np.arange(1,26)
>>> a
array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25])
# 元素不匹配时，则会报错
>>> a.reshape(5,6)
ValueError: cannot reshape array of size 25 into shape (5,6)
>>> a.reshape(5,5)
array([[ 1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10],
       [11, 12, 13, 14, 15],
       [16, 17, 18, 19, 20],
       [21, 22, 23, 24, 25]])

sum(axis)
sum有一个axis参数，默认为None（对所有元素求和）。现设a是一个形状为(2,3,2)的数组，则：

axis=0(-3)时，则将a在第0维(倒数第3维)上求和，得到一个(3,2)的数组
axis=1(-2)时，则将a在第1维(倒数第2维)上求和，得到一个(2,2)的数组
axis=2(-1)时，则将a在第2维(倒数第1维)上求和，得到一个(2,3)的数组

 a = np.arange(12).reshape(2,3,2)
# output
 array([[[ 0,  1],
        [ 2,  3],
        [ 4,  5]],

       [[ 6,  7],
        [ 8,  9],
        [10, 11]]])
 a_0 = a.sum(axis = 0)
 a_1 = a.sum(axis = 1)
 a_2 = a.sum(axis = 2)
# output 分别为a_0, a_1, a_2
 array([[ 6,  8],
       [10, 12],
       [14, 16]])
 array([[ 6,  9],
       [24, 27]])
 array([[ 1,  5,  9],
       [13, 17, 21]])

keepdims
在sum函数中，参数keepdims默认是no value的。如果想让求和后的维度数保持不变（被求和的轴保留为长度1），则可以通过设置keepdims为True来实现。

 b = a.sum(axis = 2,keepdims = True)
# output
 array([[[ 1],
        [ 5],
        [ 9]],

       [[13],
        [17],
        [21]]])
b.shape
# output
 (2, 3, 1)

initial
通过initial可以设置初始值。

 a.sum()
# output
 66

a.sum(initial = 10)
# output
 76

切片

每当做切片时，numpy返回的是同一块内存缓冲区上的一个视图。numpy在大多数情况下尽量不创建副本：它不复制数据，只是指向内存中的同一个位置，仅新建一些关于形状和维度数量的元数据。这意味着对一个巨大数组做切片操作很廉价，同时也意味着修改切片会影响原数组。

一维切片
numpy数组切片和python基本上是一样的。

vector=numpy.array([5,10,15,20])
print(vector[0:3])

多维切片
以二维数组为例，先在行切片，找到指定行后再列切片。你可以简单理解为在不同维度切片后重叠区域。

1 2 9 4 3
0 5 3 5 1
8 3 2 4 7

>>> b[1,3:5]
array([5, 1])

>>> b[0:2,0:2]
array([[1, 2],
       [0, 5]])

>>> b[:,2]
array([9, 3, 2])

>>> b[::2,1::2]
array([[2, 4],
       [3, 4]])

通过位置索引
在numpy的数组可以通过列表批量传入索引值来进行索引。

 a = np.arange(8)
 indices = [1,2,-2]
 b = a[indices]
# output
 array([1, 2, 6])

# 也可以直接传入行和列（对应位置组成点）
a = np.arange(36).reshape([6,6])
# output
array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23],
       [24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35]])
 b = a[[0,1,2,3,4],[1,2,3,4,5]]
 # output
  array([ 1,  8, 15, 22, 29])

布尔索引
我们也可以通过布尔的真假值来进行索引。

 a = np.arange(10)
 b = a < 5
# output 返回的是一个bool数组
 array([True, True, True, True, True, False, False, False, False, False])

 a[a < 5] = 0
 print(a)
# output
 array([0, 0, 0, 0, 0, 5, 6, 7, 8, 9])
 
# 注意布尔数组的长度应该与a的长度一致
 c = np.array([0,1,0,0,0,1,0,1,0,1],dtype = bool)
 a[c]
# output
 array([1, 5, 7, 9])

# 值得注意的是，只有0才为False
In [1]: c = np.array([1,2,3,4,0,0,-5],dtype = bool)
Out[2]: array([ True,  True,  True,  True, False, False,  True])