数组的创建+数据类型+数组的形状+数组的计算+numpy常用操作（索引和切片+转置+数值修改+nan+数学运算+nan值填充+数组拼接和交换）+youtube视频案例分析

import numpy as np

#使用numpy生成数组，得到ndarray的类型
t1 = np.array([1, 2, 3, ])
print(t1)
#[1 2 3]
print(type(t1))
#<class 'numpy.ndarray'>


t2 = np.array(range(10))
print(t2)
#[0 1 2 3 4 5 6 7 8 9]

t3 = np.arange(10)
print(t3)
#[0 1 2 3 4 5 6 7 8 9]

t4 = np.arange(4, 10, 2)
print(t4)
#[4 6 8]

2、数组的类型

（1）判断数组的数据类型

# Re-create the 0..9 integer array whose dtype is inspected below.
# Top-level statements must start at column 0; a stray leading space here
# raises IndentationError.
t3 = np.arange(10)
print(t3)
#[0 1 2 3 4 5 6 7 8 9]


print(t3.dtype)
#int32
这是一种特殊的类型，只有在numpy中会体现位数

（2）指定数据类型

#指定数据类型
t5 = np.array(range(1, 4), dtype=float)
print(t5)
#[1. 2. 3.]
print(t5.dtype)
#float64

t6 = np.array([1, 0, 1, 1, 0, 1, 1, 1, 0, 0], dtype=bool)
print(t6)
print(t6.dtype)
'''
[ True False  True  True False  True  True  True False False]
bool
'''

（3）调整数据类型

#调整数据类型
t7 = t6.astype('int8')
print(t7)
#[1 0 1 1 0 1 1 1 0 0]

（4）随机生成数

import numpy as np
import random
#numpy中的小数


t8 = np.array([random.random() for i in range(10)])
#生成10个小数
print(t8)
'''
[0.18085992 0.13684474 0.09817757 0.0194122  0.50201785 0.36771782
 0.81244784 0.13172148 0.83574047 0.96088854]
'''
print(t8.dtype)
#float64

t9 = np.round(t8, 2)
#对t8取两位小数
print(t9)
#[0.27 0.19 0.99 0.97 0.83 0.89 0.09 0.94 0.75 0.97]

print(round(random.random(), 3))
#0.237 保留随机数的三位小数

3、数组的形状

（1）一维数组

import numpy as np

t1 = np.arange(12)
print(t1)
#[ 0  1  2  3  4  5  6  7  8  9 10 11]
print(t1.shape)
#(12,) t1是一维数组 元组中只有一个值，我们将其视作为一维数组

（2）二维数组

t2 = np.array([[1, 2, 3], [4, 5, 6]])
print(t2)
'''
[[1 2 3]
 [4 5 6]]
'''
print(t2.shape)
#(2, 3) 两行三列的二维数组

（3）三维数组

t3 = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])
print(t3)
'''
[[[ 1  2  3]
  [ 4  5  6]]

 [[ 7  8  9]
  [10 11 12]]]
'''
print(t3.shape)
#(2, 2, 3) 是一个三维数组

第一个数2表示的是空间数，有两个空间，第二个数表示的是每个空间里的二维数组的行数，第三个数表示的是列

（4）改变数组的形状

数组.reshape((行数, 列数))：在reshape方法中传入表示新形状的元组

t4 = np.arange(12)
t4.reshape((3, 4))
#将12个数分解成3行4列的二维数组
print(t4.reshape((3, 4)))
'''
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]

'''

t5 = np.arange(24).reshape((2, 3, 4))
print(t5)
#第一个数指的是块数，后面两个数是每块里面的二维数组是几行几列的
'''
[[[ 0  1  2  3]
  [ 4  5  6  7]
  [ 8  9 10 11]]

 [[12 13 14 15]
  [16 17 18 19]
  [20 21 22 23]]]

'''

print(t5.reshape((4, 6)))
#t5本身并没有发生变化，reshape是有一个return返回值的
'''
[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]]
'''

print(t5.reshape((24, )))
#[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]


print(t5.reshape((24, 1)))
#是一个二维数组，24行1列
'''
[[ 0]
 [ 1]
 [ 2]
 [ 3]
 [ 4]
 [ 5]
 [ 6]
 [ 7]
 [ 8]
 [ 9]
 [10]
 [11]
 [12]
 [13]
 [14]
 [15]
 [16]
 [17]
 [18]
 [19]
 [20]
 [21]
 [22]
 [23]]

'''

print(t5.reshape(1, 24))
#是一个二维数组，一行24列
#[[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]]

'''
t6 = t5.reshape((t5.shape[0] * t5.shape[1] * t5.shape[2], ))
                   #t5是形状为(2, 3, 4)的三维数组：shape[0]是块数，shape[1]是行，shape[2]是列，三者相乘才是t5所包含的数的个数

print(t6)
'''

print(t5.flatten())
#[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23]
#将t5按照行展开成一维数组

4、数组的计算

（1）数组加减乘除一个常数

import numpy as np

t5 = np.arange(24).reshape(4, 6)
print(t5)
'''
[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]]
'''
print('#######################################')
print(t5 + 2)
'''
[[ 2  3  4  5  6  7]
 [ 8  9 10 11 12 13]
 [14 15 16 17 18 19]
 [20 21 22 23 24 25]]
'''
print('#######################################')
print(t5 * 2)
'''
[[ 0  2  4  6  8 10]
 [12 14 16 18 20 22]
 [24 26 28 30 32 34]
 [36 38 40 42 44 46]]
'''
print('#######################################')
print(t5 / 2)
'''
[[ 0.   0.5  1.   1.5  2.   2.5]
 [ 3.   3.5  4.   4.5  5.   5.5]
 [ 6.   6.5  7.   7.5  8.   8.5]
 [ 9.   9.5 10.  10.5 11.  11.5]]
'''

（2）两个数组之间的计算


import numpy as np

t5 = np.arange(24).reshape(4, 6)
print(t5)
'''
[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]]
'''

import numpy as np

# Second operand for the element-wise examples: a 4x6 array holding 100..123.
# The argument separator must be an ASCII comma; a full-width "，" is a
# SyntaxError.
t6 = np.arange(100, 124).reshape(4, 6)
print(t6)
'''
[[100 101 102 103 104 105]
 [106 107 108 109 110 111]
 [112 113 114 115 116 117]
 [118 119 120 121 122 123]]
'''

print(t5 + t6)
'''
[[100 102 104 106 108 110]
 [112 114 116 118 120 122]
 [124 126 128 130 132 134]
 [136 138 140 142 144 146]]
'''

广播原则

如果两个数组的后缘维度（即从末尾开始算起的维度）的轴长度相符，或者其中一方的长度为1，则认为它们是广播兼容的。广播会在缺失和（或）长度为1的维度上进行。

在二维数组中，如果其中一个数组的行数或列数为1，并且它在另一个方向上的长度与参与计算的另一个数组对应方向的长度相同，就可以计算；此外，两个数组的形状完全相同时当然也可以计算。

t7 = np.arange(0, 6)
print(t7)
#[0 1 2 3 4 5]

print(t5 - t7)
'''
[[ 0  0  0  0  0  0]
 [ 6  6  6  6  6  6]
 [12 12 12 12 12 12]
 [18 18 18 18 18 18]]
'''

t8 = np.arange(0, 4).reshape(4, 1)
print(t8)
'''
[[0]
 [1]
 [2]
 [3]]

'''

print(t5 - t8)
#t5的每一列都减掉t8
'''
[[ 0  1  2  3  4  5]
 [ 5  6  7  8  9 10]
 [10 11 12 13 14 15]
 [15 16 17 18 19 20]]
'''

在三维数组中同样遵循广播原则：从最后一个维度开始向前逐一比较，只要每个维度上的长度相同，或者其中一方为1（或缺失），两个数组即可进行计算。例如形状为(3, 3, 2)的数组可以和形状为(3, 2)或(3, 1)的数组进行计算。

三、轴

在numpy中可以理解为方向,使用0,1,2...数字表示,对于一个一维数组,只有一个0轴,对于2维数组(shape(2,2)),有0轴和1轴,对于三维数组(shape(2,2, 3)),有0,1,2轴

二维数组的轴

三维数组的轴

四、numpy的常用操作

1、索引和切片

import numpy as np
from matplotlib import pyplot as plt

us_file_path = r"D:\developer_tools\python\机器学习库\numpy\youtube_videos_data\us_videos_data_numbers.CSV"
uk_file_path = r"D:\developer_tools\python\机器学习库\numpy\youtube_videos_data\gb_videos_data_numbers.csv"
#print(os.getcwd())

t1 = np.loadtxt(us_file_path, delimiter=",", dtype="int")
t2 = np.loadtxt(uk_file_path, delimiter=",", dtype="int")


取行
print(t1[2])
#[5845909  576597   39774  170708] 取了第三条数据


取连续的多行
print(t1[2:])   #从第三行（索引2）开始取到最后一行
'''
[[5845909  576597   39774  170708]
 [2642103   24975    4542   12829]
 [1168130   96666     568    6666]
 ...
 [ 142908    7088      68     437]
 [  24532    2148      77       0]
 [ 144039    1574      59       0]]

'''

取不连续的多行
print(t2[[2, 8, 10], :])

取行
print(t1[1, :])  #取第二行，列都要
print(t1[2:, :]) #取第三行及之后的所有行，列都要
print(t2[[1, 8, 10], :])

print("*" * 100)
取列,取第三列及之后的所有列
print(t2[:, 2:])
'''
[[13548   705]
 [ 1309     0]
 [  151  1141]
 ...
 [   88   336]
 [    8    59]
 [   19   173]]
'''
取第三行，第四列
#a = t2[2, 3]
#print(a)
#1141 是一个具体的值
#print(type(a))
#<class 'numpy.int32'>


#取多行和多列，取第三行到第五行，第二列到第四列的结果
#b = t2[2:5, 1:4]
#print(b)
'''
[[13119   151  1141]
 [65729  1529  3598]
 [ 5019    57   490]]
'''

取多个不相邻的点
c = t2[[0, 2], [0, 1]]  
这里的取值很特别，行的值放在一个列表里，列的值放在一个列表里，相应位置的值一一对应，
即取（0,0）和（2,1）位置的值：
取第1行，第1列的数值
取第3行，第2列的数值
print(c)
#[1231231   13119]


#取的是[0, 0] [2,1] [2,3] 这三个点
#d = t2[[0, 2, 2], [0, 1, 3]]
#[1231231   13119    1141]

2、数组的转置运算

transpose（）函数；

T；

swapaxes（1,0）——将行和列进行交换

import numpy as np

t1 = np.arange(24).reshape(4, 6)

print(t1.transpose())
'''
[[ 0  6 12 18]
 [ 1  7 13 19]
 [ 2  8 14 20]
 [ 3  9 15 21]
 [ 4 10 16 22]
 [ 5 11 17 23]]

'''

print(t1.T)

print(t1.swapaxes(1, 0))
#把行和列进行转置

3、numpy中数值的修改


import numpy as np
t = np.arange(24).reshape(4, 6)
print(t)
'''
[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]]
'''

t[:, 2:4] = 0 
将数组中第3-4列（索引2、3）的值改为0
print(t)
'''
[[ 0  1  0  0  4  5]
 [ 6  7  0  0 10 11]
 [12 13  0  0 16 17]
 [18 19  0  0 22 23]]
'''

print(t < 10)
布尔索引 
'''
[[ True  True  True  True  True  True]
 [ True  True  True  True False False]
 [False False  True  True False False]
 [False False  True  True False False]]
'''

t[t < 10] = 3
将小于10的数值都改为3
print(t)
'''
[[ 3  3  3  3  3  3]
 [ 3  3  3  3 10 11]
 [12 13  3  3 16 17]
 [18 19  3  3 22 23]]
'''

a = t[t > 20]
print(a)
#[22 23]  只取出了比20大的数值（原来21所在的位置此前已被改为0、又被改为3）， 但是在替换的时候是在这些数对应的位置进行替换

t[t > 20] = 20
print(t)
'''
[[ 3  3  3  3  3  3]
 [ 3  3  3  3 10 11]
 [12 13  3  3 16 17]
 [18 19  3  3 20 20]]
'''

t = np.where(t < 10, 0, 10)
#将小于10的数替换为0，其他数值替换成10
#类似于python中的三元表达式 a=3 if 3>2 else 4
print(t)
'''
[[ 0  0  0  0  0  0]
 [ 0  0  0  0 10 10]
 [10 10  0  0 10 10]
 [10 10  0  0 10 10]]
'''


t2 = np.arange(24).reshape(4, 6)
print(t2)

print("*" * 100)
t2 = t2.clip(3, 18)
将比三小的数都变为3，比18大的数都变为18
print(t2)

4、numpy中的nan值

*nan：not a number表示不是一个数字

当我们读取本地的文件为float时，如果有缺失，就会出现nan，当做了一个不合适的计算的时候也会出现nan（比如无穷大inf减去无穷大）

nan的数据类型是float

#nan和常用方法
t2 = np.arange(24).reshape(4, 6)
print(t2)


t2 = t2.astype(float)
print(t2)


t2[3, 3] = np.nan
print(t2)

t2[:, 0] = 0 
print(t2)
'''
[[ 0.  1.  2.  3.  4.  5.]
 [ 0.  7.  8.  9. 10. 11.]
 [ 0. 13. 14. 15. 16. 17.]
 [ 0. 19. 20. nan 22. 23.]]
'''
a = np.count_nonzero(t2)
print(a)
#20
返回不为0的数的个数

*特殊属性

（1）两个nan是不相等的；

（2）np.nan != np.nan;

（3）利用以上特性，判断数组中nan的个数；

#统计nan的个数
print(t2 != t2)
'''
[[False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False  True False False]]
'''

b = np.count_nonzero(t2 != t2)
print(b)
#1 统计的是nan的个数，即返回的True的个数

（4）通过np.isnan（t）来判断nan有几个；

c = np.isnan(t2)
print(c)
'''
[[False False False False False False]
 [False False False False False False]
 [False False False False False False]
 [False False False  True False False]]
'''

d = np.count_nonzero(np.isnan(t2))
print(d)
#1

（5）nan跟任何一个数进行计算都是nan

*数组中的数学运算

求和：t.sum(axis=None)

均值：t.mean(axis=None) 受离群点的影响较大

中值：np.median(t,axis=None)

最大值：t.max(axis=None)

最小值：t.min(axis=None)

极差：np.ptp(t,axis=None) 即最大值和最小值之差

标准差：t.std(axis=None)

t3 = np.arange(12).reshape(3, 4)
print(t3)
'''
[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]]
'''

a2 = np.sum(t3)
print(a2)
#66

b1 = np.sum(t3, axis=0) #以列为单位，将每行数值进行相加计算
print(b1)
#[12 15 18 21]

b2 = np.sum(t3, axis=1) #以行为单位，将每列数值进行相加计算
print(b2)
#[ 6 22 38]

t2 = t2[t2 == t2]
print(t2)
#[ 0.  1.  2.  3.  4.  5.  0.  7.  8.  9. 10. 11.  0. 13. 14. 15. 16. 17.  0. 19. 20. 22. 23.]
#即将不为nan的数作为一组数列返回

*在一组数据中单纯将nan替换为0并不合适，很多时候会把缺失值替换成均值（中值）或者直接删除有缺失值的一行。

import numpy as np
t1 = np.arange(12).reshape(3, 4)
t1 = t1.astype("float")
要赋值nan值必须将数据类型转换为float
t1[1, 2:] = np.nan
将第二行，第三列到第四列的数值转换为nan值
print(t1)
'''
[[ 0.  1.  2.  3.]
 [ 4.  5. nan nan]
 [ 8.  9. 10. 11.]]
'''

def fill_ndarray(t1):
    """Fill NaN entries of a 2-D array with the mean of their column, in place.

    Each column is processed independently: its NaN entries are overwritten
    with the mean of that column's non-NaN entries. Columns without NaN are
    left untouched. A column that consists only of NaN has no values to
    average, so it is left as NaN instead of taking the mean of an empty
    slice (which would emit a RuntimeWarning and still produce NaN).

    Args:
        t1: 2-D NumPy array (indexed as ``t1[:, i]``). It is modified in
            place.

    Returns:
        The same array object that was passed in, after filling.
    """
    for col in range(t1.shape[1]):
        # Basic slicing yields a view, so assignments below write into t1.
        column = t1[:, col]
        # NaN is the only value that compares unequal to itself.
        nan_mask = column != column
        if not nan_mask.any():
            continue
        valid = column[~nan_mask]
        if valid.size == 0:
            # Nothing to average in an all-NaN column; keep it as NaN.
            continue
        column[nan_mask] = valid.mean()

    return t1

if __name__ == "__main__":
    # Demo: build a 3x4 float array holding 0..11, blank out the last two
    # entries of the middle row, and print it before and after filling.
    t1 = np.arange(12, dtype="float").reshape(3, 4)
    t1[1, 2:] = np.nan
    print(t1)

    t1 = fill_ndarray(t1)
    print(t1)

'''
[[ 0.  1.  2.  3.]
 [ 4.  5. nan nan]
 [ 8.  9. 10. 11.]]
 
[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]]
'''

*数组的拼接和交换

#数组的拼接
import numpy as np
t1 = np.arange(12).reshape(2, 6)
print(t1)
'''
[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]]
'''

t2 = np.arange(12, 24).reshape(2, 6)
print(t2)
'''
[[12 13 14 15 16 17]
 [18 19 20 21 22 23]]
'''

#竖直拼接
t3 = np.vstack((t1, t2))   #这里必须是以元组的形式传入数据
print(t3)
'''
[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]]

'''

#水平拼接
t4 = np.hstack((t1, t2))
print(t4)
'''
[[ 0  1  2  3  4  5 12 13 14 15 16 17]
 [ 6  7  8  9 10 11 18 19 20 21 22 23]]

'''

#在进行数据拼接时，要将两个数组中含义相同的列拼接在一起，有时候需要先进行列的交换
#数据的交换
t1[[0, 1], :] = t1[[1, 0], :]
#把t1的前两行进行交换
print(t1)
'''
[[ 6  7  8  9 10 11]
 [ 0  1  2  3  4  5]]

'''

t1[:, [1, 2]] = t1[:, [2, 1]]
#将t1列表中第2/3列进行交换
print(t1)
'''
[[ 6  8  7  9 10 11]
 [ 0  2  1  3  4  5]]
'''

*numpy中的其他方法

创建全部为1的矩阵

a = np.ones((2, 3))  #创建两行三列的全为1的矩阵
print(a)
'''
[[1. 1. 1.]
 [1. 1. 1.]]
'''

创建全部为0的矩阵

b = np.zeros((3, 4))
print(b)
'''
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
'''

创建对角线矩阵

#创建一个对角线为1的正方形矩阵
c = np.eye(3)
print(c)
'''
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
'''

返回最大值或最小值的位置

d = np.argmax(c, axis=0)  #沿0轴（按列）查找最大值，并返回每一列最大值所在的行索引
print(d)
#[0 1 2]

c[c == 1] = -1
print(c)
'''
[[-1.  0.  0.]
 [ 0. -1.  0.]
 [ 0.  0. -1.]]
'''

e = np.argmin(c, axis=1)
print(e)
#[0 1 2]

*生成随机数矩阵

f = np.random.randint(10, 20, (4, 5))
#每次生成的随机数组都不相同
#生成4行5列的数组，每个数的范围是[10, 20)，即包含10、不包含20
print(f)
'''
[[14 15 11 10 12]
 [18 13 19 12 13]
 [16 11 13 18 12]
 [19 16 12 12 17]]

'''


#设置一个随机数种子，传入给定的种子值（这里是10）后，每次运行都会生成相同的随机数
np.random.seed(10)
t = np.random.randint(0, 20, (4, 5))
print(t)
'''
[[ 9  4 15  0 17]
 [16 17  8  9  0]
 [10  8  4 19 16]
 [ 4 15 11 11  1]]
'''

六、numpy+matplotlib进行绘图数据分析——youtube视频案例分析

csv：逗号分割值文件

1、取美国视频数据的最后一列——评论数,绘制美国视频评论数的直方图

在第一次绘制时，前几组数据较多，后面的偏少，几乎为零，大部分的数值集中在5000条评论

import numpy as np
from matplotlib import pyplot as plt

us_file_path = r"D:\developer_tools\python\机器学习库\numpy\youtube_videos_data\us_videos_data_numbers.CSV"
uk_file_path = r"D:\developer_tools\python\机器学习库\numpy\youtube_videos_data\gb_videos_data_numbers.csv"
#print(os.getcwd())

t1 = np.loadtxt(us_file_path, delimiter=",", dtype="int")
t2 = np.loadtxt(uk_file_path, delimiter=",", dtype="int")

# Take the last column of the US data (comment counts) for the histogram.
t_us_comment = t1[:, -1]
# The raw distribution is too uneven to read, so drop videos with more than
# 5000 comments. This filter must come after the column is selected:
# applying it first reads t_us_comment before it exists (NameError).
t_us_comment = t_us_comment[t_us_comment <= 5000]
print(t_us_comment.max(), t_us_comment.min())
#4996 0

d = 250
num_bins = (t_us_comment.max() - t_us_comment.min()) // d

plt.figure(figsize=(20, 8), dpi=80)
plt.hist(t_us_comment, num_bins)


plt.show()

2、了解英国视频中喜欢和评论数之间的关系

在经过第一个绘制图像之后，我们发现大部分的点分布在喜欢数量为<=500000的部分

import numpy as np
import os
from matplotlib import pyplot as plt


us_file_path = r"D:\developer_tools\python\机器学习库\numpy\youtube_videos_data\us_videos_data_numbers.CSV"
uk_file_path = r"D:\developer_tools\python\机器学习库\numpy\youtube_videos_data\gb_videos_data_numbers.csv"
#print(os.getcwd())

t1 = np.loadtxt(us_file_path, delimiter=",", dtype="int")
t2 = np.loadtxt(uk_file_path, delimiter=",", dtype="int")

#希望了解英国视频中喜欢和评论数之间的关系
t_uk = t2[t2[:, 1] <= 500000]
t_uk_comment = t_uk[:, -1]
t_uk_like = t_uk[:, 1]

plt.figure(figsize=(20, 8), dpi=80)
plt.scatter(t_uk_like, t_uk_comment)

plt.show()

其实还可以再取评论数小于等于200000部分的散点图，可以发现喜欢和评论之间的更具体的关系

3、将美国和英国的视频数据拼接在一起，保留国家信息，将美国设置为0，英国设置为1，求评论数的均值

import numpy as np

us_file_path = r"D:\developer_tools\python\机器学习库\numpy\youtube_videos_data\us_videos_data_numbers.CSV"
uk_file_path = r"D:\developer_tools\python\机器学习库\numpy\youtube_videos_data\gb_videos_data_numbers.csv"

#加载国家数据
us_data = np.loadtxt(us_file_path, delimiter=",", dtype=int)
uk_data = np.loadtxt(uk_file_path, delimiter=",", dtype=int)


#添加国家信息
#构造全为零的数据
zero_data = np.zeros((us_data.shape[0], 1)).astype(int)
            #构造全为零的列表，形状是跟us数据中的行数一样，一列数组，值全部为0

ones_data = np.ones((uk_data.shape[0], 1)).astype(int)
#构造全部为1的列表，行数跟uk数据中的行一样，也是构建一列

#分别添加国家信息，美国为0，英国为1
us_data = np.hstack((us_data, zero_data))
uk_data = np.hstack((uk_data, ones_data))


#然后再进行竖直拼接
final_data = np.vstack((us_data, uk_data))
print(final_data)

print(final_data[:, -2].mean())
#4563.075946390783
#计算所有评论数的平均值

斯外戈的小白

关注

1
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫