数据分析第五讲 numpy

最新推荐文章于 2021-06-17 20:40:04 发布

加油小羽哥

最新推荐文章于 2021-06-17 20:40:04 发布

阅读量3.5w

点赞数 3

本文链接：https://blog.csdn.net/yangyusir/article/details/115049254

版权

数据分析专栏收录该内容

7 篇文章 5 订阅

订阅专栏

文章目录

数据分析第五讲 numpy + pandas

数据分析第五讲 numpy + pandas

在这里插入图片描述

一、numpy中的数组操作

1、numpy中数值的修改

t = np.arange(20).reshape(4,5)
t[:,0:2] = 0

# Modifying values in a numpy array
import numpy as np

t = np.arange(20)
print(t)
'[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19]'
t1 = t.reshape(4, 5)
print(t1)
'''
[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]]'''
t1[:, 2] = 0  # set the 3rd column (index 2) of every row to 0
print(t1)
'''
[[ 0  1  0  3  4]
 [ 5  6  0  8  9]
 [10 11  0 13 14]
 [15 16  0 18 19]]'''
t1[:, 0:2] = 0  # set the 1st and 2nd columns (slice 0:2) to 0
print(t1)
'''
[[ 0  0  0  3  4]
 [ 0  0  0  8  9]
 [ 0  0  0 13 14]
 [ 0  0  0 18 19]]'''

2、numpy中的布尔索引

# Boolean indexing in numpy
import numpy as np

t2 = np.arange(24).reshape(4, 6)
# Comparing an array with a scalar yields a boolean mask of the same shape.
below_ten = t2 < 10
print(below_ten)
'''
[[ True  True  True  True  True  True]
 [ True  True  True  True False False]
 [False False False False False False]
 [False False False False False False]]
'''
# Indexing with the mask keeps only the elements where the mask is True.
print(t2[below_ten])
'''[0 1 2 3 4 5 6 7 8 9]'''
# A mask on the left-hand side assigns only where the mask is True.
above_twenty = t2 > 20
t2[above_twenty] = 0
print(t2)
'''
[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20  0  0  0]]'''

3、numpy中的三元运算符

如何把t中小于10的数字替换成0，其余（大于等于10）的数字替换成20
np.where(t<10,0,20)

# The "ternary operator" of numpy: np.where
import numpy as np

t = np.arange(24).reshape(4, 6)
# Elements below 10 become 0; every other element (>= 10) becomes 20.
is_below_ten = t < 10
t1 = np.where(is_below_ten, 0, 20)
print(t1)
'''
[[ 0  0  0  0  0  0]
 [ 0  0  0  0 20 20]
 [20 20 20 20 20 20]
 [20 20 20 20 20 20]]'''

4、numpy中的clip

将小于10的替换成10，将大于18的替换成18
t.clip(10,18)

# Clipping values in numpy
import numpy as np

t = np.arange(24).reshape(4, 6)
# Values below the lower bound are raised to it and values above the
# upper bound are lowered to it; everything in between is kept.
lower, upper = 10, 18
t2 = np.clip(t, lower, upper)
print(t2)
'''
[[10 10 10 10 10 10]
 [10 10 10 10 10 11]
 [12 13 14 15 16 17]
 [18 18 18 18 18 18]]'''

二、numpy中的nan的注意点

1.numpy中的nan和inf

nan(NAN,Nan):not a number表示不是一个数字
inf(-inf,inf):infinity,inf表示正无穷，-inf表示负无穷

# Things to note about nan in numpy
import numpy as np

t = np.arange(24).reshape(4, 6)
# Writing np.nan straight into the integer array (t[3, 3] = np.nan) fails with
# "ValueError: cannot convert float NaN to integer", so the array is
# converted to a float dtype first.
t = t.astype(float)
row, col = 3, 3
t[row, col] = np.nan
print(t)
'''
[[ 0.  1.  2.  3.  4.  5.]
 [ 6.  7.  8.  9. 10. 11.]
 [12. 13. 14. 15. 16. 17.]
 [18. 19. 20. nan 22. 23.]]'''
# A whole column can be set the same way through a slice, e.g. t[:, 3] = np.nan.

2.numpy中的nan的注意点

1.两个nan是不相等的
2.判断数组中nan的个数
3.nan和任何值计算都为nan
在这里插入图片描述

# Things to note about nan in numpy
import numpy as np

t = np.arange(6).reshape(2, 3)
print(t)
'''
[[0 1 2]
 [3 4 5]]'''
print(np.count_nonzero(t))  # 5
t = t.astype('float')
print(t)
'''
[[0. 1. 2.]
 [3. 4. 5.]]'''
t[1, 2] = np.nan
print(t)
'''
[[ 0.  1.  2.]
 [ 3.  4. nan]]'''
# nan is not equal to itself, so t != t is True exactly where t holds nan.
print(t != t)
'''
[[False False False]
 [False False  True]]'''    # False == 0    True == 1
print(np.count_nonzero(t != t))  # 1, i.e. the number of nan values
# np.isnan builds the same mask explicitly.
print(np.isnan(t))
'''
[[False False False]
 [False False  True]]'''
print(np.count_nonzero(np.isnan(t)))  # 1, i.e. the number of nan values
print(t + 1)  # any arithmetic involving nan yields nan
'''
[[ 1.  2.  3.]
 [ 4.  5. nan]]'''
t2 = np.arange(6).reshape(2, 3)
print(t2)
'''
[[0 1 2]
 [3 4 5]]'''
print((t + 1) * t2)  # any arithmetic involving nan yields nan
'''
[[ 0.  2.  6.]
 [12. 20. nan]]'''
print(t.sum())  # nan

三、numpy常用的方法

numpy常用的方法

1.求和：t.sum(axis=None)

2.均值：t.mean(axis=None)

3.中值：np.median(t,axis=None)

4.最大值：t.max(axis=None)

5.最小值：t.min(axis=None)

6.极差：np.ptp(t,axis=None)

7.标准差：t.std(axis=None)

# Commonly used numpy statistics
import numpy as np

# axis=0 reduces down the rows (one result per column);
# axis=1 reduces across the columns (one result per row).
PER_COLUMN, PER_ROW = 0, 1

# Sum: t.sum(axis=None)
t = np.arange(6).reshape(2, 3)
print(t)
'''
[[0 1 2]
 [3 4 5]]'''
print(t.sum())  # 15
print(t.sum(axis=PER_COLUMN))  # column sums
'''[3 5 7]'''
print(t.sum(axis=PER_ROW))  # row sums
'''[ 3 12]'''
# Mean: t.mean(axis=None)
print(t.mean(axis=PER_COLUMN))  # mean of each column
'''[1.5 2.5 3.5]'''
print(t.mean(axis=PER_ROW))  # mean of each row
'''[1. 4.]'''
# Median: np.median(t, axis=None)
print(np.median(t))  # median of all values: 2.5, the average of the two middle values 2 and 3
print(np.median(t, axis=PER_COLUMN))  # per-column medians [1.5 2.5 3.5]: (0+3)/2, (1+4)/2, (2+5)/2
print(np.median(t, axis=PER_ROW))  # per-row medians [1. 4.]
# Maximum: t.max(axis=None)
print(t.max())  # 5, the largest value in t
print(t.max(axis=PER_COLUMN))  # [3 4 5], the largest value of each column
print(t.max(axis=PER_ROW))  # [2 5], the largest value of each row
# Minimum: t.min(axis=None)
print(t.min())  # 0, the smallest value in t
print(t.min(axis=PER_COLUMN))  # [0 1 2], the smallest value of each column
print(t.min(axis=PER_ROW))  # [0 3], the smallest value of each row
# Range (peak to peak), maximum minus minimum: np.ptp(t)
print(np.ptp(t))  # 5, i.e. 5 - 0
print(np.ptp(t, axis=PER_COLUMN))  # [3 3 3]
print(np.ptp(t, axis=PER_ROW))  # [2 2]
# Standard deviation: t.std(axis=None), a measure of how spread out the data is
print(t.std())  # 1.707825127659933
print(t.std(axis=PER_COLUMN))  # [1.5 1.5 1.5]
print(t.std(axis=PER_ROW))  # [0.81649658 0.81649658]

四、numpy中填充nan

# Filling nan values in numpy
import numpy as np

# Build a 3x4 float array (nan cannot be stored in an integer array).
t = np.arange(12, dtype=float).reshape(3, 4)
print(t)
'''
[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]]'''
# Blank out the last two values of the second row.
t[1, 2:] = np.nan
print(t)
'''
[[ 0.  1.  2.  3.]
 [ 4.  5. nan nan]
 [ 8.  9. 10. 11.]]'''
# Goal: replace each nan with the mean of the non-nan values in its column.

def fill_ndarray(arr=None):
    """Replace every NaN with the mean of the non-NaN values in its column.

    The array is modified in place and is also returned.

    Args:
        arr: 2-D float array to repair. When omitted, the module-level
            array ``t`` is used, which is how the function was originally
            called.

    Returns:
        The same array object with its NaNs filled. A column made up
        entirely of NaN has nothing to average and is left unchanged.
    """
    if arr is None:
        arr = t  # backward compatible: fall back to the global array
    for col_idx in range(arr.shape[1]):
        # Basic slicing returns a view, so writing to it updates arr itself.
        column = arr[:, col_idx]
        nan_mask = np.isnan(column)
        # Skip clean columns, and all-NaN columns too: the mean of an empty
        # selection is NaN and only produces a RuntimeWarning.
        if nan_mask.any() and not nan_mask.all():
            column[nan_mask] = column[~nan_mask].mean()
    return arr


if __name__ == '__main__':
    t1 = fill_ndarray()
    print(t1)
'''
[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]]'''

五、总结与练习

1.如何选择一行或者多行的数据(列)

2.如何给选取的行或者列赋值

3.如何把大于10的值替换为10

4.np.where如何使用？

5.np.clip如何使用?

6.如何转置(交换轴)?

7.读取数据为csv

8.np.nan和np.inf是什么？

9.常用的统计函数

'''
1.如何选择一行或者多行的数据(列)
索引和切片
t[2]         # 第3行
t[[1,2]]     # 连续多行  第2行和第3行
t[行，列]
2.如何给选取的行或者列赋值
t[2] = 0
3.如何把大于10的值替换为10
t[t>10] = 10
4.np.where如何使用？
np.where(t>8,20,0)    大于8的替换为20  其他替换为0
5.np.clip如何使用?
t.clip(10,20)       把<10的替换为10，>20的替换为20
6.如何转置(交换轴)?
t.T
t.transpose()
t.swapaxes(1,0)
7.读取数据为csv
np.loadtxt()  fname  分隔符","  dtype=int
8.np.nan和np.inf是什么？
nan   not a number   不是一个数字
inf 表示无穷  inf  -inf
9.常用的统计函数
求和：t.sum(axis=None)
均值：t.mean(axis=None)
中值：np.median(t,axis=None)
最大值：t.max(axis=None)
最小值：t.min(axis=None)
极差：np.ptp(t,axis=None)
标准差：t.std(axis=None)
'''
import numpy as np

t = np.arange(24).reshape(4, 6)
print(t)
'''
[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]]'''
print(t[2])  # the 3rd row: [12 13 14 15 16 17]
print(t[[1, 2]])  # several rows at once: the 2nd and 3rd rows
'''
[[ 6  7  8  9 10 11]
 [12 13 14 15 16 17]]'''
print(t[3, 5])  # 23, the element in the 4th row and 6th column
t[2] = 0  # assign to a whole row
print(t)
'''
[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [ 0  0  0  0  0  0]
 [18 19 20 21 22 23]]
'''
t[t > 10] = 10  # cap every value above 10 at 10
print(t)
'''
[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 10]
 [ 0  0  0  0  0  0]
 [10 10 10 10 10 10]]
'''
t = np.where(t > 8, 20, 0)  # values above 8 become 20, all others become 0
print(t)
'''
[[ 0  0  0  0  0  0]
 [ 0  0  0 20 20 20]
 [ 0  0  0  0  0  0]
 [20 20 20 20 20 20]]'''
print("=" * 30)
t = np.arange(24).reshape(4, 6)
print(t)
'''
[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]]'''
t = t.clip(10, 20)  # values below 10 become 10, values above 20 become 20
print(t)
'''
[[10 10 10 10 10 10]
 [10 10 10 10 10 11]
 [12 13 14 15 16 17]
 [18 19 20 20 20 20]]'''
# Three equivalent ways to transpose a 2-D array.
print(t.T)
'''
[[10 10 12 18]
 [10 10 13 19]
 [10 10 14 20]
 [10 10 15 20]
 [10 10 16 20]
 [10 11 17 20]]'''
# transpose is a method and must be called; printing t.transpose without
# parentheses shows the bound method object instead of the array.
print(t.transpose())
'''
[[10 10 12 18]
 [10 10 13 19]
 [10 10 14 20]
 [10 10 15 20]
 [10 10 16 20]
 [10 11 17 20]]'''
print(t.swapaxes(1, 0))
'''
[[10 10 12 18]
 [10 10 13 19]
 [10 10 14 20]
 [10 10 15 20]
 [10 10 16 20]
 [10 11 17 20]]'''
# np.loadtxt()
'''
def loadtxt(fname, dtype=float, comments='#', delimiter=None,
            converters=None, skiprows=0, usecols=None, unpack=False,
            ndmin=0, encoding='bytes', max_rows=None):'''

练习
1.英国和美国各自youtube的数据结合之前的matplotlib绘制出各自的评论数量的直方图

# 1. Plot a histogram of the comment counts in the YouTube data with matplotlib
import numpy as np
from matplotlib import pyplot as plt

# Column order noted for the CSV: clicks, likes, dislikes, comments

# Load the data
t_us = np.loadtxt("US_video_data_numbers.csv", delimiter=',', dtype=int)
# The comment count is the last column (t_us[:, 3] works as well)
t_us_comment = t_us[:, -1]
# Keep only the entries with fewer than 5000 comments
under_cap = t_us_comment < 5000
t_us_comment = t_us_comment[under_cap]

# Figure size
plt.figure(figsize=(15, 8), dpi=80)

d = 250  # bin width
# number of bins = range of the data // bin width
bins_num_us = (t_us_comment.max() - t_us_comment.min()) // d

# Draw the histogram of the comment counts
plt.hist(t_us_comment, bins=bins_num_us)
plt.grid()
plt.show()

在这里插入图片描述
绘制英国数据时，只需把 t_us = np.loadtxt("US_video_data_numbers.csv", delimiter=',', dtype=int) 改为 t_us = np.loadtxt("GB_video_data_numbers.csv", delimiter=',', dtype=int)

2.希望了解英国的youtube中视频的评论数和喜欢数的关系，应该如何绘制该图

# 2. Show how comment counts relate to like counts in the GB YouTube data
import numpy as np
from matplotlib import pyplot as plt

# Column order noted for the CSV: clicks, likes, dislikes, comments

# Load the data
t_gb = np.loadtxt("GB_video_data_numbers.csv", delimiter=',', dtype=int)
# Keep only the rows whose like count (column 1) is below 300000
likes_under_cap = t_gb[:, 1] < 300000
t_gb = t_gb[likes_under_cap]
print(t_gb)
# Pull out the like counts and the comment counts
# (comments are the last column; t_gb[:, 3] works as well)
t_gb_likes = t_gb[:, 1]
t_gb_comment = t_gb[:, -1]

# Figure size
plt.figure(figsize=(15, 8), dpi=80)

# Scatter plot: comments on the x axis, likes on the y axis
plt.scatter(x=t_gb_comment, y=t_gb_likes)

plt.show()

在这里插入图片描述

六、数组的拼接

1.数组的拼接

我们希望把之前案例中两个国家的数据放在一起来研究分析，那么应该怎么做？

# Put the data of both countries into one table so they can be analysed together
import csv

import numpy as np

t_us = np.loadtxt("US_video_data_numbers.csv", delimiter=',', dtype=int)
t_gb = np.loadtxt("GB_video_data_numbers.csv", delimiter=',', dtype=int)

# Prepend a country column so every row stays identifiable after merging:
# 0 marks a US row, 1 marks a GB row.
zeros_data = np.zeros((t_us.shape[0], 1), dtype=int)
ones_data = np.ones((t_gb.shape[0], 1), dtype=int)
us_final_data = np.hstack((zeros_data, t_us))
gb_final_data = np.hstack((ones_data, t_gb))

# Both tables now have the same columns, so stack them vertically.
# vstack only requires matching column counts, so it works for any number
# of rows per country and needs no padding rows of made-up values (the
# earlier version padded GB with a hard-coded 88 rows of ones to make a
# horizontal stack possible).
final_data = np.vstack((us_final_data, gb_final_data))
print(final_data)
with open('us_gb_youtube_data.csv', 'w', newline='') as data_csv:
    csv_writer = csv.writer(data_csv, dialect='excel')
    csv_writer.writerows(final_data)

在这里插入图片描述
np.vstack() 垂直拼接
np.hstack() 水平拼接

# Concatenating arrays
import numpy as np

t1 = np.arange(12).reshape(2, 6)
t2 = np.arange(12, 24).reshape(2, 6)
print(t1)
'''
[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]]'''
print(t2)
'''
[[12 13 14 15 16 17]
 [18 19 20 21 22 23]]'''
# Vertical concatenation: the rows of t2 go below the rows of t1
stacked_rows = np.vstack([t1, t2])
print(stacked_rows)
'''
[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]]'''

# Horizontal concatenation: the columns of t2 go to the right of t1
stacked_cols = np.hstack([t1, t2])
print(stacked_cols)
'''
[[ 0  1  2  3  4  5 12 13 14 15 16 17]
 [ 6  7  8  9 10 11 18 19 20 21 22 23]]'''

# Concatenating arrays with different row counts
import numpy as np

t1 = np.arange(18).reshape(3, 6)
t2 = np.arange(12, 24).reshape(2, 6)
print(t1)
'''
[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]]'''
print(t2)
'''
[[12 13 14 15 16 17]
 [18 19 20 21 22 23]]'''
# Vertical concatenation only requires the column counts to match
stacked_rows = np.vstack([t1, t2])
print(stacked_rows)
'''
[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]
 [12 13 14 15 16 17]
 [18 19 20 21 22 23]]'''

# Concatenating arrays with different column counts
import numpy as np

t1 = np.arange(18).reshape(3, 6)
t2 = np.arange(12, 24).reshape(3, 4)
print(t1)
'''
[[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]
 [12 13 14 15 16 17]]'''
print(t2)
'''
[[12 13 14 15]
 [16 17 18 19]
 [20 21 22 23]]'''

# Horizontal concatenation only requires the row counts to match
stacked_cols = np.hstack([t1, t2])
print(stacked_cols)
'''
[[ 0  1  2  3  4  5 12 13 14 15]
 [ 6  7  8  9 10 11 16 17 18 19]
 [12 13 14 15 16 17 20 21 22 23]]'''

2.数组的行列交换

数组水平或者竖直拼接很简单，但是拼接之前应该注意什么？

# Swapping two rows of an array
import numpy as np

separator = "=" * 30

t = np.arange(12, 24).reshape(3, 4)
print(t)
'''
[[12 13 14 15]
 [16 17 18 19]
 [20 21 22 23]]'''
print(separator)
first_then_third = [0, 2]
print(t[first_then_third])  # pick the 1st and the 3rd row
'''
[[12 13 14 15]
 [20 21 22 23]]'''
print(separator)
third_then_first = [2, 0]
print(t[third_then_first])  # pick the 3rd and the 1st row, in that order
'''
[[20 21 22 23]
 [12 13 14 15]]'''
print(separator)
# Swap the 1st and 3rd rows: the right-hand side is evaluated into a new
# array first, then written back in the opposite order.
t[first_then_third] = t[third_then_first]
print(t)
'''
[[20 21 22 23]
 [16 17 18 19]
 [12 13 14 15]]'''

# Swapping two columns of an array
import numpy as np

separator = "=" * 30

t = np.arange(12, 24).reshape(3, 4)
print(t)
'''
[[12 13 14 15]
 [16 17 18 19]
 [20 21 22 23]]'''
print(separator)
first_then_third = [0, 2]
print(t[:, first_then_third])  # pick the 1st and the 3rd column
'''
[[12 14]
 [16 18]
 [20 22]]'''
print(separator)
third_then_first = [2, 0]
print(t[:, third_then_first])  # pick the 3rd and the 1st column, in that order
'''
[[14 12]
 [18 16]
 [22 20]]'''
print(separator)
# Swap the 1st and 3rd columns: the right-hand side is evaluated into a new
# array first, then written back in the opposite order.
t[:, first_then_third] = t[:, third_then_first]
print(t)
'''
[[14 13 12 15]
 [18 17 16 19]
 [22 21 20 23]]'''

3.numpy常用方法补充

获取最大值最小值的位置
np.argmax(t,axis=0)
np.argmin(t,axis=1)
创建一个对角线为1的正方形数组(方阵)：np.eye(3)

# Locating the largest / smallest values
# np.argmax(t, axis=0)
# np.argmin(t, axis=1)
import numpy as np
import random


# 20 random integers from 0 to 100 (both ends included), shown as 4 rows x 5 columns
a = np.array([random.randint(0, 100) for _ in range(20)])
t = a.reshape(4, 5)
print(t)
'''
[[  1  64  67  15  47]
 [ 18  74  57   1  54]
 [ 66  97  49  96  29]
 [ 54  66  89  13 100]]'''
# The sample results below belong to the sample array shown above;
# the data is random, so a fresh run prints different numbers.
print(np.argmax(t))  # position of the largest value of t, counted over the flattened array
'''19'''
print(np.argmax(t, axis=0))  # position of the largest value in each column
"""[2 2 3 2 3]"""
print(np.argmin(t, axis=0))  # position of the smallest value in each column
"""[0 0 2 1 2]"""
print(np.argmax(t, axis=1))  # position of the largest value in each row
"""[2 1 1 4]"""
print(np.argmin(t, axis=1))  # position of the smallest value in each row
"""[0 3 4 3]"""

# Build a square array with ones on the diagonal (an identity matrix): np.eye(n)
import numpy as np

eye_3 = np.eye(3)
print(eye_3)
'''
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]'''
eye_1 = np.eye(1)
print(eye_1)
'''
[[1.]]'''
# Size 0 is allowed and gives an empty array
eye_0 = np.eye(0)
print(eye_0)
'''
[]'''
eye_5 = np.eye(5)
print(eye_5)
'''
[[1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0.]
 [0. 0. 1. 0. 0.]
 [0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 1.]]'''

4.numpy的copy

1.a=b 完全不复制，a和b相互影响
2.a = b.copy(),复制，a和b互不影响

# Copying numpy arrays
# 1. b = a         : nothing is copied, a and b affect each other
# 2. b = a.copy()  : the data is copied, a and b do not affect each other
import numpy as np

row, col = 1, 1

# Plain assignment: b is only a second name for the array a refers to
a = np.arange(20).reshape(4, 5)
print(a)
'''
[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]]'''
b = a
b[row, col] = 0
print(b)
'''
[[ 0  1  2  3  4]
 [ 5  0  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]]'''
# The change made through b is visible through a as well
print(a)
'''
[[ 0  1  2  3  4]
 [ 5  0  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]]'''

# copy(): a2 gets its own data
a1 = np.arange(20).reshape(4, 5)
a2 = a1.copy()
a2[row, col] = 0
print(a2)
'''
[[ 0  1  2  3  4]
 [ 5  0  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]]'''
# a1 keeps its original value at that position
print(a1)
'''
[[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]]'''

加油小羽哥

关注

3
点赞
踩
4

收藏

觉得还不错? 一键收藏
0
评论
数据分析第五讲 numpy

数据分析第五讲 numpy + pandas一、numpy中的数组操作1、numpy中数值的修改t = np.arange(20).reshape(4,5)t[:,0:2] = 0# numpy中数值的修改import numpy as npt = np.arange(20)print(t)'[ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19]'t1 = t.reshape(4, 5)print(t1)'''
复制链接

扫一扫