2021-08-30 机器学习数据处理之numpy基础

import numpy as np

# Build a 1-D ndarray from a Python list and inspect its type.
# The array is bound to `t11`, so the type check must reference `t11`;
# `t1` is not defined until the CSV-loading cell further down.
t11 = np.array([1,2,3,4])
type(t11)
numpy.ndarray
# Two equivalent ways to build the integers 0..9 as a 1-D ndarray.
# The arrays are bound to `t22` / `t33`, so the prints must reference those
# names; `t2` / `t3` are either undefined here or hold unrelated data.
# The printed labels are left unchanged so they match the recorded output.
t22 = np.array(range(10))
print("t2=np.array(range(10)) = ",t22)
print("type(t2)=",type(t22))

t33 = np.arange(10)
print("t3=np.arange(10) = ",t33)
print("type(t3)=",type(t33))
t2=np.array(range(10)) =  [0 1 2 3 4 5 6 7 8 9]
type(t2)= <class 'numpy.ndarray'>
t3=np.arange(10) =  [0 1 2 3 4 5 6 7 8 9]
type(t3)= <class 'numpy.ndarray'>
t22.reshape(2,5)
array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])
# Load the same comma-separated file twice to compare the effect of `unpack`.
us_file_path = "./youtube_video_data/US_video_data_numbers.csv"
uk_file_path = "./youtube_video_data/GB_video_data_numbers.csv"

t1 = np.loadtxt(us_file_path,delimiter=",",dtype="int",unpack=True)
# unpack=True returns the transposed array: each CSV column becomes one row of t1.
# Without it (t2 below) each CSV line stays one row.
t2 = np.loadtxt(us_file_path,delimiter=",",dtype="int")

print('t1:',t1,'\n')
print('t2',t2,'\n')

# Visual separator in the printed output.
print("*"*20)
t1: [[4394029 7860119 5845909 ...  142463 2162240  515000]
 [ 320053  185853  576597 ...    4231   41032   34727]
 [   5931   26679   39774 ...     148    1384     195]
 [  46245       0  170708 ...     279    4737    4722]] 

t2 [[4394029  320053    5931   46245]
 [7860119  185853   26679       0]
 [5845909  576597   39774  170708]
 ...
 [ 142463    4231     148     279]
 [2162240   41032    1384    4737]
 [ 515000   34727     195    4722]] 

********************
t22+1
array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])
t22**2
array([ 0,  1,  4,  9, 16, 25, 36, 49, 64, 81])
t=np.arange(24).reshape(4,6)
t
array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23]])
t<12
array([[ True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True],
       [False, False, False, False, False, False],
       [False, False, False, False, False, False]])
t[t<12]=1
t
array([[ 1,  1,  1,  1,  1,  1],
       [ 1,  1,  1,  1,  1,  1],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23]])
np.where(t<12,0,12)
array([[ 0,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0],
       [12, 12, 12, 12, 12, 12],
       [12, 12, 12, 12, 12, 12]])
t
array([[ 1,  1,  1,  1,  1,  1],
       [ 1,  1,  1,  1,  1,  1],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23]])
#.clip(a,b):裁剪功能,将矩阵中小于a的全部赋值成a,大于b的全部赋值为b,一般a≤b;numpy不会检查a≤b,若a>b,则结果中所有元素都等于b
t.clip(12,19)
array([[12, 12, 12, 12, 12, 12],
       [12, 12, 12, 12, 12, 12],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 19, 19, 19, 19]])
t[3,3:6]=9999
t
array([[   1,    1,    1,    1,    1,    1],
       [   1,    1,    1,    1,    1,    1],
       [  12,   13,   14,   15,   16,   17],
       [  18,   19,   20, 9999, 9999, 9999]])
t
array([[   1,    1,    1,    1,    1,    1],
       [   1,    1,    1,    1,    1,    1],
       [  12,   13,   14,   15,   16,   17],
       [  18,   19,   20, 9999, 9999, 9999]])
t.sum()
30153
t.sum(axis=0)
array([   32,    34,    36, 10016, 10017, 10018])
t.sum(0)
array([   32,    34,    36, 10016, 10017, 10018])
t.sum(axis=1)
array([    6,     6,    87, 30054])
t.sum(1)
array([    6,     6,    87, 30054])
  • 中值(median)与平均值(mean)没有必然的关系。
  • 中值是将所给的一组数从小到大或从大到小排列,奇数个数的话取中间的数字,偶数个数的话取中间两个数的平均数;而平均值就是把这组数相加,然后除以这组数的个数。
  • 中值的优点是不受偏大或偏小数据的影响,很多情况下用它代表全体数据的一般水平更合适。如果数列中存在极端变量值,用中位数做代表值就比平均数更好。
#均值
t.mean()
1256.375
t.mean(0)#小括号内简写了: axis=0
array([   8.  ,    8.5 ,    9.  , 2504.  , 2504.25, 2504.5 ])
t.mean(1)
array([1.000e+00, 1.000e+00, 1.450e+01, 5.009e+03])
#中值
np.median(t)
6.5
np.median(t,axis=0)
array([6.5, 7. , 7.5, 8. , 8.5, 9. ])
np.median(t,axis=1)
array([1.0000e+00, 1.0000e+00, 1.4500e+01, 5.0095e+03])
t
array([[   1,    1,    1,    1,    1,    1],
       [   1,    1,    1,    1,    1,    1],
       [  12,   13,   14,   15,   16,   17],
       [  18,   19,   20, 9999, 9999, 9999]])
#最大值
t.max()
9999
t.max(axis=0)
array([  18,   19,   20, 9999, 9999, 9999])
t.min()
1
t.min(axis=0)
array([1, 1, 1, 1, 1, 1])
#极差(ptp,peak to peak):最大值和最小值的差值
np.ptp(t)
9998
np.ptp(t,axis=0)
array([  17,   18,   19, 9998, 9998, 9998])
#标准差
t.std()
3304.409324580567
t.std(axis=0)
array([   7.31436942,    7.79422863,    8.27647268, 4327.24404211,
       4327.10026317, 4326.95652278])
t.std(axis=1)
array([0.00000000e+00, 0.00000000e+00, 1.70782513e+00, 4.99000003e+03])

import numpy as np
from matplotlib import  pyplot as plt

# Scatter plot of the GB video data: column 1 against the last column.
# us_file_path = "./youtube_video_data/US_video_data_numbers.csv"
uk_file_path = "./youtube_video_data/GB_video_data_numbers.csv"

# t1 = np.loadtxt(us_file_path,delimiter=",",dtype="int",unpack=True)
# Load the comma-separated file as a 2-D integer array (one row per CSV line).
t_uk = np.loadtxt(uk_file_path,delimiter=",",dtype="int")

print('最初的t_uk:',t_uk)
print('t_uk[:,1]<=500000:',t_uk[:,1]<=500000)
print('t_uk[t_uk[:,1]<=500000]:',t_uk[t_uk[:,1]<=500000])

# Keep only the rows whose value in column 1 (the like count, per the
# variable names below) is at most 500,000, using a boolean row mask.
t_uk = t_uk[t_uk[:,1]<=500000]
print('t_uk:',t_uk)

# Last column is used as the comment count, column 1 as the like count
# (per the variable names; the CSV has no header to confirm this).
t_uk_comment = t_uk[:,-1]
t_uk_like = t_uk[:,1]

plt.figure(figsize=(20,8),dpi=80)
# x axis: likes, y axis: comments.
plt.scatter(t_uk_like,t_uk_comment)

plt.show()
最初的t_uk: [[7426393  782040   13548     705]
 [ 494203    2651    1309       0]
 [ 142819   13119     151    1141]
 ...
 [ 109222    4840      35     212]
 [ 626223   22962     532    1559]
 [  99228    1699      23     135]]
t_uk[:,1]<=500000: [False  True  True ...  True  True  True]
t_uk[t_uk[:,1]<=500000]: [[ 494203    2651    1309       0]
 [ 142819   13119     151    1141]
 [1580028   65729    1529    3598]
 ...
 [ 109222    4840      35     212]
 [ 626223   22962     532    1559]
 [  99228    1699      23     135]]
t_uk: [[ 494203    2651    1309       0]
 [ 142819   13119     151    1141]
 [1580028   65729    1529    3598]
 ...
 [ 109222    4840      35     212]
 [ 626223   22962     532    1559]
 [  99228    1699      23     135]]

在这里插入图片描述

数据拼接

# Two 2x6 integer arrays holding 0..11 and 12..23, printed one after the other.
t33 = np.arange(0, 12).reshape((2, 6))
t44 = np.arange(12, 24).reshape((2, 6))
print('t33:\n', t33, '\n\n', 't44:\n', t44)
t33:
 [[ 0  1  2  3  4  5]
 [ 6  7  8  9 10 11]] 

 t44:
 [[12 13 14 15 16 17]
 [18 19 20 21 22 23]]
#竖直拼接
np.vstack((t33,t44))
array([[ 0,  1,  2,  3,  4,  5],
       [ 6,  7,  8,  9, 10, 11],
       [12, 13, 14, 15, 16, 17],
       [18, 19, 20, 21, 22, 23]])
#横向拼接
np.hstack((t33,t44))
array([[ 0,  1,  2,  3,  4,  5, 12, 13, 14, 15, 16, 17],
       [ 6,  7,  8,  9, 10, 11, 18, 19, 20, 21, 22, 23]])
t
array([[   1,    1,    1,    1,    1,    1],
       [  16,   13,   15,   12,   14,   17],
       [   1,    1,    1,    1,    1,    1],
       [9999,   19, 9999,   18,   20, 9999]])
#行交换
t[[1,2],:]=t[[2,1],:]
t
array([[   1,    1,    1,    1,    1,    1],
       [   1,    1,    1,    1,    1,    1],
       [  16,   13,   15,   12,   14,   17],
       [9999,   19, 9999,   18,   20, 9999]])
#列交换
t[:,[0,4]]=t[:,[4,0]]
t
array([[   1,    1,    1,    1,    1,    1],
       [   1,    1,    1,    1,    1,    1],
       [  14,   13,   15,   12,   16,   17],
       [  20,   19, 9999,   18, 9999, 9999]])
#numpy获取最大值最小值的下标
np.argmax(t)#第一个最大值的位置
20
np.argmax(t,axis=0)
array([3, 3, 3, 3, 3, 3])
np.argmax(t,axis=1)
array([0, 0, 5, 2])
np.argmin(t,axis=1)
array([0, 0, 3, 3])
# 创建3×4全0的数组: 
np.zeros((3,4))
array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])
# 创建3×4全1的数组: 
np.ones((3,4))
array([[1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]])
# 创建一个对角线为1的3x3正方形数组(方阵):
np.eye(3)
array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])
np.random.rand(2,3,2)
array([[[0.35950567, 0.08177032],
        [0.89808166, 0.73285583],
        [0.23462587, 0.46792952]],

       [[0.02080821, 0.55948812],
        [0.25205097, 0.75559711],
        [0.93791326, 0.24709769]]])

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值