Numpy&pandas

Numpy

Numpy属性

创造并输出一个矩阵

import numpy as np
array = np.array([
    [1,2,3],
    [2,3,4]
])
print(array)
print('number of dim:',array.ndim) #几维数组
print('shape:',array.shape)  #几行几列
print('size:',array.size)  #元素个数
print(np.size(array))  #同上

Numpy创建array

import numpy as np
a = np.array([2,23,4],dtype=float) #dtype后跟数值类型
print(a)

生成全部为0,1的矩阵

b = np.zeros((3,4))
print(b)
b = np.ones((3,4))
print(b)
c = np.arange(10,20,2)
print(c)
#生成从10到20(不含20)、步长为2的数列
#[10 12 14 16 18]
d = np.arange(12).reshape((3,4))
print(d)
#生成三行四列的矩阵
e = np.linspace(1,10,6)
print(e)
#生成从1到10(包含两端)等间隔的6个数,之后可以再用reshape改变形状

Numpy基础运算

import numpy as np
a = np.array([10,20,30,40])
b = np.arange(4)
print(a-b)
#[10 19 28 37]

c = np.array([
    [1],
    [2],
    [3],
    [4]
])

print(a*c)
#[[ 10  20  30  40]
# [ 20  40  60  80]
# [ 30  60  90 120]
# [ 40  80 120 160]]
#逐元素相乘(a的形状是(4,),c的形状是(4,1),经广播后得到4x4的矩阵)
print(np.dot(a,c)) #矩阵相乘
a.dot(c)#同上 
#[300]

print(b**2)
#[0 1 4 9]

print(a*b)
#[  0  20  60 120]

print(b<3)
#输出b中哪些比3小
d = np.random.random((2,4))*10
print(d)
print(np.sum(d))  #元素和
print(np.min(d))  #最小值
print(np.max(d))  #最大值
print(np.sum(d,axis=1)) #axis=1按行求和  0按列
import numpy as np
a = np.arange(12,24).reshape((3,4))
print(a)
print(np.argmin(a)) #最小值所在的索引
print(np.argmax(a)) #最大值所在的索引
print(np.average(a)) #平均值
print(np.mean(a)) #平均值
print(np.cumsum(a)) #累加
#[ 12  25  39  54  70  87 105 124 144 165 187 210]
print(np.diff(a)) #每一行中相邻两个元素的差(默认沿最后一个轴计算)
print(np.nonzero(a)) #返回非零元素的行列值下标
print(np.sort(a)) #对每一行分别从小到大排序(默认沿最后一个轴)
print(np.transpose(a)) #转置
print(a.T) #转置
print(a.flat) #a.flat是按行展开的一维迭代器(直接打印只显示迭代器对象);要得到一维数组用a.flatten()

Numpy的索引

import numpy as np
a = np.arange(3,15).reshape((3,4))
print(a)
#[[ 3  4  5  6]
# [ 7  8  9 10]
# [11 12 13 14]]
print(a[2,1])#12
print(a[2][1])#12
print(a[2:]) #从索引2开始的所有行(这里只有最后一行);只取索引为2的那一行可以写a[2,:]
print(a[:,1]) #索引为1的列(即第二列)的所有数

for row in a:
    print(row)
#迭代输出行
for col in a.T:
    print(col)
#迭代输出列

Numpy的array合并

import numpy as np
a = np.array([1,2,3])
b = np.array([4,5,6])
c = np.vstack((a,b)) #竖直合并
d = np.hstack((a,b)) #水平合并
print(c)
print(d)

#[[1 2 3]
# [4 5 6]]
#[1 2 3 4 5 6]

print(a.reshape((3,1))) #把行向量变成列向量
print(a[:,np.newaxis]) #同上

array分割

import numpy as np
a = np.arange(12).reshape((3,4))
print(np.split(a,2,axis=1)) #(array,分成几行或几列,对行分还是对列分)
[array([[0, 1],
       [4, 5],
       [8, 9]]), array([[ 2,  3],
       [ 6,  7],
       [10, 11]])]
print(np.array_split(a,3,axis=1))#不等量分割:不能均分时也能分(np.split在不能均分时会报错)
[array([[0, 1],
       [4, 5],
       [8, 9]]), array([[ 2],
       [ 6],
       [10]]), array([[ 3],
       [ 7],
       [11]])]

copy&deepcopy

python中没有指针,但直接赋值 b=a 并不会复制数据:a和b指向同一个数组,修改其中一个另一个也会跟着变。只想复制值可以用 b=a.copy()

pandas

可以理解为字典类型的Numpy 有列名行名的矩阵列表

基本

a = pd.Series([1,2,3,4,66,3,2,12,np.nan])
print(a)
输出:
0     1.0
1     2.0
2     3.0
3     4.0
4    66.0
5     3.0
6     2.0
7    12.0
8     NaN
dtype: float64
date = pd.date_range("2022/10/1",periods=6)
print(date)
DatetimeIndex(['2022-10-01', '2022-10-02', '2022-10-03', '2022-10-04',
               '2022-10-05', '2022-10-06'],
              dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(data=np.random.randn(),index=date,columns=['a','b','c','d'])
print(df)
                   a         b         c         d
2022-10-01  1.263075  1.263075  1.263075  1.263075
2022-10-02  1.263075  1.263075  1.263075  1.263075
2022-10-03  1.263075  1.263075  1.263075  1.263075
2022-10-04  1.263075  1.263075  1.263075  1.263075
2022-10-05  1.263075  1.263075  1.263075  1.263075
2022-10-06  1.263075  1.263075  1.263075  1.263075
有行名和列名(注意:np.random.randn()不带参数时只生成一个随机数,所以表中所有值都相同;想让每个元素各不相同应写成np.random.randn(6,4))

df2.dtypes #每一列的数据类型

print(df.columns) #列名
print(df.index) #行名(索引)
print(df.values) #所有值
print(df.describe()) #描述矩阵信息
              a         b         c         d
count  6.000000  6.000000  6.000000  6.000000
mean  -0.380933 -0.380933 -0.380933 -0.380933
std    0.000000  0.000000  0.000000  0.000000
min   -0.380933 -0.380933 -0.380933 -0.380933
25%   -0.380933 -0.380933 -0.380933 -0.380933
50%   -0.380933 -0.380933 -0.380933 -0.380933
75%   -0.380933 -0.380933 -0.380933 -0.380933
max   -0.380933 -0.380933 -0.380933 -0.380933

print(df.sort_index(axis=1,ascending=False)) #倒序(列)
                   d         c         b         a
2022-10-01  1.416261  1.416261  1.416261  1.416261
2022-10-02  1.416261  1.416261  1.416261  1.416261
2022-10-03  1.416261  1.416261  1.416261  1.416261
2022-10-04  1.416261  1.416261  1.416261  1.416261
2022-10-05  1.416261  1.416261  1.416261  1.416261
2022-10-06  1.416261  1.416261  1.416261  1.416261

pandas选择数据

a = pd.DataFrame(np.arange(24).reshape((6,4)),pd.date_range("2022/10/1",periods=6,),['a','b','c','d'])
print(a)
             a   b   c   d
2022-10-01   0   1   2   3
2022-10-02   4   5   6   7
2022-10-03   8   9  10  11
2022-10-04  12  13  14  15
2022-10-05  16  17  18  19
2022-10-06  20  21  22  23

选择列名为‘b’的列

print(a.b)
print(a['b'])

选择0-2行

print(a[0:3])
print()
print(a['2022/10/1':'2022/10/3'])
            a  b   c   d
2022-10-01  0  1   2   3
2022-10-02  4  5   6   7
2022-10-03  8  9  10  11

select by loc

print(a.loc[:,'a']) #选择列名为a的
print(a.loc['2022/10/1']) #选择行名为‘2022/10/1’

select by iloc

print(a.iloc[3,1]) #按位置选择:行位置3、列位置1(从0开始计数,即第四行第二列),结果是13

boolean indexing

print(a[a.a>8]) #显示a列大于8的数据,其他列也显示出来

pandas设置值(注意:下面的示例使用大写列名'A','B','C','D',需要先把DataFrame的列名改成大写;否则a.loc[...,'B']会新增一列,a.A会报错)

a.iloc[2,2]=200
a.loc['2022/10/1','B'] = 300
print(a)
a[a.A>8]=0 #A列大于8的那些行,整行都变成0
            A    B    C   D
2022-10-01  0  300    2   3
2022-10-02  4    5    6   7
2022-10-03  8    9  200  11
2022-10-04  0    0    0   0
2022-10-05  0    0    0   0
2022-10-06  0    0    0   0
a.A[a.A>8]=0 #只把A列中大于8的值变成0,其他列不变(这是链式赋值,推荐写成 a.loc[a.A>8,'A']=0)
            A   B   C   D
2022-10-01  0   1   2   3
2022-10-02  4   5   6   7
2022-10-03  8   9  10  11
2022-10-04  0  13  14  15
2022-10-05  0  17  18  19
2022-10-06  0  21  22  23
a['E'] = pd.Series('1',pd.date_range('2022/10/1',periods=6)) #新加一列
print(a)

pandas处理空值

a = pd.DataFrame(np.arange(24).reshape((6,4)),pd.date_range('2022/10/1',periods=6),['A','B','C','D'])
print(a)
a.iloc[0,2]=np.nan
a.iloc[1,3]=np.nan
print(a)
print(a.dropna(axis=0,how='any'))  #返回去掉了含空值的行之后的新DataFrame,原来的a不会被修改
#how={'all','any'} any是只要出现nan就丢弃该行/列,all是整行/列全为nan才丢弃
print(a)
             A   B   C   D
2022-10-01   0   1   2   3
2022-10-02   4   5   6   7
2022-10-03   8   9  10  11
2022-10-04  12  13  14  15
2022-10-05  16  17  18  19
2022-10-06  20  21  22  23
             A   B     C     D
2022-10-01   0   1   NaN   3.0
2022-10-02   4   5   6.0   NaN
2022-10-03   8   9  10.0  11.0
2022-10-04  12  13  14.0  15.0
2022-10-05  16  17  18.0  19.0
2022-10-06  20  21  22.0  23.0
             A   B     C     D
2022-10-03   8   9  10.0  11.0
2022-10-04  12  13  14.0  15.0
2022-10-05  16  17  18.0  19.0
2022-10-06  20  21  22.0  23.0
             A   B     C     D
2022-10-01   0   1   NaN   3.0
2022-10-02   4   5   6.0   NaN
2022-10-03   8   9  10.0  11.0
2022-10-04  12  13  14.0  15.0
2022-10-05  16  17  18.0  19.0
2022-10-06  20  21  22.0  23.0
print(a.fillna(value=0)) #将nan值用0填充
print(a.isnull())  #是否有空值,空值所在位置为true
print(np.any(a.isnull()==True))  #这组数据有无空值,返回true or false  

pandas导入导出

import numpy as np
import pandas as pd
data = pd.read_csv('student.csv')
print(data)
#读取csv文件  csv文件需要在excel里另存为csv
#如果文件中有中文且不是UTF-8编码(例如Excel另存得到的GBK文件),需要在read_csv中加入encoding='gb2312'或encoding='gbk'
data.to_csv('student2.csv') 
#存储为csv文件

pandas合并concat

合并两个或多个dataframe

df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])

res = pd.concat([df1,df2,df3],axis=0,ignore_index=True) #axis=0纵向合并,ignore_index忽略掉行名重新命名
print(res)
     a    b    c    d
0  0.0  0.0  0.0  0.0
1  0.0  0.0  0.0  0.0
2  0.0  0.0  0.0  0.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
5  1.0  1.0  1.0  1.0
6  2.0  2.0  2.0  2.0
7  2.0  2.0  2.0  2.0
8  2.0  2.0  2.0  2.0

join方法

df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e'])
res = pd.concat([df1,df2])
print(res)
#默认会把没有值的地方用NaN填充
     a    b    c    d    e
0  0.0  0.0  0.0  0.0  NaN
1  0.0  0.0  0.0  0.0  NaN
2  0.0  0.0  0.0  0.0  NaN
0  NaN  1.0  1.0  1.0  1.0
1  NaN  1.0  1.0  1.0  1.0
2  NaN  1.0  1.0  1.0  1.0
res = pd.concat([df1,df2],join='inner')  #默认join是outer  inner可以理解为求交集
     b    c    d
0  0.0  0.0  0.0
1  0.0  0.0  0.0
2  0.0  0.0  0.0
0  1.0  1.0  1.0
1  1.0  1.0  1.0
2  1.0  1.0  1.0

pandas合并merge

import pandas as pd
left = pd.DataFrame({'key':['k1','k2','k3','k4'],
                     'A':['a1','a2','a3','a4'],
                     'b':['b1','b2','b3','b4'],
                     })
right = pd.DataFrame({'key':['k1','k2','k3','k4'],
                      'c':['c1','c2','c3','c4'],
                      'd':['d1','d2','d3','d4']
                      })
print(left)
print(right)
res = pd.merge(left,right,on='key')
print(res)
  key   A   b   c   d
0  k1  a1  b1  c1  d1
1  k2  a2  b2  c2  d2
2  k3  a3  b3  c3  d3
3  k4  a4  b4  c4  d4

pandas中plot画图

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data = pd.Series(np.random.randint(1,5,100))
print(data)
data.plot()
plt.show()
#一列数据

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-JbiY6oyE-1666509099714)(C:\Users\lenovo\AppData\Roaming\Typora\typora-user-images\image-20221023145439865.png)]

data = pd.DataFrame(np.random.randn(100,4))
print(data)
data.cumsum()
data.plot()
plt.show()
#多列数据(注意:data.cumsum()返回的是新对象,要写成 data = data.cumsum() 才能画出累加后的曲线)

[外链图片转存失败,源站可能有防盗链机制,建议将图片保存下来直接上传(img-lpW2qZuS-1666509099715)(C:\Users\lenovo\AppData\Roaming\Typora\typora-user-images\image-20221023145834187.png)]

plot methods:‘bar’,‘hist’,‘box’,‘kde’,‘area’,‘scatter’…

scatter:散点图 plot:连线图
d2
2 k3 a3 b3 c3 d3
3 k4 a4 b4 c4 d4


## pandas中plot画图

```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data = pd.Series(np.random.randint(1,5,100))
print(data)
data.plot()
plt.show()
#一列数据
```

[外链图片转存中…(img-JbiY6oyE-1666509099714)]

data = pd.DataFrame(np.random.randn(100,4))
print(data)
data.cumsum()
data.plot()
plt.show()
#多列数据

[外链图片转存中…(img-lpW2qZuS-1666509099715)]

plot methods:‘bar’ ‘hist’,‘box’,‘kde’,‘area’,‘scatter’…

scatter:散点图 plot:连线图

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值