Numpy
Numpy属性
创造并输出一个矩阵
import numpy as np
array = np.array([
[1,2,3],
[2,3,4]
])
print(array)
print('number of dim:',array.ndim) #几维数组
print('shape:',array.shape) #几行几列
print('size:',array.size) #元素个数
print(np.size(array)) #同上
Numpy创建array
import numpy as np
a = np.array([2,23,4],dtype=float) #dtype后跟数值类型
print(a)
生成全部为0,1的矩阵
b = np.zeros((3,4))
print(b)
b = np.ones((3,4))
print(b)
c = np.arange(10,20,2)
print(c)
#生成10-20 步长为2的数列
#[10 12 14 16 18]
d = np.arange(12).reshape((3,4))
print(d)
#生成三行四列的矩阵
e = np.linspace(1,10,6)
print(e)
#生成一段数列从1到10 有6个数 可以用reshape
Numpy基础运算
import numpy as np
a = np.array([10,20,30,40])
b = np.arange(4)
print(a-b)
#[10 19 28 37]
c = np.array([
[1],
[2],
[3],
[4]
])
print(a*c)
#[[ 10 20 30 40]
# [ 20 40 60 80]
# [ 30 60 90 120]
# [ 40 80 120 160]]
#逐个相乘
print(np.dot(a,c)) #矩阵相乘
a.dot(c)#同上
#[300]
print(b**2)
#[0 1 4 9]
print(a*b)
#[ 0 20 60 120]
print(b<3)
#输出b中哪些比3小
d = np.random.random((2,4))*10
print(d)
print(np.sum(d)) #元素和
print(np.min(d)) #最小值
print(np.max(d)) #最大值
print(np.sum(d,axis=1)) #axis=1按行求和 0按列
import numpy as np
# Demo of common NumPy reductions and reshaping helpers on a 3x4 array.
a = np.arange(12,24).reshape((3,4))
print(a)
print(np.argmin(a)) # index of the smallest element (position in the flattened array)
print(np.argmax(a)) # index of the largest element (position in the flattened array)
print(np.average(a)) # mean of all elements
print(np.mean(a)) # mean of all elements (same result here)
print(np.cumsum(a)) # running total over the flattened array
#[ 12 25 39 54 70 87 105 124 144 165 187 210]
print(np.diff(a)) # difference between neighbouring elements within each row
print(np.nonzero(a)) # row indices and column indices of the non-zero elements
print(np.sort(a)) # sort each row in ascending order
print(np.transpose(a)) # transpose
print(a.T) # transpose (shorthand)
# a.flat is only an iterator and prints as "<numpy.flatiter object ...>";
# flatten() returns the values as a 1-D array, which is what we want to show.
print(a.flatten())
Numpy的索引
import numpy as np
# Demo of indexing, slicing and iterating over a 3x4 array.
a = np.arange(3,15).reshape((3,4))
print(a)
#[[ 3 4 5 6]
# [ 7 8 9 10]
# [11 12 13 14]]
print(a[2,1]) # 12: row index 2, column index 1 (indices start at 0)
print(a[2][1]) # 12: same element, reached by indexing the row first
print(a[2:]) # every row from index 2 onwards (here just the last row, still 2-D)
print(a[:,1]) # the element at column index 1 of every row -> [ 4  8 12]
# Iterating over a 2-D array yields one row at a time.
for row in a:
    print(row)
# Iterating over the transpose yields the columns of a.
for col in a.T:
    print(col)
Numpy的array合并
import numpy as np
a = np.array([1,2,3])
b = np.array([4,5,6])
c = np.vstack((a,b)) #竖直合并
d = np.hstack((a,b)) #水平合并
print(c)
print(d)
#[[1 2 3]
# [4 5 6]]
#[1 2 3 4 5 6]
print(a.reshape((3,1))) #把行向量变成列向量
print(a[:,np.newaxis]) #同上
array分割
import numpy as np
a = np.arange(12).reshape((3,4))
print(np.split(a,2,axis=1)) #(array,分成几行或几列,对行分还是对列分)
[array([[0, 1],
[4, 5],
[8, 9]]), array([[ 2, 3],
[ 6, 7],
[10, 11]])]
print(np.array_split(a,3,axis=1))#硬分
[array([[0, 1],
[4, 5],
[8, 9]]), array([[ 2],
[ 6],
[10]]), array([[ 3],
[ 7],
[11]])]
copy&deepcopy
b = a 只是让b和a指向同一个数组,修改其中一个另一个也会跟着变;只想复制值(两者互不影响)可以用 b = a.copy()
pandas
可以理解为字典类型的Numpy 有列名行名的矩阵列表
基本
# pandas is used here for the first time in these notes, so import it
# (together with numpy, needed for np.nan) before building the Series.
import numpy as np
import pandas as pd
# A Series is a labelled 1-D array; the np.nan entry makes the dtype float64.
a = pd.Series([1,2,3,4,66,3,2,12,np.nan])
print(a)
输出:
0 1.0
1 2.0
2 3.0
3 4.0
4 66.0
5 3.0
6 2.0
7 12.0
8 NaN
dtype: float64
date = pd.date_range("2022/10/1",periods=6)
print(date)
DatetimeIndex(['2022-10-01', '2022-10-02', '2022-10-03', '2022-10-04',
'2022-10-05', '2022-10-06'],
dtype='datetime64[ns]', freq='D')
df = pd.DataFrame(data=np.random.randn(),index=date,columns=['a','b','c','d']) #注意:np.random.randn()不带参数只返回一个标量,所以下面每个元素都相同;想要6行4列的随机数应写成np.random.randn(6,4)
print(df)
a b c d
2022-10-01 1.263075 1.263075 1.263075 1.263075
2022-10-02 1.263075 1.263075 1.263075 1.263075
2022-10-03 1.263075 1.263075 1.263075 1.263075
2022-10-04 1.263075 1.263075 1.263075 1.263075
2022-10-05 1.263075 1.263075 1.263075 1.263075
2022-10-06 1.263075 1.263075 1.263075 1.263075
有行名和列名
# Inspect the DataFrame `df` built above.
print(df.dtypes) # dtype of each column
print(df.columns) # column labels
print(df.index) # row labels (the index)
print(df.values) # all values as a NumPy array
print(df.describe()) # summary statistics for each column
a b c d
count 6.000000 6.000000 6.000000 6.000000
mean -0.380933 -0.380933 -0.380933 -0.380933
std 0.000000 0.000000 0.000000 0.000000
min -0.380933 -0.380933 -0.380933 -0.380933
25% -0.380933 -0.380933 -0.380933 -0.380933
50% -0.380933 -0.380933 -0.380933 -0.380933
75% -0.380933 -0.380933 -0.380933 -0.380933
max -0.380933 -0.380933 -0.380933 -0.380933
print(df.sort_index(axis=1,ascending=False)) #倒序(列)
d c b a
2022-10-01 1.416261 1.416261 1.416261 1.416261
2022-10-02 1.416261 1.416261 1.416261 1.416261
2022-10-03 1.416261 1.416261 1.416261 1.416261
2022-10-04 1.416261 1.416261 1.416261 1.416261
2022-10-05 1.416261 1.416261 1.416261 1.416261
2022-10-06 1.416261 1.416261 1.416261 1.416261
pandas选择数据
a = pd.DataFrame(np.arange(24).reshape((6,4)),pd.date_range("2022/10/1",periods=6,),['a','b','c','d'])
print(a)
a b c d
2022-10-01 0 1 2 3
2022-10-02 4 5 6 7
2022-10-03 8 9 10 11
2022-10-04 12 13 14 15
2022-10-05 16 17 18 19
2022-10-06 20 21 22 23
选择列名为‘b’的列
print(a.b)
print(a['b'])
选择0-2行
print(a[0:3])
print()
print(a['2022/10/1':'2022/10/3'])
a b c d
2022-10-01 0 1 2 3
2022-10-02 4 5 6 7
2022-10-03 8 9 10 11
select by loc
print(a.loc[:,'a']) #选择列名为a的
print(a.loc['2022/10/1']) #选择行名为‘2022/10/1’
select by iloc
print(a.iloc[3,1]) #按位置选择:行下标3、列下标1(下标从0开始,即第4行第2列),结果为13
boolean indexing
print(a[a.a>8]) #显示a列大于8的数据,其他列也显示出来
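pandas设置值
(注:本节示例中a的列名是大写的,需先重新创建:a = pd.DataFrame(np.arange(24).reshape((6,4)),pd.date_range('2022/10/1',periods=6),['A','B','C','D']))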
a.iloc[2,2]=200
a.loc['2022/10/1','B'] = 300
print(a)
a[a.A>8]=0 #A列大于8的那些行,整行(所有列)都变成0
A B C D
2022-10-01 0 300 2 3
2022-10-02 4 5 6 7
2022-10-03 8 9 200 11
2022-10-04 0 0 0 0
2022-10-05 0 0 0 0
2022-10-06 0 0 0 0
# Set column A to 0 only in the rows where A > 8; the other columns are untouched.
# A single .loc assignment is used instead of the chained form a.A[mask] = 0,
# which may modify a temporary copy and leave `a` unchanged.
a.loc[a.A>8,'A']=0
A B C D
2022-10-01 0 1 2 3
2022-10-02 4 5 6 7
2022-10-03 8 9 10 11
2022-10-04 0 13 14 15
2022-10-05 0 17 18 19
2022-10-06 0 21 22 23
a['E'] = pd.Series('1',pd.date_range('2022/10/1',periods=6)) #新加一列
print(a)
pandas处理空值
a = pd.DataFrame(np.arange(24).reshape((6,4)),pd.date_range('2022/10/1',periods=6),['A','B','C','D'])
print(a)
a.iloc[0,2]=np.nan
a.iloc[1,3]=np.nan
print(a)
print(a.dropna(axis=0,how='any')) #返回一个去掉了含空值行的新DataFrame,原来的a不会改变(要改a本身需重新赋值 a = a.dropna(...))
#how={'all','any'} any是只要出现nan就去掉该行/列,all是该行/列全部为nan才去掉
print(a)
A B C D
2022-10-01 0 1 2 3
2022-10-02 4 5 6 7
2022-10-03 8 9 10 11
2022-10-04 12 13 14 15
2022-10-05 16 17 18 19
2022-10-06 20 21 22 23
A B C D
2022-10-01 0 1 NaN 3.0
2022-10-02 4 5 6.0 NaN
2022-10-03 8 9 10.0 11.0
2022-10-04 12 13 14.0 15.0
2022-10-05 16 17 18.0 19.0
2022-10-06 20 21 22.0 23.0
A B C D
2022-10-03 8 9 10.0 11.0
2022-10-04 12 13 14.0 15.0
2022-10-05 16 17 18.0 19.0
2022-10-06 20 21 22.0 23.0
A B C D
2022-10-01 0 1 NaN 3.0
2022-10-02 4 5 6.0 NaN
2022-10-03 8 9 10.0 11.0
2022-10-04 12 13 14.0 15.0
2022-10-05 16 17 18.0 19.0
2022-10-06 20 21 22.0 23.0
print(a.fillna(value=0)) #将nan值用0填充
print(a.isnull()) #是否有空值,空值所在位置为true
print(np.any(a.isnull()==True)) #这组数据有无空值,返回true or false
pandas导入导出
import numpy as np
import pandas as pd
data = pd.read_csv('student.csv')
print(data)
#读取csv文件 csv文件需要在excel里另存为csv
#文件中有中文的话需要在代码中加入encoding='gb2312'
data.to_csv('student2.csv')
#存储为csv文件
pandas合并concat
合并两个多个dataframe
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3 = pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
res = pd.concat([df1,df2,df3],axis=0,ignore_index=True) #axis=0纵向合并,ignore_index忽略掉行名重新命名
print(res)
a b c d
0 0.0 0.0 0.0 0.0
1 0.0 0.0 0.0 0.0
2 0.0 0.0 0.0 0.0
3 1.0 1.0 1.0 1.0
4 1.0 1.0 1.0 1.0
5 1.0 1.0 1.0 1.0
6 2.0 2.0 2.0 2.0
7 2.0 2.0 2.0 2.0
8 2.0 2.0 2.0 2.0
join方法
df1 = pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2 = pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e'])
res = pd.concat([df1,df2])
print(res)
#默认会把没有值的地方用NaN填充
a b c d e
0 0.0 0.0 0.0 0.0 NaN
1 0.0 0.0 0.0 0.0 NaN
2 0.0 0.0 0.0 0.0 NaN
0 NaN 1.0 1.0 1.0 1.0
1 NaN 1.0 1.0 1.0 1.0
2 NaN 1.0 1.0 1.0 1.0
res = pd.concat([df1,df2],join='inner') #默认join是outer inner可以理解为求交集
b c d
0 0.0 0.0 0.0
1 0.0 0.0 0.0
2 0.0 0.0 0.0
0 1.0 1.0 1.0
1 1.0 1.0 1.0
2 1.0 1.0 1.0
pandas合并merge
import pandas as pd
left = pd.DataFrame({'key':['k1','k2','k3','k4'],
'A':['a1','a2','a3','a4'],
'b':['b1','b2','b3','b4'],
})
right = pd.DataFrame({'key':['k1','k2','k3','k4'],
'c':['c1','c2','c3','c4'],
'd':['d1','d2','d3','d4']
})
print(left)
print(right)
res = pd.merge(left,right,on='key')
print(res)
key A b c d
0 k1 a1 b1 c1 d1
1 k2 a2 b2 c2 d2
2 k3 a3 b3 c3 d3
3 k4 a4 b4 c4 d4
pandas中plot画图
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data = pd.Series(np.random.randint(1,5,100))
print(data)
data.plot()
plt.show()
#一列数据
[图片缺失:上面这一列数据(Series)的折线图]
# 100 rows x 4 columns of standard-normal samples.
data = pd.DataFrame(np.random.randn(100,4))
print(data)
# cumsum() returns a new DataFrame instead of modifying `data` in place,
# so the result has to be assigned back for the plot below to show it.
data = data.cumsum()
data.plot()
plt.show()
#多列数据
[图片缺失:上面多列数据(DataFrame)的折线图]
plot methods:‘bar’ ‘hist’,‘box’,‘kde’,‘area’,‘scatter’…
scatter:散点图 plot:连线图
d2
2 k3 a3 b3 c3 d3
3 k4 a4 b4 c4 d4
## pandas中plot画图
```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
data = pd.Series(np.random.randint(1,5,100))
print(data)
data.plot()
plt.show()
#一列数据
[外链图片转存中…(img-JbiY6oyE-1666509099714)]
# 100 rows x 4 columns of standard-normal samples.
data = pd.DataFrame(np.random.randn(100,4))
print(data)
# cumsum() returns a new DataFrame instead of modifying `data` in place,
# so the result has to be assigned back for the plot below to show it.
data = data.cumsum()
data.plot()
plt.show()
#多列数据
[外链图片转存中…(img-lpW2qZuS-1666509099715)]
plot methods:‘bar’ ‘hist’,‘box’,‘kde’,‘area’,‘scatter’…
scatter:散点图 plot:连线图