3.py
三个属性
import numpy as np
if __name__ == "__main__":
    # Build a 2x3 integer matrix and report its three basic attributes.
    matrix = np.array([[1, 2, 3], [4, 5, 6]])
    attributes = (
        ("dim", matrix.ndim),      # number of axes
        ("shape", matrix.shape),   # length along each axis
        ("size", matrix.size),     # total number of elements
    )
    for label, value in attributes:
        print(f"{label}:{value}")
4.py
array:创建数组
dtype:指定数据类型
zeros:创建数据全为0
ones:创建数据全为1
empty:创建未初始化的数组(元素是内存中的残留值,常常接近0,但不保证)
arange:按指定范围创建数据
linspace:创建线段
np.linspace(start,stop,num=50, endpoint=True, retstep=False, dtype=None)
生成一个[start,stop]左右均闭的等差数列,元素个数num默认为50个,endpoint=True默认包含stop, retstep=False默认不显示步长, dtype=None默认不改变数据类型。
import numpy as np
if __name__ == "__main__":
    # dtype inferred from integer literals. The integer width NumPy picks is
    # platform dependent; NumPy's default float type is float64, not 32-bit.
    inferred = np.array([[1, 2, 3], [4, 5, 6]])
    print(inferred.dtype)

    # dtype requested explicitly: 16-bit integers.
    small_ints = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.int16)
    print(small_ints.dtype)

    # The shape is passed as a tuple.
    all_zeros = np.zeros(shape=(2, 3))
    print(all_zeros)

    all_ones = np.ones(shape=(2, 3), dtype=np.float16)
    print(all_ones)

    # np.empty only allocates; the contents are whatever was in memory
    # (they frequently look like numbers close to 0, but that is not guaranteed).
    uninitialised = np.empty(shape=(2, 3))
    print(f"array5:\n{uninitialised}")

    # 0, 2, ..., 14 -- the stop value 16 is excluded, so there are 8 elements.
    evens = np.arange(0, 16, 2)
    # ndarray.resize works in place and returns None, so "None" is printed.
    # Growing from 8 to 16 elements pads the new entries with zeros.
    resize_result = evens.resize(4, 4)
    print(f"resize:\n{resize_result}")
    # reshape returns the re-dimensioned array (evens is already 4x4 here).
    reshaped = evens.reshape(4, 4)
    print(f"reshape:\n{reshaped}")

    # 11 evenly spaced samples from 1 to 10, both end points included.
    samples = np.linspace(start=1, stop=10, num=11)
    print(f"array7:\n{samples}")
5.py
import numpy as np
if __name__ == "__main__":
    lhs = np.array([[1, 1], [1, 0]], dtype=np.float64)
    rhs = np.arange(0, 4, dtype=np.float64).reshape(2, 2)

    # Element-wise arithmetic first, then three spellings of the matrix product.
    elementwise_and_matmul = [
        ("a1-a2", lhs - rhs),
        ("power", rhs ** 2),                      # every element squared
        ("sin", 10 * np.sin(lhs)),
        ("按位乘法方式1", lhs * rhs),
        ("按位乘法方式2", np.multiply(lhs, rhs)),
        ("形式1", np.dot(lhs, rhs)),
        ("形式2", np.matmul(lhs, rhs)),
        ("形式3", lhs.dot(rhs)),
    ]
    for title, value in elementwise_and_matmul:
        print(f"{title}:\n{value}")

    # Reductions over a 2x3 array of uniform random numbers in [0, 1).
    samples = np.random.random((2, 3))
    print(samples)
    whole_array = [
        ("所有元素求和", samples.sum()),
        ("所有元素最大值", samples.max()),
        ("所有元素最小值", samples.min()),
    ]
    for title, value in whole_array:
        print(f"{title}:\n{value}")

    # Reductions along one axis: axis=0 collapses the rows (one result per
    # column), axis=1 collapses the columns (one result per row).
    per_axis = [
        ("对每一行求和", samples.sum(axis=1)),
        ("对每一列求最大", samples.max(axis=0)),
        ("对每一行求最小", samples.min(axis=1)),
    ]
    for title, value in per_axis:
        print(f"{title}:\n{value}")
6.py
import numpy as np
if __name__ == "__main__":
    ascending = np.arange(12).reshape(3, 4)

    # Positions of the extremes are indices into the flattened array.
    print(f"矩阵的最大值下标:\n{ascending.argmax()}")
    print(f"矩阵的最小值下标:\n{ascending.argmin()}")
    # Mean of every column, then the mean over all elements.
    print(f"输出列的均值:\n{ascending.mean(axis=0)}")
    print(f"矩阵的均值:\n{np.average(ascending)}")
    # NOTE(review): axis=1 yields one median per row, although the label
    # reads "median of the matrix".
    print(f"矩阵的中位数:\n{np.median(ascending, axis=1)}")

    # Running total over the flattened array: each entry is the sum of all
    # elements up to and including that position.
    print(ascending.cumsum())
    # Difference between each element and its predecessor within a row.
    print(np.diff(ascending))
    # One index array per axis; entries at the same position together give
    # the coordinates of a non-zero element.
    print(ascending.nonzero())

    descending = np.arange(12, 0, -1).reshape(3, 4)
    print(f"对每一行进行排序:\n{np.sort(descending)}")
    print(f"矩阵转置1:\n{np.transpose(descending)}")
    print(f"矩阵转置2:\n{descending.T}")

    # Clamp values to [0, 10]: anything above 10 becomes 10, anything
    # below 0 becomes 0.
    print(f"np.clip:\n{ascending.clip(0, 10)}")
7.py
import numpy as np
if __name__ == "__main__":
    grid = np.arange(3, 15).reshape(3, 4)

    # flatten() produces a one-dimensional ndarray (not a Python list).
    flattened = grid.flatten()
    print(flattened)

    # .flat is an iterator object, so printing it shows the object itself
    # rather than the elements.
    print(grid.flat)

    print("对矩阵的每个元素进行输出")
    # Iterating .flat visits every element; each one goes on its own line.
    print(*grid.flat, sep="\n")
8.py
import numpy as np
if __name__ == "__main__":
    # Two 1x3 matrices and two length-3 vectors.
    row_ones = np.array([[1, 1, 1]])
    row_twos = np.array([[2, 2, 2]])
    vec_ones = np.array([1, 1, 1])
    vec_twos = np.array([2, 2, 2])

    print(f"vstack:\n{np.vstack((row_ones, row_twos))}")
    print(f"hstack:\n{np.hstack((row_ones, row_twos))}")

    # np.newaxis inserts a new axis at the position where it appears in the
    # index, here as a third, trailing axis.
    with_extra_axis = row_ones[..., np.newaxis]
    print(f"newaxis:\n{with_extra_axis}")
    print(f"after_newaxis_shape:\n{with_extra_axis.shape}")

    # concatenate joins along the axis of np.shape that `axis` selects:
    #   1-D input, shape (n,):   axis=0 joins the only axis (longer vector).
    #   2-D input, shape (n, m): axis=0 joins along n (more rows),
    #                            axis=1 joins along m (more columns).
    joined_rows = np.concatenate((row_ones, row_twos), axis=0)
    joined_cols = np.concatenate((row_ones, row_twos), axis=1)
    joined_vec = np.concatenate((vec_ones, vec_twos), axis=0)
    print(f"matrix concatenate axis=0:\n{joined_rows}")
    print(f"matrix concatenate axis=1:\n{joined_cols}")
    print(f"array concatenate axis=0:\n{joined_vec}")

    # Length-3 vector times the transposed 2x3 stack: a length-2 result.
    stacked = np.vstack((row_ones, row_twos))
    print(np.dot(vec_ones, stacked.T))
pandas部分
11.py
import pandas as pd
import numpy as np
# A Series holding one missing value.
s = pd.Series([1, 2, 3, np.nan, 22])
print(s)

# Seven consecutive days starting on 2021-11-14.
dates = pd.date_range(start="20211114", periods=7)
print(dates)

# Explicit row labels (index) and column names.
df = pd.DataFrame(
    np.random.randn(7, 4),
    index=dates,
    columns=['a', 'b', 'c', 'd'],
)
print(df)

# Labels that are not supplied default to 0, 1, 2, ... (the columns here).
df1 = pd.DataFrame(np.arange(28).reshape((7, 4)), index=dates)
print(df1)

# Construction from a mapping of column name -> values.
# Column 'C' is a Series indexed 0..3 while the frame is indexed 2..5, so
# the labels that do not line up come out as NaN.
df2 = pd.DataFrame(
    {
        'A': 1.,
        'B': pd.Timestamp('20130102'),
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),
        'D': np.array([3] * 4, dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    },
    index=np.arange(2, 6),
)
print(df2)

print(df2.dtypes)    # dtype of every column
print(df2.index)     # row labels
print(df2.columns)   # column names

# The data as a 2-D array, one inner sequence per row.
print(df2.values)
# Python type of the element in the first row, second column.
print(type(df2.values[0][1]))

# Summary statistics: default selection first, then every column.
print(df2.describe())
print(df2.describe(include="all"))

# Transposed view of the frame.
print(df2.T)

# Sort by column names, then by row labels, both descending.
print(df2.sort_index(axis=1, ascending=False))
print(df2.sort_index(axis=0, ascending=False))

# Sort the rows by the values in column 'E'.
print(df2.sort_values(by='E'))
12.py
import pandas as pd
import numpy as np
date = pd.date_range(start="20211114", periods=7)
df = pd.DataFrame(
    data=np.arange(28).reshape((7, 4)),
    index=date,
    columns=['a', 'b', 'c', 'd'],
)
print(df)

# A single column, via attribute access and via item access.
print(df.a)
print(df['a'])

# Row selection with slices.
print("*" * 20)
first_row = df[0:1]            # integer slice: end point excluded
print(first_row)
print(type(df[0:1]))           # the result is a DataFrame
print(df["20211114":"20211116"])   # label slice: both end points included

# Selection by label with .loc.
one_day = df.loc["20211116"]
print(type(one_day))           # a single row label yields a Series
print(one_day)
two_days = df.loc[["20211116", "20211117"], :]
print(type(two_days))          # a list of row labels yields a DataFrame
print(two_days)
# Columns only.
print(df.loc[:, ['a', 'b']])
# Rows and columns in one call.
print(df.loc[["20211116", "20211114"], ["a", "b"]])

# Selection by position with .iloc.
print("*" * 30)
fourth_column = df.iloc[:, 3]
print(fourth_column)
print(type(fourth_column))     # Series
# One specific element.
print(df.iloc[3, 1])
# Contiguous slices on both axes.
print(df.iloc[3:5, 1:3])
# Non-contiguous rows.
print(df.iloc[[1, 3, 5], 1:3])

# Boolean mask: every row whose 'a' value is greater than 8.
print(df[df.a > 8])
13.py
import pandas as pd
import numpy as np
# 7x4 integer frame indexed by seven consecutive days from 2021-11-15.
dates = pd.date_range("20211115", periods=7)
df = pd.DataFrame(np.arange(28).reshape((7, 4)), index=dates, columns=['a', 'b', 'c', 'd'])

# Assign by position: third row, columns 'c' and 'd'.
df.iloc[2, 2:4] = 111
print(df)

# Assign by label: row 2021-11-16, column 'a'.
df.loc["20211116", 'a'] = 222
print(df)

# Boolean row mask on a copy: every row whose 'b' exceeds 8 is zeroed in
# all columns, while df itself stays unchanged.
dfc = df.copy()
dfc[dfc.b > 8] = 0
print(dfc)

# Zero column 'b' wherever 'a' exceeds 8. This is one .loc assignment on df
# rather than the chained form `df.b[mask] = 0`: the chained form assigns
# through an intermediate object, triggers pandas' chained-assignment
# warning and leaves df unmodified when Copy-on-Write is enabled.
df.loc[df.a > 8, 'b'] = 0
print(df)

# New columns: an all-NaN column and a Series aligned on the date index.
df['e'] = np.nan
df['f'] = pd.Series(range(7), index=dates)
print(df)
14.py
import numpy as np
import pandas as pd
# 7x4 integer frame indexed by seven consecutive days from 2021-11-16.
dates = pd.date_range("20211116", periods=7)
df = pd.DataFrame(np.arange(28).reshape((7, 4)), index=dates, columns=['a', 'b', 'c', 'd'])

# NaN cannot be stored in an integer column. Cast the two target columns to
# float explicitly instead of relying on implicit upcasting during
# assignment, which recent pandas versions deprecate.
df = df.astype({'b': float, 'c': float})
df.iloc[2, 1:3] = np.nan
print(df)
print("*" * 80)

# Drop rows: how="all" removes a row only if every value is NaN,
# how="any" removes it as soon as one value is NaN.
df1 = df.copy()
print(df1.dropna(axis=0, how="all"))
print(df1.dropna(axis=0, how="any"))

# The same two policies applied to columns.
df2 = df.copy()
print(df2.dropna(axis=1, how="all"))
print(df2.dropna(axis=1, how="any"))
print("*" * 80)

# Replace missing values with 0.
df3 = df.copy()
print(df3.fillna(value=0))
print("*" * 80)

# Element-wise missing-value mask, then a single flag telling whether any
# value at all is missing (the mask is already boolean, so no `== True`).
print(df.isnull())
print(df.isnull().values.any())
16.py
import numpy as np
import pandas as pd
# Three 3x4 frames with identical columns, filled with 0, 1 and 2.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['a', 'b', 'c', 'd'])
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['a', 'b', 'c', 'd'])

# Stack vertically and keep each frame's own row labels.
res = pd.concat([df1, df2, df3], axis=0)
print(res)
# Stack vertically and renumber the rows 0..n-1.
res = pd.concat([df1, df2, df3], ignore_index=True, axis=0)
print(res)

# join: 'outer' keeps the union of the columns, 'inner' the intersection.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['a', 'b', 'c', 'd'], index=[1, 2, 3])
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['b', 'c', 'd', 'e'], index=[2, 3, 4])
res = pd.concat([df1, df2], ignore_index=True, axis=0, join="outer")
print(res)
res = pd.concat([df1, df2], join="inner", ignore_index=True)
print(res)

# Side-by-side merge restricted to df1's index: the join_axes argument was
# removed from pd.concat. Presumably the modern spelling is
# pd.concat([df1, df2], axis=1).reindex(df1.index) -- verify before use.

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# Appending several frames is a plain row-wise concat.
res = pd.concat([df1, df2, df3], ignore_index=True)
print(res)

# Appending a Series as one row: turn it into a single-row frame whose
# columns are the Series labels, then concatenate.
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s1_row = s1.to_frame().T
res = pd.concat([df1, s1_row], ignore_index=True)
print(res)

# Without an explicit join, concat defaults to 'outer'.
res = pd.concat([df1, df2], ignore_index=True)
print(res)
17.py
import pandas as pd
import numpy as np
# Two frames sharing the join column 'key'; 'A' and 'B' exist in both.
left = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K3'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key': ['K0', 'K1', 'K2', 'K4'],
    'A': ['C0', 'C1', 'C2', 'C3'],
    'B': ['D0', 'D1', 'D2', 'D3'],
})
print(left)
print(right)

# Without `how`, the merge is an inner join.
res = left.merge(right, on='key')
print(res)
# An outer join keeps unmatched keys and fills the gaps with NaN.
res = left.merge(right, on="key", how="outer")
print(res)

# Joining on two key columns at once.
left = pd.DataFrame({
    'key1': ['K0', 'K0', 'K1', 'K2'],
    'key2': ['K0', 'K1', 'K0', 'K1'],
    'A': ['A0', 'A1', 'A2', 'A3'],
    'B': ['B0', 'B1', 'B2', 'B3'],
})
right = pd.DataFrame({
    'key1': ['K0', 'K1', 'K1', 'K2'],
    'key2': ['K0', 'K0', 'K0', 'K0'],
    'A': ['C0', 'C1', 'C2', 'C3'],
    'B': ['D0', 'D1', 'D2', 'D3'],
})
print(left)
print(right)
res = left.merge(right, on=["key1", "key2"], how="inner")
print(res)

# indicator=True adds a `_merge` column telling which side each row came
# from; passing a string uses that string as the column name instead.
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [1, 2, 2], 'col_right': [2, 2, 2]})
res = df1.merge(df2, on='col1', how="outer", indicator=True)
print(res)
res = df1.merge(df2, on='col1', how='outer', indicator='indicator_col')
print(res)

# Join on the row index of both frames rather than on a column.
print(left)
print(right)
res = left.merge(right, left_index=True, right_index=True, how='inner')
print(res)

# suffixes tells apart same-named columns coming from the two sides.
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K2'], 'age': [4, 5, 6]})
print(boys)
print(girls)
res = boys.merge(girls, on='k', suffixes=('_boy', '_girls'), how="inner")
print(res)
res = boys.merge(girls, on='k', suffixes=('_boy', '_girls'), how="outer")
print(res)
18.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Running total of 1000 standard-normal draws, labelled 0..999.
# (To plot it, call data.plot() followed by plt.show().)
data = pd.Series(np.random.randn(1000), index=np.arange(1000)).cumsum()

# Four random columns 'A'..'D'.
data = pd.DataFrame(
    np.random.randn(1000, 4),
    index=np.arange(1000),
    columns=list("ABCD"),
)
# head() shows five rows when called without an argument as well.
print(data.head(5))
data = data.cumsum()

# Two scatter series drawn on the same axes: B against A, then C against A.
ax = data.plot.scatter(x="A", y="B", color="DarkBlue", label="class 1")
data.plot.scatter(x="A", y="C", color="DarkGreen", label="class 2", ax=ax)
plt.show()