Numpy & Pandas
Numpy:是数值计算的扩展包,它能高效处理N维数组,复杂函数,线性代数.
Pandas:是python的一个数据分析包,用于数据处理(清洗、筛选、合并、统计等)。
01
# numpy和pandas有什么用?
它们是python科学运算当中最为重要的两个模块,做数据分析必不可少,
简言之:用了会使得计算变得特别快
02
# 安装numpy和pandas
推荐使用anaconda(已自带numpy和pandas);也可以直接 pip install numpy 或者 pip install pandas
03
# numpy的简单属性
基本属性
import numpy as np

# Build a 2-D array (2 rows, 3 columns) from a nested list.
array = np.array([[1, 2, 3],
                  [4, 5, 6]])
print(array)

# ndim  -> number of axes (2 for a matrix)
print('number of dim:', array.ndim)
# shape -> length along every axis, here (2, 3)
print('shape:', array.shape)
# size  -> total number of elements, here 6
print('size', array.size)
numpy创建的array
import numpy as np

# 1-D array with an explicit element type. Other dtypes such as np.float32
# or np.float16 are passed the same way; without dtype NumPy picks a
# platform default integer type (commonly int64).
a = np.array([2, 3, 4], dtype=np.int32)
print(a.dtype)

# 2-D array built from a nested list (2 rows, 3 columns).
b = np.array([[2, 3, 4], [4, 5, 6]])

# 3x4 array filled with zeros.
c = np.zeros(shape=(3, 4))
print(c)

# 3x4 array filled with ones (all-ones, not an identity matrix).
d = np.ones(shape=(3, 4), dtype=np.int32)
print(d)

# 3x4 array whose memory is NOT initialised: the values are whatever was
# left in memory, often (but not necessarily) numbers very close to zero.
e = np.empty(shape=(3, 4))
print(e)

# Evenly stepped integers 10, 11, ..., 21 (22 is excluded), as 3 rows x 4 columns.
f = np.arange(10, 22, 1).reshape(3, 4)
print(f)

# 6 evenly spaced values from 1 to 10 (both ends included), as 2 rows x 3 columns.
g = np.linspace(1, 10, 6).reshape(2, 3)
print(g)
numpy的基础运算
import numpy as np

a = np.array([10, 20, 30, 40])
b = np.arange(4)  # [0, 1, 2, 3]
print(a, b)

# Arithmetic operators act element by element on arrays of the same shape.
c = a - b  # element-wise subtraction
print(c)
d = a + b  # element-wise addition
print(d)
e = b ** 2  # element-wise square
print(e)
f = 10 * np.sin(a)  # sine of every element, scaled by 10
print(f)
print(b < 3)  # comparison yields a boolean array

# --- matrix operations ---
p = np.array([[1, 1], [0, 1]])
q = np.arange(4).reshape(2, 2)
print(p)
print(q)
r = p * q  # "*" multiplies element by element
r_dot = np.dot(p, q)  # true matrix product
r_dot_2 = p.dot(q)  # same matrix product, method form
print(r)
print(r_dot)

# np.random is the sub-module, random() the function: uniform samples in [0, 1).
w = np.random.random((2, 4))
print(w)

# axis=0 reduces over the rows and gives one result per column;
# axis=1 would give one result per row.
w1 = w.sum(axis=0)  # column sums
print(w1)
w2 = w.min(axis=0)  # column minima
print(w2)
w3 = w.max(axis=0)  # column maxima
print(w3)
import numpy as np

A = np.arange(2, 14).reshape((3, 4))
print(A)
print(np.argmin(A))   # position of the minimum in the flattened array
print(np.argmax(A))   # position of the maximum in the flattened array
print(A.mean())       # mean of all elements
print(np.average(A))  # same mean, function form
print(np.median(A))   # median of all elements
print(np.cumsum(A))   # running total over the flattened array
print(np.diff(A))     # first-order difference between neighbours within each row
print(np.nonzero(A))  # two index arrays (rows, columns) locating the non-zero entries

B = np.arange(14, 2, -1).reshape((3, 4))  # descending values, so a negative step is required
print(B)
print(np.sort(B))       # sorts every row independently, ascending
print(np.transpose(B))  # transpose
print((B.T).dot(B))     # matrix product B^T . B (4x4); this is not an element-wise square
print(np.clip(B, 5, 9)) # values below 5 become 5, above 9 become 9, the rest are unchanged
# Reductions accept axis=0 (one result per column) or axis=1 (one per row).
# The keyword must be spelled "axis"; a misspelled keyword raises TypeError.
print(np.mean(A, axis=0))
# numpy的索引
import numpy as np

A = np.arange(3, 15).reshape((3, 4))
print(A)
# Indices are zero-based: [1][1] is the element in the second row and second
# column. A[1][1] and A[1, 1] select the same element.
print(A[1][1])
print(A[1, 1])
print(A[1, :])    # the whole row with index 1
print(A[1, 1:3])  # row index 1, column indices 1 and 2 (the stop index 3 is excluded)
print('=='*30)
print(A.flatten())  # copy of the data collapsed to one dimension

# Iterating over a 2-D array yields its rows.
for row in A:
    print(row)

# To iterate over columns, iterate over the rows of the transpose.
for column in A.T:
    print(column)

# A.flat is an iterator over every single element in row-major order.
for item in A.flat:
    print(item)
numpy的array合并
import numpy as np

A = np.array([1, 1, 1])  # 1-D array, shape (3,)
B = np.array([2, 2, 2])  # 1-D array, shape (3,)

# vstack puts the inputs on top of each other; hstack puts them side by side.
C = np.vstack((A, B))
D = np.hstack((A, B))
print(C)
print(D)
print(A.shape, C.shape)
print(A.shape, D.shape)

# Transposing a 1-D array changes nothing: it has a single axis, so it does
# not turn into a column.
print(A.T.shape)
# Adding an axis with np.newaxis is what turns shape (3,) into a (3, 1) column.
print(A[:, np.newaxis])

# concatenate joins any number of arrays along an existing axis; 1-D inputs
# only have axis 0, so they are joined end to end.
E = np.concatenate((A, B, B, A), axis=0)
print(E)

F = np.array([1, 1, 1])[:, np.newaxis]
print(F)
G = np.array([2, 2, 2])[:, np.newaxis]
# Column vectors joined along axis=1 sit next to each other as columns.
print(np.concatenate((F, G, F, G), axis=1))
numpy的分割
import numpy as np

A = np.arange(12).reshape(3, 4)
print(A)

# split requires an even division: 4 columns -> 2 pieces of 2 columns each.
print(np.split(A, 2, axis=1))
# array_split tolerates uneven division: 4 columns -> 3 pieces (2, 1, 1 columns).
print(np.array_split(A, 3, axis=1))
# vsplit cuts along the rows (axis 0): 3 pieces of one row each.
print(np.vsplit(A, 3))
# hsplit cuts along the columns (axis 1): 2 pieces of two columns each.
print(np.hsplit(A, 2))
numpy的copy&deep copy
对于numpy的赋值,看下边的例子:b=a、c=a、d=b 并没有复制数据,只是让b、c、d指向与a相同的数组对象,这时候a,b,c,d已经关联起来了,改变他们任何一个的元素,a,b,c,d都会同时改变
import numpy as np

a = np.arange(4)
print(a)
# Plain assignment only binds another name to the SAME array object;
# no data is copied.
b = a
c = a
d = b
a[0] = 11
print(a)
print(b is a)  # "is" tests object identity: True, b and a are one object
print(b)
print(c is a)
print(c)
print(d is a)
print(d)
# Writing through any of the aliases is visible through all of them.
d[1:3] = [22, 33]
print(d)
print(a)

# An independent copy requires CALLING copy(); without the parentheses the
# name would be bound to the method object instead of to a new array.
b = a.copy()
print(b)
a[3] = 44
print(a)  # a changes ...
print(b)  # ... while the copy keeps the old value
Part4
Pandas
Pandas的基本介绍
import pandas as pd
import numpy as np

# A Series is a one-dimensional labelled array; np.nan marks a missing value.
s = pd.Series([1, 3, 6, np.nan, 44, 1])
print(s)

# Six consecutive days starting 2020-08-20, used as row labels below.
dates = pd.date_range('20200820', periods=6)
print(dates)

# DataFrame with explicit row labels (index) and column labels (columns).
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['a', 'b', 'c', 'd'])
print(df)

# Without labels, rows and columns are numbered 0, 1, 2, ...
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)))
print(df1)

# A dict builds one column per key; scalar values are repeated to the
# common length and every column keeps its own dtype.
df2 = pd.DataFrame({
    'A': 1,
    'B': pd.Timestamp('20130102'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(['test', 'train', 'test', 'train']),
    'F': 'foo',
})
print(df2)
print(df2.dtypes)   # dtype of every column
print(df2.index)    # row labels
print(df2.columns)  # column labels
print(df2.values)   # underlying data as a NumPy array
# describe() returns a summary table for the numeric columns. It has to be
# printed explicitly: in a script a bare expression displays nothing.
print(df2.describe())
print(df2.T)        # transpose
# Sort the column labels (axis=1) in descending order.
print(df2.sort_index(axis=1, ascending=False))
# Sort the rows by the values in column 'E'.
print(df2.sort_values(by='E'))
●pandas的数据选取
import pandas as pd
import numpy as np

dates = pd.date_range('20200820', periods=6)
# "columns" must be passed as a keyword argument (columns=[...]).
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)

print(df['A'], df.A)  # df['A'] and df.A select the same column
# Slicing selects rows: by position (0:3) or by label range (both end labels included).
print(df[0:3], df['20200821':'20200822'])

# A more explicit way -- select by label: loc
print(df.loc['20200821'])
print(df.loc['20200821', ['A', 'B']])

# Select by position: iloc (zero-based)
print(df.iloc[3, 1])  # row position 3, column position 1
print(df.iloc[[1, 3, 5], 1:3])  # row positions 1, 3, 5; column positions 1 and 2

# Mixed selection with .ix no longer exists in current pandas; use loc or
# iloc instead.

# Boolean indexing: keep the rows where column A is greater than 8.
print(df)
print(df[df.A > 8])
● pandas设置值
如何在特定的位置修改值
import pandas as pd
import numpy as np

dates = pd.date_range('20200822', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)

df.iloc[2, 2] = 1111  # assign by position
print(df)
df.loc['20200824', 'B'] = 2222  # assign by label
print(df)
df[df.A > 4] = 88  # every column of the rows where A > 4 becomes 88
print(df)
# Change column A only. Use one .loc call with a row mask and a column label:
# the chained form df.A[mask] = ... assigns into an intermediate object and is
# not guaranteed to modify df.
df.loc[df.A > 4, 'A'] = 99
print(df)
df['F'] = np.nan  # add a new column F filled with NaN
print(df)
# Add a column from a Series; values are aligned on the index labels.
df['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range('20200822', periods=6))
print(df)
●pandas处理丢失数据
import pandas as pd
import numpy as np

dates = pd.date_range('20200820', periods=6)
df = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])
# NaN is a floating-point value and cannot be stored in an integer column.
# Cast the two affected columns to float explicitly instead of relying on
# pandas to upcast silently during assignment (deprecated behaviour).
df = df.astype({'B': 'float64', 'C': 'float64'})
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
print(df)

# Drop every row (axis=0) that contains a missing value.
# how='any' drops on at least one NaN; how='all' drops only if all values are NaN.
print(df.dropna(axis=0, how='any'))

# Replace every missing value with 0.
print(df.fillna(value=0))

# Element-wise test for missing values ...
print(df.isnull())
# ... and a single boolean: is there any missing value at all?
print(df.isnull().values.any())
● pandas导入导出
import numpy as np
import pandas as pd

# Load the CSV file from the working directory; pandas adds a default
# integer row index.
data = pd.read_csv(filepath_or_buffer='1.csv')
print(data)

# Save the DataFrame as a pickle file.
data.to_pickle(path='1_.pickle')
● pandas合并concat
import pandas as pd
import numpy as np

# --- concatenating ---------------------------------------------------------
# Three 3x4 frames filled with 0, 1 and 2.
df1 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['A', 'B', 'C', 'D'])
print(df1)
df2 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['A', 'B', 'C', 'D'])
print(df2)
df3 = pd.DataFrame(np.ones((3, 4)) * 2, columns=['A', 'B', 'C', 'D'])
print(df3)
# axis=0 stacks the rows; ignore_index=True discards the old row labels and
# renumbers the result 0..n-1.
res = pd.concat([df1, df2, df3], axis=0, ignore_index=True)
print(res)
print('=='*30)

# --- join: 'inner' / 'outer' -----------------------------------------------
# Controls what happens to labels that are not shared by all inputs:
# 'outer' keeps the union (filling gaps with NaN), 'inner' the intersection.
df4 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['A', 'B', 'C', 'D'], index=[1, 2, 3])
print(df4)
df5 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['A', 'B', 'C1', 'D1'], index=[2, 3, 4])
print(df5)
res1 = pd.concat([df4, df5], join='outer')  # 'outer' is the default
print(res1)
res1 = pd.concat([df4, df5], join='inner', ignore_index=True)  # only the shared columns A, B
print(res1)

# --- replacement for the removed join_axes argument ------------------------
print('=='*30)
df4 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['A', 'B', 'C', 'D'], index=[1, 2, 3])
print(df4)
df5 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['A', 'B', 'C1', 'D1'], index=[2, 3, 4])
print(df5)
# The old form pd.concat([...], axis=1, join_axes=[df4.index]) kept only the
# row labels of df4. The equivalent is to reindex the ROWS of df5 to
# df4.index. reindex_like(df4) would be wrong because it also replaces the
# columns of df5 with those of df4 and loses C1/D1.
res2 = pd.concat([df4, df5.reindex(df4.index)], axis=1)
print(res2)

# --- appending rows ---------------------------------------------------------
# DataFrame.append was removed in pandas 2.0; pd.concat does the same job.
print('=='*30)
df4 = pd.DataFrame(np.ones((3, 4)) * 0, columns=['A', 'B', 'C', 'D'])
print(df4)
df5 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['A', 'B', 'C', 'D'])
print(df5)
df6 = pd.DataFrame(np.ones((3, 4)) * 1, columns=['B', 'C', 'D', 'E'], index=[2, 3, 4])
res3 = pd.concat([df4, df5], ignore_index=True)
print(res3)
res4 = pd.concat([df4, df5, df6])  # several frames at once; missing columns become NaN
print(res4)
# Appending a single row: the Series labels must match the column names
# (A-D). Lower-case labels would create four new columns instead of a row.
s1 = pd.Series([1, 2, 3, 4], index=['A', 'B', 'C', 'D'])
res5 = pd.concat([df4, s1.to_frame().T], ignore_index=True)
print(res5)
● pandas合并merge
import pandas as pd
import numpy as np

# Merging two DataFrames by one or more key columns (comparable to a
# database join).

# --- one key column ---------------------------------------------------------
left = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], 'A': ['A0', 'A1', 'A2', 'A3'], 'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'], 'C': ['C0', 'C1', 'C2', 'C3'], 'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
print(right)
res = pd.merge(left, right, on='key')  # match rows on the 'key' column
print(res)

# --- two key columns --------------------------------------------------------
print('=='*30)
left = pd.DataFrame({'key1': ['K0', 'K0', 'K1', 'K2'], 'key2': ['K0', 'K1', 'K0', 'K1'], 'A': ['A0', 'A1', 'A2', 'A3'], 'B': ['B0', 'B1', 'B2', 'B3']})
right = pd.DataFrame({'key1': ['K0', 'K1', 'K1', 'K2'], 'key2': ['K0', 'K0', 'K0', 'K0'], 'C': ['C0', 'C1', 'C2', 'C3'], 'D': ['D0', 'D1', 'D2', 'D3']})
print(left)
print(right)
res1 = pd.merge(left, right, on=['key1', 'key2'], how='inner')  # key pairs present on both sides
print(res1)
res2 = pd.merge(left, right, on=['key1', 'key2'], how='outer')  # key pairs present on either side
print(res2)
res3 = pd.merge(left, right, on=['key1', 'key2'], how='right')  # every key pair of `right`
print(res3)

# --- indicator --------------------------------------------------------------
print('=='*30)
df1 = pd.DataFrame({'col1': [0, 1], 'col_left': ['a', 'b']})
df2 = pd.DataFrame({'col1': [0, 2, 2], 'col_right': [2, 2, 2]})
print(df1)
print(df2)
# indicator=True adds a '_merge' column telling where each row came from
# ('left_only', 'right_only' or 'both').
res1 = pd.merge(df1, df2, on='col1', how='outer', indicator=True)
# Passing a string instead gives that column a custom name.
res2 = pd.merge(df1, df2, on='col1', how='outer', indicator='indicator_column')
print(res1)
print(res2)

# --- merge on the index -----------------------------------------------------
print('=='*30)
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'], 'B': ['B0', 'B1', 'B2']}, index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'], 'B': ['D0', 'B2', 'B3']}, index=['K0', 'K2', 'K3'])
print(left)
print(right)
# left_index/right_index=True use the row labels as the join key. Both frames
# have a column 'B', so the result carries them as 'B_x' and 'B_y'.
res1 = pd.merge(left, right, left_index=True, right_index=True, how='outer')
res2 = pd.merge(left, right, left_index=True, right_index=True, how='inner')
print(res1)
print(res2)

# --- suffixes for overlapping column names ----------------------------------
print('=='*30)
boys = pd.DataFrame({'k': ['K0', 'K1', 'K2'], 'age': [1, 2, 3]})
girls = pd.DataFrame({'k': ['K0', 'K0', 'K3'], 'age': [4, 5, 6]})
print(boys)
print(girls)
# Both frames have an 'age' column; suffixes tell the two apart in the result.
res1 = pd.merge(boys, girls, on='k', suffixes=['_boy', '_girl'], how='inner')
print(res1)
● pandas plot画图
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# --- Series ---
# 1000 random steps accumulated into a random walk, drawn as a line.
data = pd.Series(np.random.randn(1000), index=np.arange(1000)).cumsum()
data.plot()
plt.show()

# --- DataFrame ---
# Four random walks (columns A-D); plot() draws one line per column.
data = pd.DataFrame(np.random.randn(1000, 4), index=np.arange(1000), columns=list('ABCD'))
print(data.head())
data = data.cumsum()
data.plot()
plt.show()

# Other plot methods: 'bar', 'hist', 'kde', 'area', 'scatter', 'hexbin', 'pie'
data = pd.DataFrame(np.random.randn(1000, 4), index=np.arange(1000), columns=list('ABCD'))
print(data.head())
data = data.cumsum()
# x and y name the columns used for the horizontal and vertical axis.
ax = data.plot.scatter(x='A', y='B', color='DarkBlue', label='Class 1')
# Passing ax=ax draws the second scatter into the same axes as the first.
data.plot.scatter(x='A', y='C', color='DarkGreen', label='Class 2', ax=ax)
plt.show()