# Series
import pandas as pd
# A Series built from a plain list gets the default integer index.
s1 = pd.Series(data=[4, -7, -5, 3])
# Show the Series, then its values array, then its index.
print(s1, s1.values, s1.index, "********", sep="\n")

# A Series with explicit string labels as its index.
s2 = pd.Series(data=[4.0, 6.5, -0.5, 4.2], index=list('dbac'))
# Show the Series, one element looked up by label, and a reordered
# selection made with a list of labels.
print(s2, s2['a'], s2[['a', 'b', 'c', 'd']], "********", sep="\n")

# A Series can be thought of as a fixed-length, ordered dict:
# constructing it from a dict uses the keys as the index.
dic1 = dict(apple=5, pen=3, applepen=10)
s3 = pd.Series(data=dic1)
print(s3)
# DataFrame
# Column-oriented input: every key becomes one column of the frame.
data = dict(
    year=[2014, 2015, 2016, 2017],
    income=[10000, 30000, 50000, 80000],
    pay=[5000, 20000, 30000, 3000],
)
df1 = pd.DataFrame(data=data)

# Print a handful of basic attributes, separated by a row of asterisks.
print(
    df1.columns,     # column labels
    df1.index,       # row labels
    df1.values,      # the data as a 2-D array
    df1.describe(),  # summary statistics per column
    df1.T,           # transposed frame
    sep="\n********\n",
)
import pandas as pd
import numpy as np
# 3x4 frame holding 0..11 with default labels on both axes.
df2 = pd.DataFrame(data=np.arange(12).reshape(3, 4))
# Same values, but with deliberately unsorted row and column labels.
df3 = pd.DataFrame(
    data=np.arange(12).reshape(3, 4),
    index=list('acb'),
    columns=[2, 33, 44, 5],
)
df4 = df3.sort_index(axis='columns')  # order by column label
df5 = df3.sort_index(axis='index')    # order by row label
df6 = df3.sort_values(44)             # order rows by the values in column 44
# 筛选数据 (Selecting data)
import pandas as pd
import numpy as np
# Five consecutive days starting 2017-01-01 serve as the row labels.
dates = pd.date_range(start='20170101', periods=5)
df1 = pd.DataFrame(
    data=np.arange(15).reshape(5, 3),
    index=dates,
    columns=list('ABC'),
)

# Each selection below is printed in turn, separated by a row of asterisks.
print(
    # Plain [] indexing
    df1['A'],                     # one column as a Series (same as df1.A)
    df1[0:2],                     # rows at positions 0 and 1
    df1['20170102':'20170104'],   # rows sliced by date label
    # Label-based selection with .loc
    df1.loc['20170102'],
    df1.loc['20170101', ['A', 'C']],
    df1.loc[:, ['A', 'B']],
    # Position-based selection with .iloc
    df1.iloc[2],                  # row at position 2, i.e. the third row
    df1.iloc[1:3, 2:3],
    df1.iloc[[1, 2, 4], [1, 2]],
    sep="\n********\n",
)
# 赋值及操作 (Assignment and other operations)
import pandas as pd
import numpy as np
# Row labels are the plain integers 20170101..20170106 (not datetime values).
dates = np.arange(20170101, 20170107)
df1 = pd.DataFrame(np.arange(24).reshape((6, 4)), index=dates, columns=['A', 'B', 'C', 'D'])

df1.iloc[2, 2] = 100           # assign one cell by position
df1.loc[20170102, 'B'] = 200   # assign one cell by label
df1[df1.A > 10] = 0            # zero out every row whose A value exceeds 10
# Use a single .loc call rather than chained indexing (df1.A[mask] = 1):
# chained assignment writes to a temporary object and does not update df1
# when copy-on-write is active.
df1.loc[df1.A == 0, 'A'] = 1

df1['E'] = 10                                            # new column from a scalar
df1['F'] = pd.Series([1, 2, 3, 4, 5, 6], index=dates)    # new column aligned on the index
df1.loc[20170107, ['A', 'B', 'C']] = [1, 2, 3]           # new row; the other columns become NaN

s1 = pd.Series([1, 2, 3, 4, 5, 6], index=['A', 'B', 'C', 'D', 'E', 'F'])
s1.name = 'S1'
# DataFrame.append was removed in pandas 2.0. Turning the Series into a
# one-row frame (labelled by its name) and concatenating is the replacement.
df2 = pd.concat([df1, s1.to_frame().T])

df1.insert(1, 'G', df2['E'])           # insert df2's E column at position 1 under the name G
g = df1.pop('G')                       # remove column G and keep it
df1.insert(len(df1.columns), 'G', g)   # re-insert it as the last column
del df1['G']                           # delete column G again

df2 = df1.drop(['A', 'B'], axis=1)             # drop columns A and B (axis=1 means columns)
df2 = df1.drop([20170101, 20170102], axis=0)   # drop two rows by label (axis=0 means rows)
# 处理空值 (Handling missing values)
# axis picks what is examined and dropped: 0 / 'index' means rows,
# 1 / 'columns' means columns.
# how='any' drops when at least one value is missing; how='all' drops only
# when every value is missing.
# None of the calls below is assigned or made in place, so df2 is unchanged.
df2.dropna(how='any', axis='index')     # result without the rows that contain a missing value
df2.dropna(how='any', axis='columns')   # result without the columns that contain a missing value
df2.fillna(0)                           # result with missing values replaced by 0
df2.isna()                              # boolean mask marking the missing values
np.any(df2.isna())                      # checks whether any value is missing
np.all(df2.isna())                      # checks whether all values are missing
# concat数据合并 (Merging data with concat)
# Three 3x4 frames with identical column labels, holding 0..11, 12..23 and 24..35.
df1 = pd.DataFrame(data=np.arange(0, 12).reshape(3, 4), columns=list('abcd'))
df2 = pd.DataFrame(data=np.arange(12, 24).reshape(3, 4), columns=list('abcd'))
df3 = pd.DataFrame(data=np.arange(24, 36).reshape(3, 4), columns=list('abcd'))

# Stack vertically; every frame keeps its own 0..2 row labels.
df4 = pd.concat(objs=[df1, df2, df3], axis='index')
# Stack vertically and renumber the rows instead of keeping the old index.
df5 = pd.concat(objs=[df1, df2, df3], axis='index', ignore_index=True)
# Place the frames side by side.
df6 = pd.concat(objs=[df1, df2, df3], axis='columns')
# 外接 内接 (Outer join / inner join)
# Two frames that share only columns 'a' and 'c'.
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)), columns=['a', 'b', 'c', 'f'])
df2 = pd.DataFrame(np.arange(12, 24).reshape((3, 4)), columns=['a', 'c', 'd', 'e'])
# Outer join: keep the union of the columns; cells with no data become NaN.
df3 = pd.concat([df1, df2], join='outer', ignore_index=True)
# Inner join: keep only the columns both frames have in common.
df4 = pd.concat([df1, df2], join='inner', ignore_index=True)

# Frames with a different number of rows (3 vs. 4) for side-by-side concatenation.
df1 = pd.DataFrame(np.arange(12).reshape((3, 4)), columns=['a', 'b', 'c', 'f'])
df2 = pd.DataFrame(np.arange(12, 24).reshape((4, 3)), columns=['a', 'c', 'd'])
# Side-by-side concatenation restricted to df1's row labels. The join_axes
# argument was removed from pd.concat in pandas 1.0; reindexing the result
# on df1.index is the replacement.
df3 = pd.concat([df1, df2], axis=1).reindex(df1.index)
# Side-by-side concatenation keeping every row label; df1's columns are NaN
# in the extra row.
df4 = pd.concat([df1, df2], axis=1)
# merge合并 (Combining with merge)
# Two frames that share the 'key' column with identical key values.
df_left = pd.DataFrame(dict(
    key=['K0', 'K1', 'K2', 'K3'],
    A=['A0', 'A1', 'A2', 'A3'],
    B=['B0', 'B1', 'B2', 'B3'],
))
df_right = pd.DataFrame(dict(
    key=['K0', 'K1', 'K2', 'K3'],
    C=['C0', 'C1', 'C2', 'C3'],
    D=['D0', 'D1', 'D2', 'D3'],
))
# Match rows whose 'key' values are equal.
df = df_left.merge(df_right, on='key')
# 外连接 内连接 左连接 右连接 (Outer / inner / left / right joins)
# Two frames joined on the composite key (key1, key2); only some key pairs
# occur in both frames.
df1 = pd.DataFrame(dict(
    key1=['K0', 'K0', 'K1', 'K2'],
    key2=['K0', 'K1', 'K0', 'K1'],
    A=['A0', 'A1', 'A2', 'A3'],
    B=['B0', 'B1', 'B2', 'B3'],
))
df2 = pd.DataFrame(dict(
    key1=['K0', 'K1', 'K1', 'K3'],
    key2=['K0', 'K0', 'K0', 'K0'],
    C=['C0', 'C1', 'C2', 'C3'],
    D=['D0', 'D1', 'D2', 'D3'],
))

# how is one of 'left', 'right', 'inner', 'outer'; merge uses 'inner' when
# it is not given.
df_outer = df1.merge(df2, how='outer', on=['key1', 'key2'])
df_inner = df1.merge(df2, how='inner', on=['key1', 'key2'])
df_left = df1.merge(df2, how='left', on=['key1', 'key2'])
df_right = df1.merge(df2, how='right', on=['key1', 'key2'])

# indicator=True adds a column recording which side each row came from.
df_outer1 = df1.merge(df2, how='outer', on=['key1', 'key2'], indicator=True)
# Passing a string instead names that extra column 'indicator_column'.
df_outer2 = df1.merge(df2, how='outer', on=['key1', 'key2'], indicator='indicator_column')
# Frames whose join keys live in the row index rather than in a column.
df1 = pd.DataFrame(
    data=dict(A=['A0', 'A1', 'A2'], B=['B0', 'B1', 'B2']),
    index=['K0', 'K1', 'K2'],
)
df2 = pd.DataFrame(
    data=dict(C=['C0', 'C2', 'C3'], D=['D0', 'D2', 'D3']),
    index=['K0', 'K2', 'K3'],
)
# Outer merge that matches rows on the index labels of both frames.
df = df1.merge(df2, how='outer', left_index=True, right_index=True)
# Both frames carry an 'age' column in addition to the join key 'k'.
df_boys = pd.DataFrame(dict(k=['K0', 'K1', 'K2'], age=[1, 2, 3]))
df_girls = pd.DataFrame(dict(k=['K0', 'K0', 'K3'], age=[4, 5, 6]))
# The suffixes tell the two clashing 'age' columns apart in the result.
df_all = df_boys.merge(df_girls, how='outer', on='k', suffixes=['_boy', '_girl'])