pandas数据合并

数据合并 pd.merge() .join()

from __future__ import division
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from scipy.interpolate import lagrange #导入拉格朗日插值函数
from pandas import Series,DataFrame

df1=DataFrame({'key':['b','b','a','c','a','a','b'],
             'data1':range(7)})
df2=DataFrame({'key':['a','b','d'],
              'data2':range(3)})
pd.merge(df1,df2,on='key') #pd.merge()两表用来对齐的一列必须具有相同的列名,通过指定健为key

df3=DataFrame({'Ikey':['b','b','a','c','a','a','b'],
              'data1':range(7)})
df4=DataFrame({'rkey':['a','b','d'],
              'data2':range(3)})
pd.merge(df3,df4,left_on='Ikey',right_on='rkey')

pd.merge(df1,df2,how='outer')#how指定连接方式,默认inner

pd.merge(df1,df2,how='inner')

pd.merge(df1,df2,how='left')

left=DataFrame({'key1':['foo','foo','bar'],
                'key2':['one','two','one'],
               'lval':[1,2,3]})
right=DataFrame({'key1':['foo','foo','bar','bar'],
                'key2':['one','one','one','two'],
                'lval':[4,5,6,7]})
pd.merge(left,right,on=['key1','key2'],how='outer')

pd.merge(left,right,on='key1',suffixes=('_left','_right'))

left1=DataFrame({'key':['a','b','a','a','b','c'],
                'value':range(6)})
right2=DataFrame({'group_val':[3.5,7]},index=['a','b'])
pd.merge(left1,right2,left_on='key',right_index=True)

pd.merge(left1,right2,left_on='key',right_index=True,how='outer')

lefth=DataFrame({'key1':['Ohio','Ohio','Ohio','Nevada','Nevada'],
                'key2':[2000,2001,2002,2001,2002],
                'data':np.arange(5.)},columns=['key1','key2','data'])
righth=DataFrame(np.arange(12).reshape((6,2)),index=[['Nevada','Nevada','Ohio','Ohio','Ohio','Ohio'],[2001,2000,2000,2000,2001,2002]],columns=['event1','event2'])
righth

lefth

pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True)

pd.merge(lefth,righth,left_on=['key1','key2'],right_index=True,how='outer')

left2=DataFrame([[1.,2.],[3.,4.],[5.,6.]],index=['a','c','e'],columns=['Ohio','Nevada'])
right2=DataFrame([[7.,8.],[9.,10.],[11.,12.],[13.,14.]],index=['b','c','d','e'],columns=['Missouri','Alabama'])
pd.merge(left2,right2,how='outer',left_index=True,right_index=True)

left2.join(right2,how='inner')#dataframe内置的join方法是一种快速合并的方法。它默认以index作为对齐的列。

another=DataFrame([[7.,8.],[9.,10.],[11.,12.],[13.,14.]],index=['a','c','e','f'],columns=['New York','Oregon'])
left2.join([right2,another])

left2.join([right2,another],how='outer')

 轴向连接 pd.concatenate(),pd.concat()

轴向连接

arr=np.arange(12).reshape((3,4))
arr

np.concatenate([arr,arr],axis=1)

np.concatenate([arr,arr])

s1=Series([0,1],index=['a','b'])
s2=Series([2,3,4],index=['c','d','e'])
s3=Series([5,6],index=['f','g'])
pd.concat([s1,s2,s3])

pd.concat([s1,s2,s3],axis=1)

s4=pd.concat([s1*5,s3])
s4

pd.concat([s1,pd.concat([s1*5,s3])],axis=1)

pd.concat([s1,s4],axis=1,join='inner')

pd.concat([s1,s4],join='inner')

pd.concat([s1,s4],axis=1,join_axes=[['a','c','b','e']])

result=pd.concat([s1,s2,s3],keys=['one','two','three'])
result

result.unstack()#转2维

pd.concat([s1,s2,s3],axis=1,keys=['one','two','three'])

df1=DataFrame(np.arange(6).reshape(3,2),index=['a','b','c'],columns=['one','two'])
df2=DataFrame(5+np.arange(4).reshape(2,2),index=['a','c'],columns=['three','four'])
pd.concat([df1,df2],axis=1,keys=['level1','level2'])

pd.concat({'level1':df1,'level2':df2},axis=1)

pd.concat([df1,df2],axis=1,keys=['level1','level2'],names=['upper','lower'])

df1=DataFrame(np.random.randn(3,4),columns=['a','b','c','d'])
df1

df2=DataFrame(np.random.randn(2,3),columns=['b','d','a'])
df2

pd.concat([df1,df2])

合并重叠数据 .combine_first()

合并重叠数据

a=Series([np.nan,2.5,np.nan,3.5,4.5,np.nan],index=['f','e','d','c','b','a'])
b=Series(np.arange(len(a),dtype=np.float64),index=['f','e','d','c','b','a'])
b[-1]=np.nan
print(a,b)

np.where(pd.isnull(a),b,a) #用b值填补a的缺失值

a.combine_first(b) #用b的数据填补a的缺失值

b[:-2]

a[2:]

b[:-2].combine_first(a[2:]) #用a[2:]的数据填补b[:-2]的缺失值

df1=DataFrame({'a':[1.,np.nan,5.,np.nan],
              'b':[np.nan,2.,np.nan,6.],
              'c':range(2,18,4)})
df1

df2=DataFrame({'a':[5.,4.,np.nan,3.,7.],'b':[np.nan,3.,4.,6.,8.]})
df2

df1.combine_first(df2)

重塑层次化索引 data.unstack() data

重塑层次化索引

data=DataFrame(np.arange(6).reshape((2,3)),index=pd.Index(['Ohio','Colorado'],name='state'),columns=pd.Index(['one','two','three'],name='number'))
data

result=data.stack()
result

result.unstack()

result.unstack(0)

result.unstack('state')

s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])
data2

data2.unstack()

data2.unstack().stack()

data2.unstack().stack(dropna=False)

df=DataFrame({'left':result,'right':result+5},columns=pd.Index(['left','right'],name='side'))
df

df.unstack('state')

df.unstack('state').stack('side')

长宽格式的转换 data.pivot()透视表

长宽格式的转换

data=pd.read_csv('E:/python/wangyiPython/the sixth week/data/macrodata.csv')
data

periods=pd.PeriodIndex(year=data.year,quarter=data.quarter,name='date')
periods

data=DataFrame(data.to_records(),columns=pd.Index(['realgdp','infl','unemp'],name='item'),index=periods.to_timestamp('D','end'))
data

idata=data.stack().reset_index().rename(columns={0:'value'})#.reset_index()重置索引 .rename(columns={0:'value'})将新列命名为value
idata

wdata=idata.pivot('date','item','value')#透视表,行,列,值
wdata

pivoted=idata.pivot('date','item','value')
pivoted.head(10)#查看前10行

idata['value2']=np.random.randn(len(idata))
idata[:10]

pivoted=idata.pivot('date','item')
pivoted[:10]

pivoted['value'][:10]

unstacked=idata.set_index(['date','item'])
unstacked

unstacked.unstack('item')

 

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值