# -*- coding: utf-8 -*-
"""
Created on Fri Oct 25 16:41:03 2019
@author: weiping
"""
import pandas as pd
import numpy as np
df1 = pd.DataFrame({'name':['a','b','c','d','f','g'],
'key1':[1,1,1,2,2,2],
'date1':[3,34,5,6,4,8]})
df2 = pd.DataFrame({'name':['a','b','c','d','e','h'],
'key1':[1,1,1,3,3,3],
'date2':[33,44,23,45,66,77]})
df1,df2
'''
数据框之间的左右连接(类似SQL中的表关联)
'''
df = pd.merge(df1,df2,on = 'name',how = 'left')#不限制how的内容 默认 inner
df
df = pd.merge(df1,df2,left_on = 'name',right_on = 'name',how = 'right')
df
df = pd.merge(df1,df2,on = ['name','key1'],how = 'left') # 联合字段关联
df
'''
索引上的连接
'''
df1 = pd.DataFrame({"key":['q','w','e','r','q','e','e'],
"data": range(7)})
df2 = pd.DataFrame({"da":[3,6,7]},index = ['a','q','e'])
df1,df2
pd.merge(df1,df2,left_on = 'key',right_index =True,how = 'right')
'''
轴向连接(pd.concat)
'''
#数组的轴向连接
arr = np.arange(12).reshape(3,4)
arr1 = np.arange(16).reshape(4,4)
np.concatenate([arr,arr],axis = 1 ) # 按列拼接
np.concatenate([arr,arr1],axis = 1)#报错 数组拼接 必须轴向数据条数一致
#数据框的轴向连接
df1 = pd.DataFrame(arr,columns = ['a','b','c','d'])
pd.concat([df1,df1]) # 默认axis = 0 按行连接
pd.concat([df1,df1],axis = 1 ) # 按列连接
'''
合并重叠数据(打补丁)
'''
a = pd.DataFrame(list(range(7)),columns=['c1'])
a.ix[2:3] = np.nan
a.ix[4:5] = np.nan
b = pd.DataFrame(list(range(4,11)),columns = ['c2'])
#np.where
c = pd.DataFrame(np.where(pd.isnull(a),b,a),columns = ['c3']) # np.where 类似 if else 返回数组
#combine_first
a['c1'].combine_first(b['c2']) # 与np.where 效果一致 返回 数据框
'''
长表 转化 宽表
'''
df = pd.DataFrame({'t1':[11,11,11,22,22,22,33,33,33],
'item':['a','b','c','a','b','c','a','b','c'],
'value':[3.4,3,4,5,6,7,8,8,9]})
df
'''
Out[59]:
t1 item value
0 11 a 3.4
1 11 b 3.0
2 11 c 4.0
3 22 a 5.0
4 22 b 6.0
5 22 c 7.0
6 33 a 8.0
7 33 b 8.0
8 33 c 9.0
'''
df2 = df.pivot('item','t1')
df2
'''
Out[61]:
value
t1 11 22 33
item
a 3.4 5.0 8.0
b 3.0 6.0 8.0
c 4.0 7.0 9.0
'''
df2 = df.pivot('t1','item')
df2
'''
Out[62]:
value
item a b c
t1
11 3.4 3.0 4.0
22 5.0 6.0 7.0
33 8.0 8.0 9.0
'''
# 宽表
test = pd.DataFrame(fake_data, columns=['subject', 'A', 'B', 'C'])
test
'''
subject A B C
0 math 88 70 60
1 english 90 80 78
'''
# 转换为长表
pd.melt(test, id_vars=['subject'])
'''
subject variable value
0 math A 88
1 english A 90
2 math B 70
3 english B 80
4 math C 60
5 english C 78
'''