pydata-data wrangling

## 等级索引
import pandas as pd
import numpy as np
data = pd.Series(np.random.randn(9),index=[['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'], [1, 2, 3, 1, 3, 1, 2, 2, 3]])
data
a 1 1.831129 2 -0.339180 3 -1.048893 b 1 0.146483 3 -0.165387 c 1 1.173373 2 -2.067447 d 2 0.571641 3 0.249290 dtype: float64
data.index #显示索引
MultiIndex(levels=[[‘a’, ‘b’, ‘c’, ‘d’], [1, 2, 3]], labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3], [0, 1, 2, 0, 2, 0, 1, 1, 2]]) ### 选择
data.b
1 0.146483 3 -0.165387 dtype: float64
data['b':'c'] #多个
b 1 0.146483 3 -0.165387 c 1 1.173373 2 -2.067447 dtype: float64
data[['b','d']] #特定位置
b 1 0.146483 3 -0.165387 d 2 0.571641 3 0.249290 dtype: float64
data.loc[:,2] #内层
a -0.339180 c -2.067447 d 0.571641 dtype: float64 ### 聚合
data
a 1 1.831129 2 -0.339180 3 -1.048893 b 1 0.146483 3 -0.165387 c 1 1.173373 2 -2.067447 d 2 0.571641 3 0.249290 dtype: float64
data.unstack()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
123
a1.831129-0.339180-1.048893
b0.146483NaN-0.165387
c1.173373-2.067447NaN
dNaN0.5716410.249290
data.unstack().stack() #取反
a 1 1.831129 2 -0.339180 3 -1.048893 b 1 0.146483 3 -0.165387 c 1 1.173373 2 -2.067447 d 2 0.571641 3 0.249290 dtype: float64 ### dataframe对象
frame = pd.DataFrame(np.arange(12).reshape((4, 3)),
 index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
 columns=[['Ohio', 'Ohio', 'Colorado'],
 ['Green', 'Red', 'Green']])
frame
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; }
OhioColorado
GreenRedGreen
a1012
2345
b1678
291011
#### 行列名
frame.index.names = ['key1','key2']
frame.columns.names = ['state','color']
frame
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; }
stateOhioColorado
colorGreenRedGreen
key1key2
a1012
2345
b1678
291011
#### 列选取
frame.Ohio
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
colorGreenRed
key1key2
a101
234
b167
2910
## 记录和排序
frame.swaplevel('key1','key2') #改变显示顺序
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; }
stateOhioColorado
colorGreenRedGreen
key2key1
1a012
2a345
1b678
2b91011
### 行索引排序
frame.sort_index(level=1)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; }
stateOhioColorado
colorGreenRedGreen
key1key2
a1012
b1678
a2345
b291011
frame.swaplevel(0,1).sort_index(level=0) #统合操作
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; }
stateOhioColorado
colorGreenRedGreen
key2key1
1a012
b678
2a345
b91011
### 运算
frame.sum(level='key2') #按指定运算
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; }
stateOhioColorado
colorGreenRedGreen
key2
16810
2121416
frame.sum(level='color',axis=1) #按指定索引和列
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
colorGreenRed
key1key2
a121
284
b1147
22010
## 列索引
frame = pd.DataFrame({'a': range(7), 'b': range(7, 0, -1),
 'c': ['one', 'one', 'one', 'two', 'two',
'two', 'two'],
 'd': [0, 1, 2, 0, 1, 2, 3]})
frame
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
abcd
007one0
116one1
225one2
334two0
443two1
552two2
661two3
#### 变列为行索引
frame2 = frame.set_index(['c','d'])
frame2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
ab
cd
one007
116
225
two034
143
252
361
frame.set_index(['c','d'],drop=False) #保留
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
abcd
cd
one007one0
116one1
225one2
two034two0
143two1
252two2
361two3
frame2.reset_index() #行变列
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
cdab
0one007
1one116
2one225
3two034
4two143
5two252
6two361
## 数据集操作 ### 合并
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'], 'data1': range(7)})
df1
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1key
00b
11b
22a
33c
44a
55a
66b
df2 = pd.DataFrame({'key': ['a', 'b', 'd'],'data2': range(3)})
df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data2key
00a
11b
22d
pd.merge(df1,df2)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1keydata2
00b1
11b1
26b1
32a0
44a0
55a0
#### 指定列
pd.merge(df1,df2,on='key')
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1keydata2
00b1
11b1
26b1
32a0
44a0
55a0
#### 分别指定
df3 = pd.DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],'data1': range(7)})
df4 = pd.DataFrame({'rkey': ['a', 'b', 'd'],'data2': range(3)})
pd.merge(df3,df4,left_on='lkey',right_on = 'rkey')
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1lkeydata2rkey
00b1b
11b1b
26b1b
32a0a
44a0a
55a0a
#### 外部模式
pd.merge(df1,df2,how='outer')
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1keydata2
00.0b1.0
11.0b1.0
26.0b1.0
32.0a0.0
44.0a0.0
55.0a0.0
63.0cNaN
7NaNd2.0

参数:
Option Behavior
‘inner’ Use only the key combinations observed in both tables
‘left’ Use all key combinations found in the left table
‘right’ Use all key combinations found in the right table
‘output’ Use all key combinations observed in both tables together

#### 全部参数展示
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],'data1': range(6)})
df2 = pd.DataFrame({'key': ['a', 'b', 'a', 'b', 'd'], 'data2': range(5)})
df1
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1key
00b
11b
22a
33c
44a
55b
df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data2key
00a
11b
22a
33b
44d
pd.merge(df1,df2,on='key',how='left')
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1keydata2
00b1.0
10b3.0
21b1.0
31b3.0
42a0.0
52a2.0
63cNaN
74a0.0
84a2.0
95b1.0
105b3.0
pd.merge(df1,df2,how='inner')
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
data1keydata2
00b1
10b3
21b1
31b3
45b1
55b3
62a0
72a2
84a0
94a2
#### 多列
left = pd.DataFrame({'key1': ['foo', 'foo', 'bar'],'key2': ['one', 'two', 'one'],'lval': [1, 2, 3]})
right = pd.DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],'key2': ['one', 'one', 'one', 'two'],'rval': [4, 5, 6, 7]})
pd.merge(left,right,on=['key1','key2',],how='outer')
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
key1key2lvalrval
0fooone1.04.0
1fooone1.05.0
2footwo2.0NaN
3barone3.06.0
4bartwoNaN7.0
pd.merge(left,right,on='key1')
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
key1key2_xlvalkey2_yrval
0fooone1one4
1fooone1one5
2footwo2one4
3footwo2one5
4barone3one6
5barone3two7
left
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
key1key2lval
0fooone1
1footwo2
2barone3
right
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
key1key2rval
0fooone4
1fooone5
2barone6
3bartwo7
pd.merge(left,right,on='key1',suffixes = ('_left','_right'))
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
key1key2_leftlvalkey2_rightrval
0fooone1one4
1fooone1one5
2footwo2one4
3footwo2one5
4barone3one6
5barone3two7

参数:
Argument Description
left DataFrame to be merged on the left side.
right DataFrame to be merged on the right side.
how One of ‘inner’, ‘outer’, ‘left’, or ‘right’; defaults to ‘inner’.
on Column names to join on. Must be found in both DataFrame objects. If not specified and no other join keys given, will use the intersection of the column names in left and right as the join keys.
left_on Columns in left DataFrame to use as join keys.
right_on Analogous to left_on for left DataFrame.
left_index Use row index in left as its join key (or keys, if a MultiIndex).
right_index Analogous to left_index.
sort Sort merged data lexicographically by join keys; True by default (disable to get better performance in some cases on large datasets).
suffixes Tuple of string values to append to column names in case of overlap; defaults to (‘_x’, ‘_y’) (e.g., if ‘data’ in both DataFrame objects, would appear as ‘data_x’ and ‘data_y’ in result).
copy If False, avoid copying data into resulting data structure in some exceptional cases; by default always copies.
indicator
Adds a special column _merge that indicates the source of each row; values will be ‘left_only’, ‘right_only’, or ‘both’ based on the origin of the joined data in each row.

### 索引联结
 left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],'value': range(6)})
 right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
left1
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
keyvalue
0a0
1b1
2a2
3a3
4b4
5c5
right1
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
group_val
a3.5
b7.0
pd.merge(left1,right1,left_on = 'key',right_index = True)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
keyvaluegroup_val
0a03.5
2a23.5
3a33.5
1b17.0
4b47.0
pd.merge(left1,right1,left_on = 'key',right_index = True,how= 'outer')
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
keyvaluegroup_val
0a03.5
2a23.5
3a33.5
1b17.0
4b47.0
5c5NaN
#### 加之嵌套索引
lefth = pd.DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],'key2': [2000, 2001, 2002, 2001, 2002], 'data': np.arange(5.)})
lefth
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
datakey1key2
00.0Ohio2000
11.0Ohio2001
22.0Ohio2002
33.0Nevada2001
44.0Nevada2002
righth = pd.DataFrame(np.arange(12).reshape((6, 2)),index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],[2001, 2000, 2000, 2000, 2001, 2002]],columns=['event1', 'event2'])
righth
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
event1event2
Nevada200101
200023
Ohio200045
200067
200189
20021011
pd.merge(lefth,righth,left_on = ['key1','key2'],right_index = True)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
datakey1key2event1event2
00.0Ohio200045
00.0Ohio200067
11.0Ohio200189
22.0Ohio20021011
33.0Nevada200101
pd.merge(lefth,righth,left_on = ['key1','key2'],right_index = True,how='outer')
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
datakey1key2event1event2
00.0Ohio20004.05.0
00.0Ohio20006.07.0
11.0Ohio20018.09.0
22.0Ohio200210.011.0
33.0Nevada20010.01.0
44.0Nevada2002NaNNaN
4NaNNevada20002.03.0
### 保留左右
 left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],
   ....:                      index=['a', 'c', 'e'],
   ....:                      columns=['Ohio', 'Nevada'])

right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
....:                       index=['b', 'c', 'd', 'e'],
   ....:                       columns=['Missouri', 'Alabama'])
left2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
OhioNevada
a1.02.0
c3.04.0
e5.06.0
right2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
MissouriAlabama
b7.08.0
c9.010.0
d11.012.0
e13.014.0
pd.merge(left2,right2,how='outer',left_index = True,right_index= True)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
OhioNevadaMissouriAlabama
a1.02.0NaNNaN
bNaNNaN7.08.0
c3.04.09.010.0
dNaNNaN11.012.0
e5.06.013.014.0
left2.join(right2,how='outer') #等价写法
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
OhioNevadaMissouriAlabama
a1.02.0NaNNaN
bNaNNaN7.08.0
c3.04.09.010.0
dNaNNaN11.012.0
e5.06.013.014.0
left1.join(right1,on='key')
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
keyvaluegroup_val
0a03.5
1b17.0
2a23.5
3a33.5
4b47.0
5c5NaN
### 其他
another = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]], index=['a', 'c', 'e', 'f'],columns=['New York', 'Oregon'])
another
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
New YorkOregon
a7.08.0
c9.010.0
e11.012.0
f16.017.0
left2.join([right2,another])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
OhioNevadaMissouriAlabamaNew YorkOregon
a1.02.0NaNNaN7.08.0
c3.04.09.010.09.010.0
e5.06.013.014.011.012.0
left2.join([right2,another],how='outer')
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
OhioNevadaMissouriAlabamaNew YorkOregon
a1.02.0NaNNaN7.08.0
bNaNNaN7.08.0NaNNaN
c3.04.09.010.09.010.0
dNaNNaN11.012.0NaNNaN
e5.06.013.014.011.012.0
fNaNNaNNaNNaN16.017.0
### 用轴联结
arr = np.arange(12).reshape((3,4))
arr
array([[ 0, 1, 2, 3], [ 4, 5, 6, 7], [ 8, 9, 10, 11]])
np.concatenate([arr,arr],axis=1)
array([[ 0, 1, 2, 3, 0, 1, 2, 3], [ 4, 5, 6, 7, 4, 5, 6, 7], [ 8, 9, 10, 11, 8, 9, 10, 11]]) #### concat
s1 = pd.Series([0, 1], index=['a', 'b'])

s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])

s3 = pd.Series([5, 6], index=['f', 'g'])
pd.concat([s1,s2,s3])
a 0 b 1 c 2 d 3 e 4 f 5 g 6 dtype: int64
pd.concat([s1,s2,s3],axis=1) #列
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
012
a0.0NaNNaN
b1.0NaNNaN
cNaN2.0NaN
dNaN3.0NaN
eNaN4.0NaN
fNaNNaN5.0
gNaNNaN6.0
#### 内联结
s4 = pd.concat([s1,s3])
s4
a 0 b 1 f 5 g 6 dtype: int64
pd.concat([s1,s4],axis=1)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
01
a0.00
b1.01
fNaN5
gNaN6
pd.concat([s1,s4],axis=1,join='inner')
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
01
a00
b11
#### 指定行
pd.concat([s1,s4],axis=1,join_axes=[['a','c','b','e']])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
01
a0.00.0
cNaNNaN
b1.01.0
eNaNNaN
#### 同时产生嵌套索引
result= pd.concat([s1,s1,s3],keys = ['one','two','three'])
result
one a 0 b 1 two a 0 b 1 three f 5 g 6 dtype: int64
result.unstack() #堆叠
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
abfg
one0.01.0NaNNaN
two0.01.0NaNNaN
threeNaNNaN5.06.0
#### 列
 pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
onetwothree
a0.0NaNNaN
b1.0NaNNaN
cNaN2.0NaN
dNaN3.0NaN
eNaN4.0NaN
fNaNNaN5.0
gNaNNaN6.0
### dataframe对象
df1 = pd.DataFrame(np.arange(6).reshape(3, 2), index=['a', 'b', 'c'], columns=['one', 'two'])

df2 = pd.DataFrame(5 + np.arange(4).reshape(2, 2), index=['a', 'c'], columns=['three', 'four'])
df1
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
onetwo
a01
b23
c45
df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
threefour
a56
c78
pd.concat([df1,df2],axis=1,keys = ['level1','level2'])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; }
level1level2
onetwothreefour
a015.06.0
b23NaNNaN
c457.08.0
#### dict写法
pd.concat({'level1':df1,'level2':df2},axis=1)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; }
level1level2
onetwothreefour
a015.06.0
b23NaNNaN
c457.08.0
#### 命名
pd.concat([df1,df2],axis=1,keys=['level1','level2'],names = ['upper','lower'])
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; }
upperlevel1level2
loweronetwothreefour
a015.06.0
b23NaNNaN
c457.08.0
### 不含任何相同索引
df1 = pd.DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])

df2 = pd.DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
df1
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
abcd
00.636284-0.256463-0.2977012.199012
10.154697-1.487730-0.0090240.268773
21.0724181.6638950.114226-0.298354
df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
bda
0-0.599669-0.6713641.454462
1-0.132970-2.897728-2.880973
pd.concat([df1,df2],ignore_index=True) #强行
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
abcd
00.636284-0.256463-0.2977012.199012
10.154697-1.487730-0.0090240.268773
21.0724181.6638950.114226-0.298354
31.454462-0.599669NaN-0.671364
4-2.880973-0.132970NaN-2.897728

concat参数:
Argument Description
objs List or dict of pandas objects to be concatenated; this is the only required argument
axis Axis to concatenate along; defaults to 0 (along rows)
join Either ‘inner’ or ‘outer’ (‘outer’ by default); whether to intersection (inner) or union (outer) together indexes along the other axes
join_axes Specific indexes to use for the other n–1 axes instead of performing union/intersection logic
keys Values to associate with objects being concatenated, forming a hierarchical index along the concatenation axis; can either be a list or array of arbitrary values, an array of tuples, or a list of arrays (if multiple-level arrays passed in levels)
levels Specific indexes to use as hierarchical index level or levels if keys passed
names Names for created hierarchical levels if keys and/or levels passed
verify_integrity Check new axis in concatenated object for duplicates and raise exception if so; by default (False) allows duplicates
ignore_index Do not preserve indexes along concatenation axis, instead producing a new range(total_length) index

### 含重复项
a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan], index=['f', 'e', 'd', 'c', 'b', 'a'])

b = pd.Series(np.arange(len(a), dtype=np.float64), index=['f', 'e', 'd', 'c', 'b', 'a'])
b[-1] = np.nan
a
f NaN e 2.5 d NaN c 3.5 b 4.5 a NaN dtype: float64
b
f 0.0 e 1.0 d 2.0 c 3.0 b 4.0 a NaN dtype: float64 ### where
np.where(pd.isnull(a),b,a)
array([0. , 2.5, 2. , 3.5, 4.5, nan]) ### 不知道
b[:-2].combine_first(a[2:])
a NaN b 4.5 c 3.0 d 2.0 e 1.0 f 0.0 dtype: float64 #### dataframe
df1 = pd.DataFrame({'a': [1., np.nan, 5., np.nan],
   .....:                     'b': [np.nan, 2., np.nan, 6.],
   .....:                     'c': range(2, 18, 4)})

df2 = pd.DataFrame({'a': [5., 4., np.nan, 3., 7.],
   .....:                     'b': [np.nan, 3., 4., 6., 8.]})
df1
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
abc
01.0NaN2
1NaN2.06
25.0NaN10
3NaN6.014
df2
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
ab
05.0NaN
14.03.0
2NaN4.0
33.06.0
47.08.0
df1.combine_first(df2)
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
abc
01.0NaN2.0
14.02.06.0
25.04.010.0
33.06.014.0
47.08.0NaN
## 变形记 ### 嵌套索引变形 ### 列to行
data = pd.DataFrame(np.arange(6).reshape((2, 3)),index=pd.Index(['Ohio', 'Colorado'], name='state'),columns=pd.Index(['one', 'two', 'three'],name='number'))
data
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
numberonetwothree
state
Ohio012
Colorado345
result = data.stack() 
result
state number Ohio one 0 two 1 three 2 Colorado one 3 two 4 three 5 dtype: int64 #### 行to列
result.unstack()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
numberonetwothree
state
Ohio012
Colorado345
result.unstack(0) #更改默认方向
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
stateOhioColorado
number
one03
two14
three25
result.unstack('state') #等价写法
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
stateOhioColorado
number
one03
two14
three25
##### 找不到则为NA
s1 = pd.Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])

s2 = pd.Series([4, 5, 6], index=['c', 'd', 'e'])

data2 = pd.concat([s1, s2], keys=['one', 'two'])
data2.unstack()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
abcde
one0.01.02.03.0NaN
twoNaNNaN4.05.06.0
data2.unstack().stack()
one a 0.0 b 1.0 c 2.0 d 3.0 two c 4.0 d 5.0 e 6.0 dtype: float64
data2.unstack().stack(dropna=False) #不丢
one a 0.0 b 1.0 c 2.0 d 3.0 e NaN two a NaN b NaN c 4.0 d 5.0 e 6.0 dtype: float64 #### dataframe
df = pd.DataFrame({'left': result, 'right': result + 5}, columns=pd.Index(['left', 'right'], name='side'))
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
sideleftright
statenumber
Ohioone05
two16
three27
Coloradoone38
two49
three510
df.unstack('state')
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead tr th { text-align: left; } .dataframe thead tr:last-of-type th { text-align: right; }
sideleftright
stateOhioColoradoOhioColorado
number
one0358
two1469
three25710
#### 同时指定
df.unstack('state').stack('side')
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
stateColoradoOhio
numberside
oneleft30
right85
twoleft41
right96
threeleft52
right107
### 长变宽
data = pd.read_csv('/Users/meininghang/Downloads/pydata-book-2nd-edition/examples/macrodata.csv')
data.head()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
yearquarterrealgdprealconsrealinvrealgovtrealdpicpim1tbilrateunemppopinflrealint
01959.01.02710.3491707.4286.898470.0451886.928.98139.72.825.8177.1460.000.00
11959.02.02778.8011733.7310.859481.3011919.729.15141.73.085.1177.8302.340.74
21959.03.02775.4881751.8289.226491.2601916.429.35140.53.825.3178.6572.741.09
31959.04.02785.2041753.7299.356484.0521931.329.37140.04.335.6179.3860.274.06
41960.01.02847.6991770.5331.722462.1991955.529.54139.63.505.2180.0072.311.19
periods = pd.PeriodIndex(year=data.year,quarter=data.quarter,name='date')
columns = pd.Index(['realgdp','infl','unemp'],name='item')
data = data.reindex(columns=columns)
data.index = periods.to_timestamp('D','end')
ldata = data.stack().reset_index().rename(columns={0:'value'})
ldata.head()
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
dateitemvalue
01959-03-31realgdp2710.349
11959-03-31infl0.000
21959-03-31unemp5.800
31959-06-30realgdp2778.801
41959-06-30infl2.340
### 宽变长
df = pd.DataFrame({'key': ['foo', 'bar', 'baz'],
   .....:                    'A': [1, 2, 3],
   .....:                    'B': [4, 5, 6],
   .....:                    'C': [7, 8, 9]})
df
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
ABCkey
0147foo
1258bar
2369baz
melted = pd.melt(df,['key'])
melted
.dataframe tbody tr th:only-of-type { vertical-align: middle; } .dataframe tbody tr th { vertical-align: top; } .dataframe thead th { text-align: right; }
keyvariablevalue
0fooA1
1barA2
2bazA3
3fooB4
4barB5
5bazB6
6fooC7
7barC8
8bazC9
reshaped = melted.pivot('key','variable','value')
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值