Pandas数据分析之数据规整:连接、联合与重塑
文章目录
一、分层索引
分层索引允许一个轴上拥有多个(两个或更多)索引层级;借助分层索引对象可以进行部分索引,从而简洁地选取数据子集
data = pd.Series(np.random.randn(9),
index=[['a', 'a', 'a', 'b', 'b', 'c', 'c','d','d'],
[1, 2, 3, 1, 3, 1, 2, 2, 3]])
"""
a 1 1.096478
2 0.657470
3 -0.815734
b 1 0.913964
3 -0.810128
c 1 -0.356851
2 1.417345
d 2 0.348536
3 -0.556042
dtype: float64
"""
data.index #多个索引
"""
MultiIndex([('a', 1),
('a', 2),
('a', 3),
('b', 1),
('b', 3),
('c', 1),
('c', 2),
('d', 2),
('d', 3)],
)
"""
data['b']
data['b':'c']
data.loc[['b', 'd']]
data.loc[:, 2]
"""
a -0.468684
c 1.084225
d -1.298256
dtype: float64
"""
data.unstack() # 重新排列
"""
1 2 3
a 0.644685 1.830816 -0.675091
b 0.429473 NaN 0.641919
c -0.111824 -1.343299 NaN
d NaN 0.219104 -0.726813
"""
data.unstack().stack() #unstack反函数,还原
frame = pd.DataFrame(
np.arange(12).reshape((4, 3)),
index=[['a', 'a', 'b', 'b'],[1, 2, 1, 2]],
columns=[['Ohio', 'Ohio', 'Colorado'],
['Green', 'Red', 'Green']])
"""
Ohio Colorado
Green Red Green
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
"""
frame.index.names = ['key1', 'key2'] #设置索引名字
frame.columns.names = ['state', 'color']
"""
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
2 3 4 5
b 1 6 7 8
2 9 10 11
"""
#一个MultiIndex对象使用其自身的构造函数自定义
pd.MultiIndex.from_arrays([['Ohio', 'Ohio', 'Colorado'],
['Green','Red', 'Green']],
names=['state', 'color'])
1、重排序和层级排序
swaplevel接受两个层级序号或层级名称,返回层级变更的新对象,数据不变
sort_index只能在单一层级上对数据进行排序
frame.swaplevel('key1', 'key2') #交换index两个索引
"""
state Ohio Colorado
color Green Red Green
key2 key1
1 a 0 1 2
2 a 3 4 5
1 b 6 7 8
2 b 9 10 11
"""
frame.sort_index(level=1) # 对key2进行排序
"""
state Ohio Colorado
color Green Red Green
key1 key2
a 1 0 1 2
b 1 6 7 8
a 2 3 4 5
b 2 9 10 11
"""
#key1和key2交换后,按key2排序
frame.swaplevel(0, 1).sort_index(level=0)
"""
state Ohio Colorado
color Green Red Green
key2 key1
1 a 0 1 2
b 6 7 8
2 a 3 4 5
b 9 10 11
"""
2、按层级进行汇总统计
DataFrame和Series很多描述性和汇总性的函数都有level参数,可以指定轴进行聚合
frame.sum(level='key2') #可指定轴的层级
frame.sum(level='color', axis=1) #按列
#注:新版pandas已弃用sum的level参数,可改用frame.groupby(level=...).sum()等价实现
3、使用DataFrame的列进行索引
frame = pd.DataFrame({
'a': range(7),
'b': range(7, 0, -1),
'c': ['one', 'one','one', 'two','two','two', 'two'],
'd': [0, 1, 2, 0, 1, 2, 3]})
"""
a b c d
0 0 7 one 0
1 1 6 one 1
2 2 5 one 2
3 3 4 two 0
4 4 3 two 1
5 5 2 two 2
6 6 1 two 3
"""
frame2 = frame.set_index(['c', 'd'])
frame.set_index(['c', 'd'], drop=False) #保留c,d列
frame2.reset_index() #set_index的反函数,索引会移动到列中
"""
a b
c d
one 0 0 7
1 1 6
2 2 5
two 0 3 4
1 4 3
2 5 2
3 6 1
a b c d
c d
one 0 0 7 one 0
1 1 6 one 1
2 2 5 one 2
two 0 3 4 two 0
1 4 3 two 1
2 5 2 two 2
3 6 1 two 3
"""
二、联合与合并数据集
Pandas.merge根据一个或多个键将行进行连接,它实现的就是数据库的join操作
Pandas.concat对象在轴向上将多个对象堆叠到一起
combine_first允许将重叠数据拼接在一起,用一个对象中的值填充另一个对象中的缺失值。
1、数据库风格的DataFrame连接——merge
Pandas.merge默认内连接(inner join),两张表的交集,参数how=inner/left/right/outer指定为内连接/左连接/右连接/外连接
DataFrame进行列-列连接合并时,索引对象会被丢弃
参数 | 描述 |
---|---|
left | 合并操作时左边的DataFrame |
right | 合并操作时右边的DataFrame |
how | inner、left、right、outer,默认inner |
left_on | left DataFrame中作为连接键的列 |
right_on | right DataFrame中作为连接键的列 |
left_index | 使用left的行索引作为连接键 |
right_index | 使用right的行索引作为连接键 |
sort | 通过连接键按字母顺序对合并的数据进行排序,默认为True,处理大数据时可为False |
suffixes | 在列名重叠时,添加到列名后的字符串元组,默认('_x', '_y'),若含有data列,则为('data_x', 'data_y') |
copy | 默认为True,False会避免将数据复制到数据结构中 |
indicator | 添加特殊列_merge,指示每行来源:left_only 、right_only、both |
df1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1': range(7)})
df2 = pd.DataFrame({'key': ['a', 'b', 'd'],'data2': range(3)})
pd.merge(df1, df2) #默认重叠的列名作为连接键
pd.merge(df1, df2, on='key') # 指定连接键
pd.merge(df3, df4, left_on='lkey', right_on='rkey') #列名不同指定列名
pd.merge(df1, df2, how='outer') #外连接,并集
pd.merge(df1, df2, on='key', how='left') #多对多连接是行的笛卡尔积
pd.merge(left, right, on=['key1', 'key2'], how='outer') #多键合并,看作一个元组作为单个连接键
pd.merge(left, right, on='key1', suffixes=('_left', '_right')) #合并后列名重叠时可指定列名
2、根据索引合并——join&merge
left1 = pd.DataFrame({'key': ['a', 'b', 'a', 'a', 'b','c'],
'value': range(6)})
"""
key value
0 a 0
1 b 1
2 a 2
3 a 3
4 b 4
5 c 5
"""
right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
"""
group_val
a 3.5
b 7.0
"""
#可以与how='outer'结合使用
pd.merge(left1, right1, left_on='key', right_index=True)
"""
key value group_val
0 a 0 3.5
2 a 2 3.5
3 a 3 3.5
1 b 1 7.0
4 b 4 7.0
"""
lefth = pd.DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio','Nevada', 'Nevada'],
'key2': [2000, 2001, 2002, 2001,2002],
'data': np.arange(5.)})
"""
key1 key2 data
0 Ohio 2000 0.0
1 Ohio 2001 1.0
2 Ohio 2002 2.0
3 Nevada 2001 3.0
4 Nevada 2002 4.0
"""
righth = pd.DataFrame(
np.arange(12).reshape((6, 2)),
index=[['Nevada', 'Nevada', 'Ohio', 'Ohio','Ohio', 'Ohio'],
[2001, 2000, 2000, 2000, 2001, 2002]],
columns=['event1', 'event2'])
"""
event1 event2
Nevada 2001 0 1
2000 2 3
Ohio 2000 4 5
2000 6 7
2001 8 9
2002 10 11
"""
pd.merge(lefth, righth, left_on=['key1', 'key2'],
right_index=True, how='outer') #复杂索引使用how='outer'
"""
key1 key2 data event1 event2
0 Ohio 2000 0.0 4.0 5.0
0 Ohio 2000 0.0 6.0 7.0
1 Ohio 2001 1.0 8.0 9.0
2 Ohio 2002 2.0 10.0 11.0
3 Nevada 2001 3.0 0.0 1.0
4 Nevada 2002 4.0 NaN NaN
4 Nevada 2000 NaN 2.0 3.0
"""
left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],
index=['a', 'c', 'e'],
columns=['Ohio', 'Nevada'])
"""
Ohio Nevada
a 1.0 2.0
c 3.0 4.0
e 5.0 6.0
"""
right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.],[13, 14]],
index=['b', 'c', 'd', 'e'],
columns=['Missouri', 'Alabama'])
"""
Missouri Alabama
b 7.0 8.0
c 9.0 10.0
d 11.0 12.0
e 13.0 14.0
"""
# 2个数据的索引合并
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)
"""
Ohio Nevada Missouri Alabama
a 1.0 2.0 NaN NaN
b NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0
d NaN NaN 11.0 12.0
e 5.0 6.0 13.0 14.0
"""
#join合并索引更便捷,也可以用于合并多个索引相同或者相似但没有重叠列的DataFrame对象
#默认左连接,完全保留左边的DataFrame行索引;可指定连接方式
left2.join(right2, how='outer')
left1.join(right1, on='key') #可指定连接的列
another = pd.DataFrame(
[[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
index=['a', 'c', 'e', 'f'],
columns=['New York','Oregon'])
"""
New York Oregon
a 7.0 8.0
c 9.0 10.0
e 11.0 12.0
f 16.0 17.0
"""
left2.join([right2, another]) #合并多个数据
"""
Ohio Nevada Missouri Alabama New York Oregon
a 1.0 2.0 NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0 9.0 10.0
e 5.0 6.0 13.0 14.0 11.0 12.0
"""
left2.join([right2, another], how='outer')
"""
Ohio Nevada Missouri Alabama New York Oregon
a 1.0 2.0 NaN NaN 7.0 8.0
c 3.0 4.0 9.0 10.0 9.0 10.0
e 5.0 6.0 13.0 14.0 11.0 12.0
b NaN NaN 7.0 8.0 NaN NaN
d NaN NaN 11.0 12.0 NaN NaN
f NaN NaN NaN NaN 16.0 17.0
"""
3、沿轴向连接——concat
数据组合操作可互换地称为拼接、绑定或堆叠。Numpy的concatenate函数可以实现该功能,与之对应的是Pandas的concat函数
参数 | 描述 |
---|---|
objs | 需要连接的pandas对象列表或字典,必选 |
axis | 连接的轴向,默认为0,行的方向 |
join | 指定连接方式,默认为outer,可以为inner |
keys | 与连接对象关联的多层索引列表或数组 |
names | 如果传入了keys或levels参数,指定多层索引的名称 |
levels | 键值传递时指定多层索引的层级 |
verify_integrity | 检查连接对象中新轴是否重复,默认False允许重复,反之引发异常 |
ignore_index | 不保留索引,产生新索引(长度为total_length) |
arr = np.arange(12).reshape((3, 4))
np.concatenate([arr, arr], axis=1) #axis=0则为上下组合
"""
array([[ 0, 1, 2, 3, 0, 1, 2, 3],
[ 4, 5, 6, 7, 4, 5, 6, 7],
[ 8, 9, 10, 11, 8, 9, 10, 11]])
"""
s1 = pd.Series([0, 1], index=['a', 'b'])
s2 = pd.Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = pd.Series([5, 6], index=['f', 'g'])
s4 = pd.concat([s1, s3]) #默认axis=0,上下组合
"""
a 0
b 1
f 5
g 6
dtype: int64
"""
pd.concat([s1, s4], axis=1) #列的轴
"""
0 1
a 0.0 0
b 1.0 1
f NaN 5
g NaN 6
"""
pd.concat([s1, s4], axis=1, join='inner')
"""
0 1
a 0 0
b 1 1
"""
# join_axes指定轴,参数已被弃用,可以使用reindex
# pd.concat([s1, s4], axis=1, join_axes=[['a', 'c', 'b','e']])
pd.concat([s1, s4], axis=1).reindex(index=['a', 'c', 'b','e'])
"""
0 1
a 0.0 0.0
c NaN NaN
b 1.0 1.0
e NaN NaN
"""
# 创建多层索引
result = pd.concat([s1, s1, s3], keys=['one','two', 'three'])
"""
one a 0
b 1
two a 0
b 1
three f 5
g 6
dtype: int64
"""
result.unstack()
"""
a b f g
one 0.0 1.0 NaN NaN
two 0.0 1.0 NaN NaN
three NaN NaN 5.0 6.0
"""
pd.concat([s1, s2, s3], axis=1, keys=['one','two', 'three'])
"""
one two three
a 0.0 NaN NaN
b 1.0 NaN NaN
c NaN 2.0 NaN
d NaN 3.0 NaN
e NaN 4.0 NaN
f NaN NaN 5.0
g NaN NaN 6.0
"""
df1 = pd.DataFrame(
np.arange(6).reshape(3, 2), index=['a', 'b', 'c'],
columns=['one', 'two'])
"""
one two
a 0 1
b 2 3
c 4 5
"""
df2 = pd.DataFrame(
5 + np.arange(4).reshape(2, 2), index=['a', 'c'],
columns=['three', 'four'])
"""
three four
a 5 6
c 7 8
"""
# 创建多层索引
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])
"""
level1 level2
one two three four
a 0 1 5.0 6.0
b 2 3 NaN NaN
c 4 5 7.0 8.0
"""
#字典的key作为索引
pd.concat({'level1': df1, 'level2': df2}, axis=1)
"""
level1 level2
one two three four
a 0 1 5.0 6.0
b 2 3 NaN NaN
c 4 5 7.0 8.0
"""
# 给多层索引命名
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],
names=['upper', 'lower'])
"""
upper level1 level2
lower one two three four
a 0 1 5.0 6.0
b 2 3 NaN NaN
c 4 5 7.0 8.0
"""
# 索引中不包含任何相关数据时,可用ignore_index丢弃原索引
pd.concat([df1, df2], ignore_index=True)
4、联合重叠数据
两个数据集的所有可能全部或者部分重叠。
a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
index=['f', 'e', 'd', 'c', 'b', 'a'])
b = pd.Series([0, np.nan, 2., np.nan, np.nan, 5],
index=['f', 'e', 'd', 'c', 'b', 'a'])
np.where(pd.isnull(a), b, a) #等价于condition? X :Y操作
b.combine_first(a) # b缺失,a的值填充,DataFrame同理
三、重塑与透视
1、使用多层索引进行重塑
stack堆叠:旋转,列->行
unstack拆堆:行->列
拆堆可能会引入缺失值,默认情况下,堆叠会过滤缺失值,堆叠和拆堆是可逆的
在拆堆时,被拆的层级为结果中最低的层级
拆堆和堆叠都可以指定轴的名字或者序号
data = pd.DataFrame(
np.arange(6).reshape((2, 3)),
index=pd.Index(['Ohio','Colorado'], name='state'),
columns=pd.Index(['one', 'two', 'three'], name='number'))
"""
number one two three
state
Ohio 0 1 2
Colorado 3 4 5
"""
result = data.stack()
"""
state number
Ohio one 0
two 1
three 2
Colorado one 3
two 4
three 5
dtype: int32
"""
#还原,与result.unstack(1/'number')结果一致
result.unstack()
#指定轴序号,index与columns互换
result.unstack(0)
"""
state Ohio Colorado
number
one 0 3
two 1 4
three 2 5
"""
result.unstack('state') #指定轴名,结果与result.unstack(0)一致
s1 = pd.Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])
"""
one a 0
b 1
c 2
d 3
two c 4
d 5
e 6
dtype: int64
"""
data2.unstack().stack() #结果与data2一致
data2.unstack().stack(dropna=False) #不过滤缺失值
"""
one a 0.0
b 1.0
c 2.0
d 3.0
e NaN
two a NaN
b NaN
c 4.0
d 5.0
e 6.0
dtype: float64
"""
df = pd.DataFrame({'left': result, 'right': result + 5},
columns=pd.Index(['left', 'right'],name='side'))
"""
side left right
state number
Ohio one 0 5
two 1 6
three 2 7
Colorado one 3 8
two 4 9
three 5 10
"""
df.unstack('state')
"""
side left right
state Ohio Colorado Ohio Colorado
number
one 0 3 5 8
two 1 4 6 9
three 2 5 7 10
"""
#与df.unstack('state').stack(1)结果一致
df.unstack('state').stack('side')
"""
side left right
number state
one Ohio 0 5
Colorado 3 8
two Ohio 1 6
Colorado 4 9
three Ohio 2 7
Colorado 5 10
"""
2、将长透视为宽
没看明白,未完待续
3、将宽透视为长
多列合成一列,产生新的DataFrame
df = pd.DataFrame({'key': ['foo', 'bar', 'baz'],
'A': [1, 2, 3],
'B': [4, 5, 6],
'C': [7, 8, 9]})
"""
key A B C
0 foo 1 4 7
1 bar 2 5 8
2 baz 3 6 9
"""
melted = pd.melt(df, ['key']) #key作为唯一分组的指标
"""
key variable value
0 foo A 1
1 bar A 2
2 baz A 3
3 foo B 4
4 bar B 5
5 baz B 6
6 foo C 7
7 bar C 8
8 baz C 9
"""
# reshaped = melted.pivot('key', 'variable', 'value') #报错
# 还原pivot根据行标签的列生成的索引
reshaped = melted.pivot(index='key', columns='variable',
values='value')
"""
variable A B C
key
bar 2 5 8
baz 3 6 9
foo 1 4 7
"""
reshaped.reset_index() #重置索引
"""
variable key A B C
0 bar 2 5 8
1 baz 3 6 9
2 foo 1 4 7
"""
# 选择子集,指定分组指标为key
pd.melt(df, id_vars=['key'], value_vars=['A', 'B'])
"""
key variable value
0 foo A 1
1 bar A 2
2 baz A 3
3 foo B 4
4 bar B 5
5 baz B 6
"""
pd.melt(df, value_vars=['A', 'B', 'C']) #不指定分组指标
"""
variable value
0 A 1
1 A 2
2 A 3
3 B 4
4 B 5
5 B 6
6 C 7
7 C 8
8 C 9
"""
pd.melt(df, value_vars=['key', 'A', 'B'])
"""
variable value
0 key foo
1 key bar
2 key baz
3 A 1
4 A 2
5 A 3
6 B 4
7 B 5
8 B 6
"""
总结
Practice makes perfect,唯有实践出真知。
接下来就是数据可视化与分析
参考文献
《利用Python进行数据分析》原书第二版中译
完美解决TypeError: concat() got an unexpected keyword argument ‘join_axes‘
python——combine_first()函数
TypeError: pivot() takes 1 positional argument but 4 were given