Data Wrangling: Join, Combine, and Reshape
# Chapter 8 - Data Wrangling: Join, Combine, and Reshape
# In many applications, data may be spread across a number of files or
# databases, or be stored in a form that is not convenient for analysis.
# This chapter focuses on tools for combining, joining, and reshaping data.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Display at most 20 rows when printing pandas objects.
pd.options.display.max_rows = 20
# Seed NumPy's global random generator so the random examples are repeatable.
np.random.seed(12345)
# Default matplotlib figure size: width 10, height 6.
plt.rc('figure', figsize=(10, 6))
# Print NumPy floats with 4 digits of precision and no scientific notation.
np.set_printoptions(precision=4, suppress=True)
Hierarchical Indexing
8.1 层次化索引
# 8.1 Hierarchical indexing
# Hierarchical indexing is an important feature of pandas that lets you have
# multiple (two or more) index levels on a single axis. Put more abstractly,
# it lets you work with higher-dimensional data in a lower-dimensional form.
#
# Build a Series of 9 random values whose index is given as two parallel
# lists: outer labels 'a'..'d' and inner labels 1..3.
data = pd.Series(
    np.random.randn(9),
    index=[
        ['a', 'a', 'a', 'b', 'b', 'c', 'c', 'd', 'd'],
        [1, 2, 3, 1, 3, 1, 2, 2, 3],
    ],
)
data
# Output recorded in the original notebook (after np.random.seed(12345)):
# a  1   -0.204708
#    2    0.478943
#    3   -0.519439
# b  1   -0.555730
#    3    1.965781
# c  1    1.393406
#    2    0.092908
# d  2    0.281746
#    3    0.769023
# dtype: float64
# Inspect the two-level index of `data`.
data.index
# Output recorded in the original notebook (repr format of an older pandas
# release; newer releases print a MultiIndex differently):
# MultiIndex(levels=[['a','b','c','d'],[1,2,3]],
#            labels=[[0,0,0,1,1,2,2,3,3],[0,1,2,0,2,0,1,1,2]])

# Partial indexing: select every entry under outer label 'b'.
data['b']
# Slice of outer labels from 'b' through 'c'.
data['b':'c']
# Select several outer labels at once with a list.
data.loc[['b', 'd']]
# Select inner label 2 across all outer labels.
data.loc[:, 2]

# Hierarchical indexing plays an important role in reshaping data and in
# group-based operations such as building a pivot table. For example, this
# data can be rearranged into a DataFrame with the unstack method:
# Rearrange the Series into a DataFrame; the inner index labels (1, 2, 3)
# become the columns in the recorded output below.
data.unstack()
# Output recorded in the original notebook:
#           1         2         3
# a -0.204708  0.478943 -0.519439
# b -0.555730       NaN  1.965781
# c  1.393406  0.092908       NaN
# d       NaN  0.281746  0.769023

# The inverse operation of unstack is stack:
data.unstack().stack()
# Output recorded in the original notebook:
# a  1   -0.204708
#    2    0.478943
#    3   -0.519439
# b  1   -0.555730
#    3    1.965781
# c  1    1.393406
#    2    0.092908
# d  2    0.281746
#    3    0.769023
# dtype: float64
# Build a 4x3 DataFrame (values 0..11) with two label levels on each axis:
# the rows and the columns are each given as two parallel lists.
frame = pd.DataFrame(
    np.arange(12).reshape((4, 3)),
    index=[['a', 'a', 'b', 'b'], [1, 2, 1, 2]],
    columns=[['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
)
frame

# Give each level of the row and column indexes a name.
frame.index.names = ['key1', 'key2']
frame.columns.names = ['state', 'color']
frame

# Partial column indexing: select the columns under outer label 'Ohio'.
frame['Ohio']

# A MultiIndex can also be created on its own. This one uses the same labels
# and level names as the columns of `frame`. It is accessed through the `pd`
# namespace because `MultiIndex` is not imported as a bare name in this file.
pd.MultiIndex.from_arrays(
    [['Ohio', 'Ohio', 'Colorado'], ['Green', 'Red', 'Green']],
    names=['state', 'color'],
)