往期文章:
pandas使用技巧
合并
Pandas在join/merge两种情境下提供了多种方式,基于逻辑/集合运算和代数运算来连接Series,DataFrame和Panel对象(注:Panel已在pandas 0.25中被移除)。
# Join DataFrame objects row-wise with pd.concat().
df = pd.DataFrame(np.random.randn(8, 4))
print(df, "------", sep="\n")
# Cut the frame into three row slices. Rows 2 and 5 sit between the
# slices, so they are not part of any piece.
head = df.iloc[:2]
middle = df.iloc[3:5]
tail = df.iloc[6:]
pieces = [head, middle, tail]
# Glue the pieces back together. The result keeps the original row labels
# (0, 1, 3, 4, 6, 7); it is the initial frame minus the two skipped rows.
print(pd.concat(pieces))
# 输出
0 1 2 3
0 0.138889 1.468425 0.033937 0.727453
1 -1.534917 1.880785 1.038571 0.782657
2 -0.758474 -0.889903 1.895584 -0.349080
3 1.195926 0.219143 0.141360 -0.133861
4 1.399775 2.084911 0.370451 0.373722
5 0.602006 0.102370 1.408794 2.076663
6 0.208999 3.121470 0.292169 -0.522401
7 -0.141231 0.146212 0.058596 0.423607
------
0 1 2 3
0 0.138889 1.468425 0.033937 0.727453
1 -1.534917 1.880785 1.038571 0.782657
3 1.195926 0.219143 0.141360 -0.133861
4 1.399775 2.084911 0.370451 0.373722
6 0.208999 3.121470 0.292169 -0.522401
7 -0.141231 0.146212 0.058596 0.423607
增补
# Build an 8x4 frame of random values with labelled columns.
df = pd.DataFrame(np.random.randn(8, 4), columns=['A', 'B', 'C', 'D'])
print(df)
print("------")
# Take the row at position 3 and add a copy of it to the end of the frame.
s = df.iloc[3]
# DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0, so
# pd.concat() is used instead. The Series is first turned into a one-row
# frame (to_frame().T) so that it is added as a row rather than a column;
# ignore_index=True renumbers the result 0..8.
appended = pd.concat([df, s.to_frame().T], ignore_index=True)
print(appended)
# 输出
A B C D
0 -1.691575 1.282698 1.939419 0.459406
1 0.666460 -0.146645 0.672000 1.040926
2 1.031160 -1.248344 0.569695 -0.566937
3 0.833353 -1.453265 0.783527 -0.278597
4 -0.069810 1.577667 -0.052604 0.535070
5 -0.355505 1.752383 0.617767 -0.886470
6 0.239778 -1.342020 -0.966699 -0.183925
7 0.557462 0.134796 -0.766296 -1.665554
------
A B C D
0 -1.691575 1.282698 1.939419 0.459406
1 0.666460 -0.146645 0.672000 1.040926
2 1.031160 -1.248344 0.569695 -0.566937
3 0.833353 -1.453265 0.783527 -0.278597
4 -0.069810 1.577667 -0.052604 0.535070
5 -0.355505 1.752383 0.617767 -0.886470
6 0.239778 -1.342020 -0.966699 -0.183925
7 0.557462 0.134796 -0.766296 -1.665554
8 0.833353 -1.453265 0.783527 -0.278597
组合
# Build a frame with two string key columns (A, B) and two numeric
# columns (C, D) filled with random values.
df = pd.DataFrame({
    'A': ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
    'C': np.random.randn(8),
    'D': np.random.randn(8),
})
print(df)
print("------")
# Group the rows by column 'A' and sum each group.
# numeric_only=True keeps the string column 'B' out of the aggregation:
# since pandas 2.0 the default is numeric_only=False, which would
# concatenate the strings of 'B' and add that column to the result.
by_a = df.groupby('A').sum(numeric_only=True)
print(by_a)
print("------")
# Group by 'A' and 'B' together; the result is indexed by a two-level
# (A, B) MultiIndex. Both string columns are keys here, so only the
# numeric columns C and D remain to be summed.
by_a_b = df.groupby(['A', 'B']).sum()
print(by_a_b)
# 输出
A B C D
0 foo one -0.198964 0.329690
1 bar one 0.549832 2.443202
2 foo two -0.061327 -0.328189
3 bar three -1.342564 0.973223
4 foo two -0.756521 0.246364
5 bar two -0.860024 1.068036
6 foo one 0.587521 -0.734750
7 foo three 1.654651 -0.293895
------
C D
A
bar -1.652756 4.484461
foo 1.225360 -0.780780
------
C D
A B
bar one 0.549832 2.443202
three -1.342564 0.973223
two -0.860024 1.068036
foo one 0.388557 -0.405060
three 1.654651 -0.293895
two -0.817848 -0.081826
重塑
# Two parallel label lists; pairing them element-wise gives the
# (first, second) key of each row.
first_labels = ['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux']
second_labels = ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']
tuples = list(zip(first_labels, second_labels))
# Two-level row index built from the label pairs.
index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])
values = np.random.randn(8, 2)
df = pd.DataFrame(values, index=index, columns=['A', 'B'])
# Keep only the first four rows (the 'bar' and 'baz' groups).
df2 = df.iloc[:4]
print(df2, "------", sep="\n")
# stack() moves the column labels into a new innermost index level,
# turning the two-column frame into a Series with a three-level index.
stacked = df2.stack()
print(stacked)
# 输出
A B
first second
bar one 0.441812 -0.404707
two 1.049489 0.476338
baz one 0.474724 0.879325
two 1.771712 -0.486847
------
first second
bar one A 0.441812
B -0.404707
two A 1.049489
B 0.476338
baz one A 0.474724
B 0.879325
two A 1.771712
B -0.486847
dtype: float64
# For a DataFrame or Series that already has a hierarchical (multi-level)
# index, unstack() is the inverse of stack(). With no argument it moves the
# last index level back into the columns.
restored = stacked.unstack()
print(restored)
print("------")
# Move index level 1 into the columns instead.
by_level_1 = stacked.unstack(level=1)
print(by_level_1)
print("------")
# Move index level 0 (the outermost level) into the columns.
by_level_0 = stacked.unstack(level=0)
print(by_level_0)
# 输出
A B
first second
bar one 0.441812 -0.404707
two 1.049489 0.476338
baz one 0.474724 0.879325
two 1.771712 -0.486847
------
second one two
first
bar A 0.441812 1.049489
B -0.404707 0.476338
baz A 0.474724 1.771712
B 0.879325 -0.486847
------
first bar baz
second
one A 0.441812 0.474724
B -0.404707 0.879325
two A 1.049489 1.771712
B 0.476338 -0.486847
更多内容请查看我的Gitee仓库 : Python基础练习