第5章 合并
import pandas as pd
import numpy as np
df = pd.read_csv('data/table.csv')
df.head()
|
School |
Class |
ID |
Gender |
Address |
Height |
Weight |
Math |
Physics |
0 |
S_1 |
C_1 |
1101 |
M |
street_1 |
173 |
63 |
34.0 |
A+ |
1 |
S_1 |
C_1 |
1102 |
F |
street_2 |
192 |
73 |
32.5 |
B+ |
2 |
S_1 |
C_1 |
1103 |
M |
street_2 |
186 |
82 |
87.2 |
B+ |
3 |
S_1 |
C_1 |
1104 |
F |
street_2 |
167 |
81 |
80.4 |
B- |
4 |
S_1 |
C_1 |
1105 |
F |
street_4 |
159 |
64 |
84.8 |
B+ |
一、append与assign
1. append方法
(a)利用序列添加行(必须指定name)
df_append = df.loc[:3,['Gender','Height']].copy()
df_append
|
Gender |
Height |
0 |
M |
173 |
1 |
F |
192 |
2 |
M |
186 |
3 |
F |
167 |
s = pd.Series({
'Gender':'F','Height':188},name='new_row')
df_append.append(s)
|
Gender |
Height |
0 |
M |
173 |
1 |
F |
192 |
2 |
M |
186 |
3 |
F |
167 |
new_row |
F |
188 |
df_temp = pd.DataFrame({
'Gender':['F','M'],'Height':[188,176]},index=['new_1','new_2'])
df_append.append(df_temp)
|
Gender |
Height |
0 |
M |
173 |
1 |
F |
192 |
2 |
M |
186 |
3 |
F |
167 |
new_1 |
F |
188 |
new_2 |
M |
176 |
2. assign方法
该方法主要用于添加列,列名直接由参数指定:
s = pd.Series(list('abcd'),index=range(4))
df_append.assign(letter = s)
|
Gender |
Height |
letter |
0 |
M |
173 |
a |
1 |
F |
192 |
b |
2 |
M |
186 |
c |
3 |
F |
167 |
d |
df_append.assign(col1=lambda x:x['Gender']*2,
col2=s,
col3=s)
|
Gender |
Height |
col1 |
col2 |
col3 |
0 |
M |
173 |
MM |
a |
a |
1 |
F |
192 |
FF |
b |
b |
2 |
M |
186 |
MM |
c |
c |
3 |
F |
167 |
FF |
d |
d |
二、combine与update
1. comine方法
comine和update都是用于表的填充函数,可以根据某种规则填充
(a)填充对象
可以看出combine方法是按照表的顺序轮流进行逐列循环的,而且自动索引对齐,缺失值为NaN,理解这一点很重要
df_combine_1 = df.loc[:1,['Gender','Height']].copy()
df_combine_2 = df.loc[10:11,['Gender','Height']].copy()
df_combine_1.combine(df_combine_2,lambda x,y:x)
|
Gender |
Height |
0 |
M |
173.0 |
1 |
F |
192.0 |
10 |
NaN |
NaN |
11 |
NaN |
NaN |
df_combine_1
|
Gender |
Height |
0 |
M |
173 |
1 |
F |
192 |
(b)一些例子
例①:根据列均值的大小填充
df1 = pd.DataFrame({
'A': [1, 2], 'B': [3, 4.]})
df2 = pd.DataFrame({
'A': [0, 0,0], 'B': [6, 5,0]})
df1.combine(df2,lambda x,y: x if x.mean()>y.mean() else y)
|
A |
B |
0 |
1.0 |
6.0 |
1 |
2.0 |
5.0 |
2 |
NaN |
0.0 |
例②:索引对齐特性(默认状态下,后面的表没有的行列都会设置为NaN)
df2 = pd.DataFrame({
'B': [8, 7], 'C': [6, 5]},index=[1,2])
df1.combine(df2,lambda x,y:x if x.mean()>y.mean() else y)
|
A |
B |
C |
0 |
NaN |
NaN |
NaN |
1 |
NaN |
8.0 |
6.0 |
2 |
NaN |
7.0 |
5.0 |
例③:使得df1原来符合条件的值不会被覆盖
df1.combine(df2,lambda x,y:x if x.mean(