def make_df(cols, ind):
    """Quickly build a simple DataFrame for demonstration purposes.

    Each cell holds the string made of its column label followed by its
    index label (e.g. column ``'A'`` at index ``0`` holds ``'A0'``).

    Args:
        cols: Iterable of column labels (a string is iterated per character).
        ind: Iterable of index labels. It is traversed once per column and
            again for the index, so it must be re-iterable (e.g. ``range``
            or a list rather than a one-shot generator).

    Returns:
        A ``pd.DataFrame`` with one column per entry of ``cols`` and
        ``ind`` as its index.
    """
    data = {c: [str(c) + str(i) for i in ind] for c in cols}
    return pd.DataFrame(data, index=ind)


# DataFrame example
make_df('ABC',range(3))
A B C
0 A0 B0 C0
1 A1 B1 C1
2 A2 B2 C2
# Concatenating NumPy arrays: join the nested list with itself along axis 1
x = [[1, 2], [3, 4]]
np.concatenate((x, x), axis=1)
employee group
0 Bob Accounting
1 Jake Engineering
2 Lisa Engineering
3 Sue HR
employee hire_date
0 Lisa 2004
1 Bob 2008
2 Jake 2012
3 Sue 2014
employee group hire_date
0 Bob Accounting 2008
1 Jake Engineering 2012
2 Lisa Engineering 2004
3 Sue HR 2014
# merge: specifying the key used to combine the data
# The simplest way is to set the `on` argument to a column-name string
# or to a list of column names.
print(df1)
print(df2)
print(pd.merge(df1, df2, on='employee'))
# This argument can only be used when both DataFrames share the given column name.
employee group
0 Bob Accounting
1 Jake Engineering
2 Lisa Engineering
3 Sue HR
employee hire_date
0 Lisa 2004
1 Bob 2008
2 Jake 2012
3 Sue 2014
employee group hire_date
0 Bob Accounting 2008
1 Jake Engineering 2012
2 Lisa Engineering 2004
3 Sue HR 2014
employee group
0 Bob Accounting
1 Jake Engineering
2 Lisa Engineering
3 Sue HR
name salary
0 Bob 70000
1 Jake 80000
2 Lisa 120000
3 Sue 90000
employee group name salary
0 Bob Accounting Bob 70000
1 Jake Engineering Jake 80000
2 Lisa Engineering Lisa 120000
3 Sue HR Sue 90000
group
employee
Bob Accounting
Jake Engineering
Lisa Engineering
Sue HR
hire_date
employee
Lisa 2004
Bob 2008
Jake 2012
Sue 2014
group hire_date
employee
Bob Accounting 2008
Jake Engineering 2012
Lisa Engineering 2004
Sue HR 2014
group
employee
Bob Accounting
Jake Engineering
Lisa Engineering
Sue HR
hire_date
employee
Lisa 2004
Bob 2008
Jake 2012
Sue 2014
group hire_date
employee
Bob Accounting 2008
Jake Engineering 2012
Lisa Engineering 2004
Sue HR 2014
name food
0 Peter fish
1 Paul beans
2 Mary bread
name drink
0 Mary wine
1 Joseph beer
name food drink
0 Mary bread wine
name food drink
0 Peter fish NaN
1 Paul beans NaN
2 Mary bread wine
3 Joseph NaN beer
name food drink
0 Peter fish NaN
1 Paul beans NaN
2 Mary bread wine
name rank
0 Bob 1
1 Jake 2
2 Lisa 3
3 Sue 4
name rank
0 Bob 3
1 Jake 1
2 Lisa 4
3 Sue 2
name rank_x rank_y
0 Bob 1 3
1 Jake 2 1
2 Lisa 3 4
3 Sue 4 2
name rank_L rank_R
0 Bob 1 3
1 Jake 2 1
2 Lisa 3 4
3 Sue 4 2
key data1 data2
0 A 0 5
1 B 1 0
2 C 2 3
3 A 3 3
4 B 4 7
5 C 5 9
data1 data2
key
A 2.12132 1.414214
B 2.12132 4.949747
C 2.12132 4.242641
key data1 data2
1 B 1 0
2 C 2 3
4 B 4 7
5 C 5 9
# Transformation: subtract the mean within each 'key' group
df.groupby('key').transform(lambda col: col - col.mean())
data1 data2
0 -1.5 1.0
1 -1.5 -3.5
2 -1.5 -3.0
3 1.5 -1.0
4 1.5 3.5
5 1.5 3.0
# The apply() method lets you apply an arbitrary function to each group.
def norm_by_data2(x):
    """Divide the ``data1`` column by the sum of the ``data2`` column.

    Args:
        x: A DataFrame of group values with ``data1`` and ``data2`` columns.

    Returns:
        The same object ``x``; its ``data1`` column is overwritten with
        the normalized values (the input is modified, not copied).
    """
    # x is a DataFrame of group values
    x['data1'] /= x['data2'].sum()
    return x
# Show the original frame, then the result of applying norm_by_data2 per group
print(df)
print(df.groupby('key').apply(norm_by_data2))
key data1 data2
0 A 0 5
1 B 1 0
2 C 2 3
3 A 3 3
4 B 4 7
5 C 5 9
key data1 data2
0 A 0.000000 5
1 B 0.142857 0
2 C 0.166667 3
3 A 0.375000 3
4 B 0.571429 7
5 C 0.416667 9
# Using a list, array, Series, or index as the grouping key. The key can be
# any Series or list whose length matches the DataFrame, for example:
L = [0, 1, 0, 1, 2, 0]
print(df)
# numeric_only=True keeps the string 'key' column out of the sum, so only
# data1/data2 are aggregated (pandas >= 2.0 would otherwise concatenate the strings).
print(df.groupby(L).sum(numeric_only=True))
key data1 data2
0 A 0 5
1 B 1 0
2 C 2 3
3 A 3 3
4 B 4 7
5 C 5 9
data1 data2
0 7 17
1 4 3
2 4 7
# Using a dictionary or Series to map index values to group names
df2 = df.set_index('key')
mapping = {'A': 'vowel', 'B': 'consonant', 'C': 'consonant'}
print(df2)
print(df2.groupby(mapping).sum())
data1 data2
key
A 0 5
B 1 0
C 2 3
A 3 3
B 4 7
C 5 9
data1 data2
consonant 12 19
vowel 3 8
Pandas数据合并(join、merge),累计与分组(groupby)import pandas as pd import numpy as npdef make_df(cols, ind): """一个简单的DataFrame""" data = {c: [str(c) + str(i) for i in ind] for c in cols} return pd.DataFrame(data, index=ind) # DataFrame示例make_df('ABC', r