pandas
-
聚合常用于分组后的数据进行计算
- 内置的聚合函数
- sum(),mean(),max(),min(),count(),size(),describe()
- 也可自定义函数,传入agg方法中
- grouped.agg(func)
- func的参数为groupby索引对应的记录
- 常用的内置聚合函数
1.数据分组运算
- 数据的聚合运算改变了原始数据的shape
- 使用merge保持原始数据的shape
import pandas as pd
import numpy as np
# 分组运算后保持shape
dict_obj = {'key1' : ['a', 'b', 'a', 'b',
'a', 'b', 'a', 'a'],
'key2' : ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'data1': np.random.randint(1, 10, 8),
'data2': np.random.randint(1, 10, 8)}
df_obj = pd.DataFrame(dict_obj)
print(df_obj)
data1 data2 key1 key2
0 4 9 a one
1 8 4 b one
2 7 5 a two
3 8 5 b three
4 5 2 a two
5 7 8 b two
6 5 3 a one
7 4 1 a three
# 按key1分组后,计算data1,data2的统计信息并附加到原始表格中
k1_sum = df_obj.groupby('key1').sum().add_prefix('sum_')
k1_sum
sum_data1 sum_data2
key1
a 25 20
b 23 17
# 使用merge,将统计信息加入到原df
pd.merge(df_obj, k1_sum, left_on='key1', right_index=True)
data1 data2 key1 key2 sum_data1 sum_data2
0 4 9 a one 25 20
2 7 5 a two 25 20
4 5 2 a two 25 20
6 5 3 a one 25 20
7 4 1 a three 25 20
1 8 4 b one 23 17
3 8 5 b three 23 17
5 7 8 b two 23 17
举个应用例子。
读取excel数据
dataset_path = './starcraft.csv'
df_data = pd.read_csv(dataset_path, usecols=['LeagueIndex', 'Age', 'HoursPerWeek',
'TotalHours', 'APM'])
df_data.head()
LeagueIndex Age HoursPerWeek TotalHours APM
0 5 27.0 10.0 3000.0 143.7180
1 5 23.0 10.0 5000.0 129.2322
2 4 30.0 10.0 200.0 69.9612
3 3 19.0 20.0 400.0 107.6016
4 3 32.0 10.0 500.0 122.8908
#使用GroupBy分组,再使用apply提取信息
def top_n(df, n=3, column='APM'):
"""
返回每个分组按 column 的 top n 数据
"""
return df.sort_values(by=column, ascending=1)[:n]
df_data.groupby('LeagueIndex').apply(top_n).head()
LeagueIndex Age HoursPerWeek TotalHours APM
LeagueIndex
1 3298 1 37.0 12.0 300.0 22.0596
1311 1 22.0 24.0 100.0 24.9042
3155 1 30.0 12.0 150.0 25.4442
2 1542 2 23.0 2.0 50.0 24.6648
887 2 16.0 20.0 100.0 30.9402
# apply函数接收的参数会传入自定义的函数中
df_data.groupby('LeagueIndex').apply(top_n, n=2, column='Age')
LeagueIndex Age HoursPerWeek TotalHours APM
LeagueIndex
1 3146 1 40.0 12.0 150.0 38.5590
3040 1 39.0 10.0 500.0 29.8764
2 920 2 43.0 10.0 730.0 86.0586
2437 2 41.0 4.0 200.0 54.2166
3 1258 3 41.0 14.0 800.0 77.6472
2972 3 40.0 10.0 500.0 60.5970
4 1696 4 44.0 6.0 500.0 89.5266
1729 4 39.0 8.0 500.0 86.7246
5 202 5 37.0 14.0 800.0 327.7218
2745 5 37.0 18.0 1000.0 123.4098
6 3069 6 31.0 8.0 800.0 133.1790
2706 6 31.0 8.0 700.0 66.9918
7 2813 7 26.0 36.0 1300.0 188.5512
1992 7 26.0 24.0 1000.0 219.6690
8 3340 8 NaN NaN NaN 189.7404
3341 8 NaN NaN NaN 287.8128
#禁止分组 group_keys=False
df_data.groupby('LeagueIndex', group_keys=False).apply(top_n)
LeagueIndex Age HoursPerWeek TotalHours APM
2214 1 20.0 12.0 730.0 172.9530
2246 1 27.0 8.0 250.0 141.6282
1753 1 20.0 28.0 100.0 139.6362
3062 2 20.0 6.0 100.0 179.6250
3229 2 16.0 24.0 110.0 156.7380
1520 2 29.0 6.0 250.0 151.6470
1557 3 22.0 6.0 200.0 226.6554
484 3 19.0 42.0 450.0 220.0692
2883 3 16.0 8.0 800.0 208.9500
2688 4 26.0 24.0 990.0 249.0210
2.merge
import pandas as pd
import numpy as np
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b','f'],
'data1' : np.random.randint(0,10,8)})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
'data2' : np.random.randint(0,10,3)})
print(df_obj1)
print(df_obj2)
key data1
0 b 4
1 b 5
2 a 7
3 c 6
4 a 3
5 a 7
6 b 8
7 f 4
key data2
0 a 5
1 b 4
2 d 7
# 默认将重叠列的列名作为“外键”进行连接
pd.merge(df_obj1, df_obj2)
key data1 data2
0 b 4 4
1 b 5 4
2 b 8 4
3 a 7 5
4 a 3 5
5 a 7 5
# on显示指定“外键”
pd.merge(df_obj1, df_obj2, on='key')
data1 key data2
0 5 b 3
1 9 b 3
2 0 b 3
3 1 a 9
4 3 a 9
5 9 a 9
# left_on,right_on分别指定左侧数据和右侧数据的“外键”
# 更改列名
df_obj1 = df_obj1.rename(columns={'key':'key1'})
df_obj2 = df_obj2.rename(columns={'key':'key2'})
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2')
key1 data1 key2 data2
0 b 4 b 4
1 b 5 b 4
2 b 8 b 4
3 a 7 a 5
4 a 3 a 5
5 a 7 a 5
# “外连接”
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='outer')
key1 data1 key2 data2
0 b 4.0 b 4.0
1 b 5.0 b 4.0
2 b 8.0 b 4.0
3 a 7.0 a 5.0
4 a 3.0 a 5.0
5 a 7.0 a 5.0
6 c 6.0 NaN NaN
7 f 4.0 NaN NaN
8 NaN NaN d 7.0
# 左连接
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='left')
key1 data1 key2 data2
0 b 4 b 4.0
1 b 5 b 4.0
2 a 7 a 5.0
3 c 6 NaN NaN
4 a 3 a 5.0
5 a 7 a 5.0
6 b 8 b 4.0
7 f 4 NaN NaN
# 右连接
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='right')
data1 key1 data2 key2
0 5.0 b 3 b
1 9.0 b 3 b
2 0.0 b 3 b
3 1.0 a 9 a
4 3.0 a 9 a
5 9.0 a 9 a
6 NaN NaN 8 d
# 处理重复列名
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data' : np.random.randint(0,10,7)})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
'data' : np.random.randint(0,10,3)})
pd.merge(df_obj1, df_obj2, on='key', suffixes=('_left', '_right'))
# 按索引连接 right_index=True 或者left_index=True
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
'data1' : np.random.randint(0,10,7)})
df_obj2 = pd.DataFrame({'data2' : np.random.randint(0,10,3)}, index=['a', 'b', 'd'])
pd.merge(df_obj1, df_obj2, left_on='key', right_index=True)
data1 key data2
0 6 b 5
1 7 b 5
6 9 b 5
2 2 a 3
4 4 a 3
5 4 a 3