数据分析--pandas（数据分组聚合运算）

最新推荐文章于 2023-06-05 20:09:56 发布

bboysky45

最新推荐文章于 2023-06-05 20:09:56 发布

阅读量533

点赞数

分类专栏： python 文章标签： pandas groupby

本文链接：https://blog.csdn.net/qq_18668137/article/details/89042438

版权

python 专栏收录该内容

59 篇文章 0 订阅

订阅专栏

pandas

聚合常用于分组后的数据进行计算
内置的聚合函数
- sum(),mean(),max(),min(),count(),size(),describe()
也可自定义函数，传入agg方法中
- grouped.agg(func)
- func的参数为groupby索引对应的记录
常用的内置聚合函数

1.数据分组运算

数据的聚合运算改变了原始数据的shape
使用merge保持原始数据的shape

import pandas as pd
import numpy as np

# 分组运算后保持shape
dict_obj = {'key1' : ['a', 'b', 'a', 'b', 
                      'a', 'b', 'a', 'a'],
            'key2' : ['one', 'one', 'two', 'three',
                      'two', 'two', 'one', 'three'],
            'data1': np.random.randint(1, 10, 8),
            'data2': np.random.randint(1, 10, 8)}
df_obj = pd.DataFrame(dict_obj)
print(df_obj)

	data1	data2	key1	key2
0	4	9	a	one
1	8	4	b	one
2	7	5	a	two
3	8	5	b	three
4	5	2	a	two
5	7	8	b	two
6	5	3	a	one
7	4	1	a	three

# 按key1分组后，计算data1，data2的统计信息并附加到原始表格中
k1_sum = df_obj.groupby('key1').sum().add_prefix('sum_')
k1_sum

sum_data1	sum_data2
key1		
a	25	20
b	23	17

# 使用merge,将统计信息加入到原df
pd.merge(df_obj, k1_sum, left_on='key1', right_index=True)


data1	data2	key1	key2	sum_data1	sum_data2
0	4	9	a	one	25	20
2	7	5	a	two	25	20
4	5	2	a	two	25	20
6	5	3	a	one	25	20
7	4	1	a	three	25	20
1	8	4	b	one	23	17
3	8	5	b	three	23	17
5	7	8	b	two	23	17

举个应用例子。

读取excel数据

dataset_path = './starcraft.csv'
df_data = pd.read_csv(dataset_path, usecols=['LeagueIndex', 'Age', 'HoursPerWeek', 
                                             'TotalHours', 'APM'])
df_data.head()
	LeagueIndex	Age	HoursPerWeek	TotalHours	APM
0	5	27.0	10.0	3000.0	143.7180
1	5	23.0	10.0	5000.0	129.2322
2	4	30.0	10.0	200.0	69.9612
3	3	19.0	20.0	400.0	107.6016
4	3	32.0	10.0	500.0	122.8908

#使用GroupBy分组，再使用apply提取信息

def top_n(df, n=3, column='APM'):
    """
        返回每个分组按 column 的 top n 数据
    """
    return df.sort_values(by=column, ascending=1)[:n]

df_data.groupby('LeagueIndex').apply(top_n).head()

		LeagueIndex	Age	HoursPerWeek	TotalHours	APM
LeagueIndex						
1	3298	1	37.0	12.0	300.0	22.0596
1311	1	22.0	24.0	100.0	24.9042
3155	1	30.0	12.0	150.0	25.4442
2	1542	2	23.0	2.0	50.0	24.6648
887	2	16.0	20.0	100.0	30.9402

# apply函数接收的参数会传入自定义的函数中
df_data.groupby('LeagueIndex').apply(top_n, n=2, column='Age')

	LeagueIndex	Age	HoursPerWeek	TotalHours	APM
LeagueIndex						
1	3146	1	40.0	12.0	150.0	38.5590
3040	1	39.0	10.0	500.0	29.8764
2	920	2	43.0	10.0	730.0	86.0586
2437	2	41.0	4.0	200.0	54.2166
3	1258	3	41.0	14.0	800.0	77.6472
2972	3	40.0	10.0	500.0	60.5970
4	1696	4	44.0	6.0	500.0	89.5266
1729	4	39.0	8.0	500.0	86.7246
5	202	5	37.0	14.0	800.0	327.7218
2745	5	37.0	18.0	1000.0	123.4098
6	3069	6	31.0	8.0	800.0	133.1790
2706	6	31.0	8.0	700.0	66.9918
7	2813	7	26.0	36.0	1300.0	188.5512
1992	7	26.0	24.0	1000.0	219.6690
8	3340	8	NaN	NaN	NaN	189.7404
3341	8	NaN	NaN	NaN	287.8128

#禁止分组 group_keys=False

df_data.groupby('LeagueIndex', group_keys=False).apply(top_n)

	LeagueIndex	Age	HoursPerWeek	TotalHours	APM
2214	1	20.0	12.0	730.0	172.9530
2246	1	27.0	8.0	250.0	141.6282
1753	1	20.0	28.0	100.0	139.6362
3062	2	20.0	6.0	100.0	179.6250
3229	2	16.0	24.0	110.0	156.7380
1520	2	29.0	6.0	250.0	151.6470
1557	3	22.0	6.0	200.0	226.6554
484	3	19.0	42.0	450.0	220.0692
2883	3	16.0	8.0	800.0	208.9500
2688	4	26.0	24.0	990.0	249.0210

2.merge

import pandas as pd
import numpy as np

df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b','f'],
                        'data1' : np.random.randint(0,10,8)})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                        'data2' : np.random.randint(0,10,3)})

print(df_obj1)
print(df_obj2)
   key  data1
0   b      4
1   b      5
2   a      7
3   c      6
4   a      3
5   a      7
6   b      8
7   f      4
  key  data2
0   a      5
1   b      4
2   d      7

# 默认将重叠列的列名作为“外键”进行连接
pd.merge(df_obj1, df_obj2)


     key    data1    data2
0	b	4	4
1	b	5	4
2	b	8	4
3	a	7	5
4	a	3	5
5	a	7	5

# on显示指定“外键”
pd.merge(df_obj1, df_obj2, on='key')

	data1	key	data2
0	5	b	3
1	9	b	3
2	0	b	3
3	1	a	9
4	3	a	9
5	9	a	9

# left_on，right_on分别指定左侧数据和右侧数据的“外键”

# 更改列名
df_obj1 = df_obj1.rename(columns={'key':'key1'})
df_obj2 = df_obj2.rename(columns={'key':'key2'})
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2')

    key1	data1	key2	data2
0	b	4	b	4
1	b	5	b	4
2	b	8	b	4
3	a	7	a	5
4	a	3	a	5
5	a	7	a	5

# “外连接”
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='outer')

key1	data1	key2	data2
0	b	4.0	b	4.0
1	b	5.0	b	4.0
2	b	8.0	b	4.0
3	a	7.0	a	5.0
4	a	3.0	a	5.0
5	a	7.0	a	5.0
6	c	6.0	NaN	NaN
7	f	4.0	NaN	NaN
8	NaN	NaN	d	7.0

# 左连接
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='left')
key1	data1	key2	data2
0	b	4	b	4.0
1	b	5	b	4.0
2	a	7	a	5.0
3	c	6	NaN	NaN
4	a	3	a	5.0
5	a	7	a	5.0
6	b	8	b	4.0
7	f	4	NaN	NaN

# 右连接
pd.merge(df_obj1, df_obj2, left_on='key1', right_on='key2', how='right')
data1	key1	data2	key2
0	5.0	b	3	b
1	9.0	b	3	b
2	0.0	b	3	b
3	1.0	a	9	a
4	3.0	a	9	a
5	9.0	a	9	a
6	NaN	NaN	8	d

# 处理重复列名
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data' : np.random.randint(0,10,7)})
df_obj2 = pd.DataFrame({'key': ['a', 'b', 'd'],
                        'data' : np.random.randint(0,10,3)})

pd.merge(df_obj1, df_obj2, on='key', suffixes=('_left', '_right'))

# 按索引连接    right_index=True  或者left_index=True
df_obj1 = pd.DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                        'data1' : np.random.randint(0,10,7)})
df_obj2 = pd.DataFrame({'data2' : np.random.randint(0,10,3)}, index=['a', 'b', 'd'])
pd.merge(df_obj1, df_obj2, left_on='key', right_index=True)
    data1	key	data2
0	6	b	5
1	7	b	5
6	9	b	5
2	2	a	3
4	4	a	3
5	4	a	3