参考链接:link
第一次上传
import pandas as pd

# Sample roster used throughout the walkthrough: one row per student.
data = {
    'Id': [0, 1, 2, 3, 4, 5, 6, 7],
    'Name': ['Alen', 'Bob', 'Cidy', 'Daniel',
             'Ellen', 'Frankie', 'Gate', 'Hebe'],
    'Gender': ['Male', 'Male', 'Female', 'Male',
               'Female', 'Male', 'Male', 'Female'],
    'Age': [18, 19, 18, 20, 17, 21, 20, 22],
    'Score': [80, 90, 93, 87, 96, 100, 88, 98],
    # NOTE: the 'Timestamp' column below stays disabled; it lists only six
    # values for eight rows, so it cannot be enabled as written.
    # 'Timestamp': [1506959142820, 1506959172820, 1506959056066,
    #               1506959086066, 1506959088613, 1506959118613]
}
df = pd.DataFrame(data)
print(df)

# groupby() returns a DataFrameGroupBy object rather than a DataFrame.
grouped = df.groupby('Gender')
print(type(grouped))  # <class 'pandas.core.groupby.generic.DataFrameGroupBy'>
print(grouped)  # prints only the object's repr, not the grouped rows

# Passing a list of keys forms one group per distinct (Gender, Age) pair.
grouped_muti = df.groupby(['Gender', 'Age'])

print('====' * 10)
print(grouped.size())  # number of rows per Gender
print('====' * 10)
print(grouped_muti.size())  # number of rows per (Gender, Age) pair
print('====' * 10)
print(grouped.get_group('Female'))  # rows belonging to a single group
print('====' * 10)
# With several grouping keys the group is addressed by a tuple of values.
# get_group() hands back an ordinary DataFrame, not a groupby object.
print(grouped_muti.get_group(('Female', 17)))

# reset_index() renumbers the rows from 0 and keeps the old labels in a new
# 'index' column. `df` is rebound to the Female subset here, while `grouped`
# still refers to the groups of the full table built above.
df = grouped.get_group('Female').reset_index()
print('====' * 10)
print(df)

# Aggregations such as count()/max()/mean() return a DataFrame whose index is
# the grouping key (Gender) and whose columns are the remaining columns.
print('====' * 10)
print(grouped.count())
print('====' * 10)
print(grouped[['Age', 'Score']].max())  # aggregate only selected columns
print('====' * 10)
print(grouped.max())  # aggregate every non-key column
print('====' * 10)
# Select the numeric columns *before* averaging: mean() is undefined for the
# string column 'Name', and newer pandas versions no longer drop it silently.
print(grouped[['Age', 'Score']].mean())
输出结果值:
Id Name Gender Age Score
0 0 Alen Male 18 80
1 1 Bob Male 19 90
2 2 Cidy Female 18 93
3 3 Daniel Male 20 87
4 4 Ellen Female 17 96
5 5 Frankie Male 21 100
6 6 Gate Male 20 88
7 7 Hebe Female 22 98
<class 'pandas.core.groupby.generic.DataFrameGroupBy'>
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fba4045a710>
========================================
Gender
Female 3
Male 5
dtype: int64
========================================
Gender Age
Female 17 1
18 1
22 1
Male 18 1
19 1
20 2
21 1
dtype: int64
========================================
Id Name Gender Age Score
2 2 Cidy Female 18 93
4 4 Ellen Female 17 96
7 7 Hebe Female 22 98
========================================
Id Name Gender Age Score
4 4 Ellen Female 17 96
========================================
index Id Name Gender Age Score
0 2 2 Cidy Female 18 93
1 4 4 Ellen Female 17 96
2 7 7 Hebe Female 22 98
========================================
Id Name Age Score
Gender
Female 3 3 3 3
Male 5 5 5 5
========================================
Age Score
Gender
Female 22 98
Male 21 100
========================================
Id Name Age Score
Gender
Female 7 Hebe 22 98
Male 6 Gate 21 100
========================================
Age Score
Gender
Female 19.0 95.666667
Male 19.6 89.000000
Process finished with exit code 0
第二次上传
import pandas as pd
# Sample roster used throughout the walkthrough: one row per student.
data = {
    'Id': [0, 1, 2, 3, 4, 5, 6, 7],
    'Name': ['Alen', 'Bob', 'Cidy', 'Daniel',
             'Ellen', 'Frankie', 'Gate', 'Hebe'],
    'Gender': ['Male', 'Male', 'Female', 'Male',
               'Female', 'Male', 'Male', 'Female'],
    'Age': [18, 19, 18, 20, 17, 21, 20, 22],
    'Score': [80, 90, 93, 87, 96, 100, 88, 98],
    # NOTE: the 'Timestamp' column below stays disabled; it lists only six
    # values for eight rows, so it cannot be enabled as written.
    # 'Timestamp': [1506959142820, 1506959172820, 1506959056066,
    #               1506959086066, 1506959088613, 1506959118613]
}
df = pd.DataFrame(data)

# groupby() returns a DataFrameGroupBy object rather than a DataFrame.
grouped = df.groupby('Gender')
print(df)
print(type(grouped))  # <class 'pandas.core.groupby.generic.DataFrameGroupBy'>
print(grouped)  # prints only the object's repr, not the grouped rows

# Passing a list of keys forms one group per distinct (Gender, Age) pair.
grouped_muti = df.groupby(['Gender', 'Age'])

print('====' * 10)
print(grouped.size())  # number of rows per Gender
print('====' * 10)
print(grouped_muti.size())  # number of rows per (Gender, Age) pair
print('====' * 10)
print(grouped.count())  # non-null count of every non-key column
print('====' * 10)
# numeric_only=True is spelled out because the frame contains the string
# column 'Name'; relying on the default triggers a FutureWarning.
print(grouped.mean(numeric_only=True))
print('====' * 10)
print(grouped.sum(numeric_only=True))
print('====' * 10)
print(grouped_muti['Name'].count())  # count a single column per group

# agg() accepts a mapping of column -> list of aggregations; the result has
# two-level columns such as ('Age', 'sum') and ('Score', 'max').
agg_dict = {
    'Age': ['sum', 'mean'],
    'Score': ['max', 'min'],
}
multi_grouped_agg = df.groupby('Gender').agg(agg_dict)
print('====' * 10)
print(multi_grouped_agg)

# Selecting one column from the groupby yields a per-group view of it.
grouped_custom = df.groupby('Gender')['Score']
print('====' * 10)
print(grouped_custom.count())
def custom_aggregation(arr):
    """Return the spread of ``arr``: its maximum minus its minimum.

    ``arr`` only needs to provide ``max()`` and ``min()`` methods; in this
    script it is passed to ``agg`` on a grouped column, so it is called once
    per group with that group's values.
    """
    return arr.max() - arr.min()
# Age spread per gender using the custom aggregator:
# Female 22 - 17 = 5, Male 21 - 18 = 3.
age_by_gender = df.groupby('Gender')['Age']
grouped_custom = age_by_gender.agg(custom_aggregation)
separator = '====' * 10
print(separator)
print(grouped_custom)

# Applying a transform to the groups:
# transform() broadcasts the per-group result back onto the rows of the
# original DataFrame, so every row of one gender receives the same value.
group_age_total = age_by_gender.transform('sum')
df['GroupedSum'] = group_age_total
print(separator)
print(df)

# Next: use apply() to run a custom function on each group.
def custom_function(group):
    """Return the element-wise product of the group's Age and Score columns.

    ``group`` must support lookup by the keys ``'Age'`` and ``'Score'``; in
    this script it is the sub-frame that ``groupby(...).apply`` passes in for
    one group.
    """
    return group['Age'] * group['Score']
# Run custom_function once per gender. Only the columns the function reads are
# selected, so the grouping column itself is not handed to the function
# (newer pandas releases deprecate passing it). The result is a Series indexed
# by (Gender, original row label).
result = df.groupby('Gender')[['Age', 'Score']].apply(custom_function)
print('====' * 10)
print(result)
#测试函数的使用
E:\anacondaanzhuangweizhi\envs\ashy\python.exe E:/ZGW/PycharmProjects1/pythonProject1/Data_an/FinancialRiskControl/groupby.py
E:\anacondaanzhuangweizhi\envs\ashy\lib\site-packages\numpy\_distributor_init.py:30: UserWarning: loaded more than 1 DLL from .libs:
E:\anacondaanzhuangweizhi\envs\ashy\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
E:\anacondaanzhuangweizhi\envs\ashy\lib\site-packages\numpy\.libs\libopenblas64__v0.3.21-gcc_10_3_0.dll
warnings.warn("loaded more than 1 DLL from .libs:"
Id Name Gender Age Score
0 0 Alen Male 18 80
1 1 Bob Male 19 90
2 2 Cidy Female 18 93
3 3 Daniel Male 20 87
4 4 Ellen Female 17 96
5 5 Frankie Male 21 100
6 6 Gate Male 20 88
7 7 Hebe Female 22 98
<class 'pandas.core.groupby.generic.DataFrameGroupBy'>
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x0000024E17252D30>
========================================
Gender
Female 3
Male 5
dtype: int64
========================================
Gender Age
Female 17 1
18 1
22 1
Male 18 1
19 1
20 2
21 1
dtype: int64
========================================
Id Name Age Score
Gender
Female 3 3 3 3
Male 5 5 5 5
========================================
Id Age Score
Gender
Female 4.333333 19.0 95.666667
Male 3.000000 19.6 89.000000
========================================
Id Age Score
Gender
Female 13 57 287
Male 15 98 445
========================================
E:\ZGW\PycharmProjects1\pythonProject1\Data_an\FinancialRiskControl\groupby.py:24: FutureWarning: The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
print(grouped.mean())
E:\ZGW\PycharmProjects1\pythonProject1\Data_an\FinancialRiskControl\groupby.py:26: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
print(grouped.sum())
Gender Age
Female 17 1
18 1
22 1
Male 18 1
19 1
20 2
21 1
Name: Name, dtype: int64
========================================
Age Score
sum mean max min
Gender
Female 57 19.0 98 93
Male 98 19.6 100 80
========================================
Gender
Female 3
Male 5
Name: Score, dtype: int64
========================================
Gender
Female 5
Male 3
Name: Age, dtype: int64
========================================
Id Name Gender Age Score GroupedSum
0 0 Alen Male 18 80 98
1 1 Bob Male 19 90 98
2 2 Cidy Female 18 93 57
3 3 Daniel Male 20 87 98
4 4 Ellen Female 17 96 57
5 5 Frankie Male 21 100 98
6 6 Gate Male 20 88 98
7 7 Hebe Female 22 98 57
========================================
Gender
Female 2 1674
4 1632
7 2156
Male 0 1440
1 1710
3 1740
5 2100
6 1760
dtype: int64
Process finished with exit code 0