import pandas as pd
import numpy as np
df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
'key2': ['one', 'two', 'one', 'two', 'one'],
'data1': np.random.randn(5),
'data2': np.random.randn(5)})
df
| data1 | data2 | key1 | key2 |
---|
0 | 1.244597 | 1.243988 | a | one |
---|
1 | -1.253387 | -0.544606 | a | two |
---|
2 | -1.273505 | 1.017159 | b | one |
---|
3 | -0.681958 | -0.187279 | b | two |
---|
4 | -0.683225 | 1.054060 | a | one |
---|
按‘key1’分组
grouped = df.groupby('key1')
grouped
对于分组的某一列或者多个列,应用agg(func)可以对分组后的数据应用func函数。例如:用group1[‘data1’].agg(‘mean’)对分组后的’data1’列求均值。当然也可以推广到同时作用于多个列和使用多个函数上。
grouped.agg('mean')
| data1 | data2 |
---|
key1 | | |
---|
a | -0.230672 | 0.584481 |
---|
b | -0.977731 | 0.414940 |
---|
grouped.agg(['sum','mean'])
| data1 | data2 |
---|
| sum | mean | sum | mean |
---|
key1 | | | | |
---|
a | -0.692015 | -0.230672 | 1.753442 | 0.584481 |
---|
b | -1.955463 | -0.977731 | 0.829881 | 0.414940 |
---|
grouped['data1'].agg(['sum','mean'])
| sum | mean |
---|
key1 | | |
---|
a | -0.692015 | -0.230672 |
---|
b | -1.955463 | -0.977731 |
---|
grouped[['data1']].agg(['sum','mean'])
| data1 |
---|
| sum | mean |
---|
key1 | | |
---|
a | -0.692015 | -0.230672 |
---|
b | -1.955463 | -0.977731 |
---|
pd.pivot_table(df, index='key1')
| data1 | data2 |
---|
key1 | | |
---|
a | -0.230672 | 0.584481 |
---|
b | -0.977731 | 0.414940 |
---|
pd.pivot_table(df, index='key1', columns='key2')
| data1 | data2 |
---|
key2 | one | two | one | two |
---|
key1 | | | | |
---|
a | 0.280686 | -1.253387 | 1.149024 | -0.544606 |
---|
b | -1.273505 | -0.681958 | 1.017159 | -0.187279 |
---|
透视表pivot_table,单元格中的数值为同组数值的平均值
df.pivot_table(['data1'], index='key1',columns='key2')
| data1 |
---|
key2 | one | two |
---|
key1 | | |
---|
a | 0.280686 | -1.253387 |
---|
b | -1.273505 | -0.681958 |
---|
df
| data1 | data2 | key1 | key2 |
---|
0 | 1.244597 | 1.243988 | a | one |
---|
1 | -1.253387 | -0.544606 | a | two |
---|
2 | -1.273505 | 1.017159 | b | one |
---|
3 | -0.681958 | -0.187279 | b | two |
---|
4 | -0.683225 | 1.054060 | a | one |
---|
如果将参数margins设置为True,则可以得到分项总计数据。
df.pivot_table(index='key1',columns='key2', margins=True)
| data1 | data2 |
---|
key2 | one | two | All | one | two | All |
---|
key1 | | | | | | |
---|
a | 0.280686 | -1.253387 | -0.230672 | 1.149024 | -0.544606 | 0.584481 |
---|
b | -1.273505 | -0.681958 | -0.977731 | 1.017159 | -0.187279 | 0.414940 |
---|
All | -0.237378 | -0.967672 | -0.529496 | 1.105069 | -0.365942 | 0.516665 |
---|
按’key1’,’key2’进行分组,使用describe()列出详细信息
grouped2=df.groupby(['key1','key2'])
grouped2.describe()
| | data1 | data2 |
---|
| | count | mean | std | min | 25% | 50% | 75% | max | count | mean | std | min | 25% | 50% | 75% | max |
---|
key1 | key2 | | | | | | | | | | | | | | | | |
---|
a | one | 2.0 | 0.280686 | 1.363176 | -0.683225 | -0.201270 | 0.280686 | 0.762641 | 1.244597 | 2.0 | 1.149024 | 0.134299 | 1.054060 | 1.101542 | 1.149024 | 1.196506 | 1.243988 |
---|
two | 1.0 | -1.253387 | NaN | -1.253387 | -1.253387 | -1.253387 | -1.253387 | -1.253387 | 1.0 | -0.544606 | NaN | -0.544606 | -0.544606 | -0.544606 | -0.544606 | -0.544606 |
---|
b | one | 1.0 | -1.273505 | NaN | -1.273505 | -1.273505 | -1.273505 | -1.273505 | -1.273505 | 1.0 | 1.017159 | NaN | 1.017159 | 1.017159 | 1.017159 | 1.017159 | 1.017159 |
---|
two | 1.0 | -0.681958 | NaN | -0.681958 | -0.681958 | -0.681958 | -0.681958 | -0.681958 | 1.0 | -0.187279 | NaN | -0.187279 | -0.187279 | -0.187279 | -0.187279 | -0.187279 |
---|