#encoding:utf8
import numpy as np
import pandas as pd
'''
分组计算:
拆分:根据什么进行分组
应用:每个分组进行怎样的计算
合并:每个分组的计算结果合并起来
'''
df = pd.DataFrame(
{
'key1':['a','a','b','b','a'],
'key2':['one','two','one','two','one'],
'data1':np.random.randint(1,10,5),
'data2':np.random.randint(1,10,5),
}
)
print(df)
'''
data1 data2 key1 key2
0 3 9 a one
1 3 5 a two
2 7 3 b one
3 1 1 b two
4 2 1 a one
'''
print(df['data1'])
'''
0 7
1 7
2 4
3 1
4 8
'''
#计算data1列按照key1字段聚合求平均值
print(df['data1'].groupby(df['key1']).mean())
'''
data1 data2 key1 key2
0 1 3 a one
1 4 4 a two
2 3 9 b one
3 7 3 b two
4 1 3 a one
key1
a 2
b 5
'''
#除了可以在数据内按照聚合,也可以自定义聚合
#这里列表中元素表示data1列的元素位置聚合
#data1列下的第1,3,4个元素聚合,第2,5个元素聚合,然后求平均值
key = [1,2,1,1,2]
print(df)
print(df['data1'].groupby(key).mean())
'''
data1 data2 key1 key2
0 9 9 a one
1 5 2 a two
2 4 8 b one
3 2 4 b two
4 1 3 a one
1 5
2 3
'''
#分组也可以是多级列表
#在groupby中按照元素索引顺序进行排组的依据先后
print(df)
print(df['data1'].groupby([df['key1'],df['key2']]).sum())
print(df['data1'].groupby([df['key2'],df['key1']]).sum())
'''
data1 data2 key1 key2
0 9 8 a one
1 3 4 a two
2 7 8 b one
3 4 1 b two
4 3 7 a one
a[9,3,3]---->one[9,3]....
key1 key2
a one 12
two 3
b one 7
two 4
one[9,7,3]---->a[9,3]....
key2 key1
one a 12
b 7
two a 3
b 4
'''
#查看分组的个数
print(df['data1'].groupby([df['key1'],df['key2']]).size())
'''
key1 key2
a one 2
two 1
b one 1
two 1
'''
#按照key1进行分组
#生成的是一个DataFrame
print(df.groupby('key1').sum())
'''
data1 data2
key1
a 16 12
b 10 6
'''
#对分组之后可进行索引的选取
print(df.groupby('key1').sum()['data1'])
'''
key1
a 14
b 15
'''
#当然也可以进行多级分组,然后转换为DataFrame
mean = df.groupby(['key1','key2']).sum()
print(mean)
print(mean.unstack())
'''
data1 data2
key1 key2
a one 17 6
two 1 3
b one 7 3
two 1 6
data1 data2
key2 one two one two
key1
a 17 1 6 3
b 7 1 3 6
'''
#groupby支持迭代
for name,group in df.groupby('key1'):
print(name)
print(group)
'''
a
data1 data2 key1 key2
0 9 8 a one
1 8 8 a two
4 7 7 a one
b
data1 data2 key1 key2
2 6 1 b one
3 5 8 b two
'''
#也可以对groupby转换为字典
print(dict(list(df.groupby('key1'))))
print(dict(list(df.groupby('key1')))['a'])
print(dict(list(df.groupby('key1')))['b'])
'''
{'a': data1 data2 key1 key2
0 9 4 a one
1 4 3 a two
4 5 6 a one, 'b': data1 data2 key1 key2
2 3 7 b one
3 2 8 b two}
data1 data2 key1 key2
0 9 4 a one
1 4 3 a two
4 5 6 a one
data1 data2 key1 key2
2 3 7 b one
3 2 8 b two
'''
#按照列类型进行分组
print(df.groupby(df.dtypes,axis=1).sum())
'''
0 4 aone
1 15 atwo
2 10 bone
3 8 btwo
4 18 aone
'''
#以上都是按照列表进行分组
#下面用其他分组形式来进行分组
#通过字典进行分组
df = pd.DataFrame(
np.random.randint(1,10,(5,5)),
columns=list('abcde'),
index=['Alice','Bob','Candy','Dark','Emily']
)
#看一下处理非数字
df.ix[1,1:3] = np.NaN
print(df)
'''
a b c d e
Alice 3 3.0 7.0 7 9
Bob 4 NaN NaN 3 4
Candy 9 5.0 1.0 4 1
Dark 6 3.0 9.0 9 9
Emily 8 4.0 2.0 3 6
'''
mapping = {
'a':'red',
'b':'red',
'c':'blue',
'd':'orange',
'e':'blue'
}
grouped = df.groupby(mapping,axis=1)
print(grouped.sum())
'''
a b c d e
Alice 4 5.0 2.0 2 2
Bob 3 NaN NaN 8 1
Candy 6 7.0 6.0 9 7
Dark 7 8.0 1.0 4 3
Emily 8 6.0 8.0 3 3
blue orange red
Alice 4.0 2.0 9.0
Bob 1.0 8.0 3.0
Candy 13.0 9.0 13.0
Dark 4.0 4.0 15.0
Emily 11.0 3.0 14.0
可以看对Nan和数字分组计算是按照Nan=0来处理的
'''
print(grouped.size())
print(grouped.count())
'''
blue 2
orange 1
red 2
blue orange red
Alice 2 1 2
Bob 1 1 1
Candy 2 1 2
Dark 2 1 2
Emily 2 1 2
Nan是没有统计个数的
'''
#通过函数进行分组
df = pd.DataFrame(
np.random.randint(1,10,(5,5)),
columns=list('abcde'),
index=['Alice','Bob','Candy','Dark','Emily']
)
#按照行索引
def _group_by(idx):
print(idx)
return idx
print(df.groupby(_group_by).size())
print(df.groupby(_group_by).count())
'''
Alice
Bob
Candy
Dark
Emily
a b c d e
Alice 1 1 1 1 1
Bob 1 1 1 1 1
Candy 1 1 1 1 1
Dark 1 1 1 1 1
Emily 1 1 1 1 1
按照行来进行分组的
'''
#按照行索引长度
def _group_by2(idx):
print(idx)
return len(idx)
print(df.groupby(_group_by2).size())
print(df.groupby(_group_by2).count())
'''
Alice
Bob
Candy
Dark
Emily
a b c d e
3 1 1 1 1 1
4 1 1 1 1 1
5 3 3 3 3 3
'''
#通过索引级别进行分组
columns = pd.MultiIndex.from_arrays(
[
['China','USA','China','USA','China'],
['A','A','B','C','B']
],
names=['country','index']
)
df = pd.DataFrame(np.random.randint(1,10,(5,5)),columns=columns)
print(df)
'''
country China USA China USA China
index A A B C B
0 9 5 4 5 2
1 8 5 4 6 9
2 2 7 3 1 3
3 2 1 6 5 5
4 5 3 6 9 4
'''
print(df.groupby(level='country',axis=1).sum())
'''
country China USA
0 17 12
1 19 12
2 19 8
3 21 13
4 18 9
'''
print(df.groupby(level='index',axis=1).sum())
'''
index A B C
0 7 7 6
1 11 12 3
2 12 11 9
3 14 11 4
4 17 15 8
'''