# python/pandas数据挖掘（十四）-groupby,聚合，分组级运算

## groupby

import numpy as np
import pandas as pd

# Demo frame used throughout: two key columns holding string labels and
# two data columns of random values.
df = pd.DataFrame({
    'key1': list('aabba'),
    'key2': ['one', 'two', 'one', 'two', 'one'],
    'data1': np.random.randn(5),
    'data2': np.random.randn(5),
})
df
• 1
• 2
• 3
• 4
• 5
• 6

# Group the data1 column by the values of key1, then average each group.
grouped=df['data1'].groupby(df['key1'])
grouped.mean()
• 1
• 2

# Group keys do not have to be columns of df: here two separate arrays
# with one entry per row (states, years) are used as the keys.
states=np.array(['Ohio','California','California','Ohio','Ohio'])
years=np.array([2005,2005,2006,2005,2006])
df['data1'].groupby([states,years]).mean()
• 1
• 2
• 3

# Average every numeric column per key1 group. key2 holds strings, so it
# is excluded explicitly; without numeric_only=True current pandas raises
# instead of silently dropping the column.
df.groupby('key1').mean(numeric_only=True)
• 1

## 对分组进行迭代

# Iterating a GroupBy yields (group key, sub-frame) pairs.
for name, group in df.groupby('key1'):
    print(name)
    print(group)
• 1
• 2
• 3

# With several grouping keys, each group key is a tuple of key values,
# unpacked here into k1 and k2.
for (k1, k2), group in df.groupby(['key1', 'key2']):
    print('===k1,k2:')
    print(k1, k2)
    print('===k3:')
    print(group)
• 1
• 2
• 3
• 4
• 5

# Materialize the groups as a dict mapping each key1 value to its sub-frame.
piece=dict(list(df.groupby('key1')))
piece

{'a':       data1     data2 key1 key2
0 -0.233405 -0.756316    a  one
1 -0.232103 -0.095894    a  two
4  1.056224  0.736629    a  one, 'b':       data1     data2 key1 key2
2  0.200875  0.598282    b  one
3 -1.437782  0.107547    b  two}

piece['a']
• 1
• 2
• 3
• 4
• 5
• 6
• 7
• 8
• 9
• 10
• 11
• 12

groupby默认是在axis=0上进行分组的，通过设置也可以在其他任何轴上进行分组.

# Group the columns (axis=1) by their dtype and collect the groups into a
# dict keyed by dtype.
# NOTE(review): the axis argument of groupby is deprecated in recent pandas
# releases — confirm the pandas version before relying on this call.
grouped=df.groupby(df.dtypes, axis=1)
dict(list(grouped))

{dtype('float64'):       data1     data2
0 -0.233405 -0.756316
1 -0.232103 -0.095894
2  0.200875  0.598282
3 -1.437782  0.107547
4  1.056224  0.736629, dtype('O'):   key1 key2
0    a  one
1    a  two
2    b  one
3    b  two
4    a  one}

• 1
• 2
• 3
• 4
• 5
• 6
• 7
• 8
• 9
• 10
• 11
• 12
• 13
• 14
• 15

## 选取一个或者一组列

# Select only data2 before aggregating; the double brackets pass a list of
# column names rather than a single name.
df.groupby(['key1','key2'])[['data2']].mean()

• 1
• 2

## 通过字典或者series进行分组

# 5x5 frame of random values: one row per person, columns 'a'..'e'.
people = pd.DataFrame(
    np.random.randn(5, 5),
    columns=list('abcde'),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
)

# Set a few NaN values. `.ix` no longer exists in pandas; with this string
# index its integer slice 2:3 was positional, so pick the row by position
# and the columns by label to get the same cells.
people.loc[people.index[2:3], ['b', 'c']] = np.nan
people
• 1
• 2
• 3
• 4
• 5
• 6

# Map column names to group labels. 'f' has no matching column in people.
mapping={'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'}

# Group the columns (axis=1) by the label each column name maps to.
by_column=people.groupby(mapping,axis=1)

# Sum the columns that share a label.
by_column.sum()
• 1
• 2
• 3
• 4
• 5

Series 也一样

# The same column-to-label mapping, held in a Series instead of a dict.
map_series=pd.Series(mapping)
map_series

a       red
b       red
c      blue
d      blue
e       red
f    orange
dtype: object

# Group the columns by the Series' labels and count the entries per group.
people.groupby(map_series,axis=1).count()
• 1
• 2
• 3
• 4
• 5
• 6
• 7
• 8
• 9
• 10
• 11
• 12

## 通过函数进行分组


# Group by a function: len is called on each index label (the names), so
# rows are grouped by name length.
people.groupby(len).sum()
# Sample output (the data is random, so values differ between runs):
#           a         b         c         d         e
# 3 -1.308709 -2.353354  1.585584  2.908360 -1.267162
# 5 -0.688506 -0.187575 -0.048742  1.491272 -0.636704
# 6  0.110028 -0.932493  1.343791 -1.928363 -0.364745
• 1
• 2

 key_list=['one','one','one','two','two'] people.groupby([len,key_list]).sum()
• 1

## 根据索引级别进行分组

# Build a frame whose columns carry a two-level MultiIndex named
# ('cty', 'tenor'), filled with 4 rows of random values.
columns=pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],[1,3,5,1,3]],names=['cty','tenor'])
hier_df=pd.DataFrame(np.random.randn(4,5),columns=columns)
hier_df
• 1
• 2
• 3

# Group the columns by the 'cty' level of the column index and count the
# entries in each group.
hier_df.groupby(level='cty',axis=1).count()
• 1

## 面向列的多函数应用

# NOTE(review): `tips` and its `tip_pct` column are only created further
# down this page — make sure they exist before running this snippet.
grouped=tips.groupby(['sex','smoker'])
grouped_pct=grouped['tip_pct'] # the tip_pct column
grouped_pct.agg('mean')# for the statistics described in Table 9-1, the function name can be passed directly as a string

# If a list of functions is passed instead, the resulting DataFrame's columns are named after those functions
• 1
• 2
• 3
• 4
• 5

## 分组级运算和转换

transform 会将一个函数应用到各个分组，然后将结果放置到适当的位置上。如果各分组产生的是一个标量值，则该标量值会被广播出去。

transform也是有严格条件的特殊函数：传入的函数只能产生两种结果，要么产生一个可以广播的标量值(如：np.mean), 要么产生一个相同大小的结果数组。

# Rebuild the demo frame: random values, one row per person, columns 'a'..'e'.
people = pd.DataFrame(
    data=np.random.randn(5, 5),
    index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'],
    columns=list('abcde'),
)
people

• 1
• 2
• 3
• 4
• 5

# One group label per row of people; aggregate each group to its mean.
key=['one','two','one','two','one']
people.groupby(key).mean()
• 1
• 2

# transform computes each group's mean and broadcasts it back to every row
# of that group.
people.groupby(key).transform(np.mean)
• 1

def demean(arr):
    """Return *arr* with its own mean subtracted from every element.

    The result has the same size as the input, which is one of the two
    result shapes ``groupby(...).transform`` accepts.
    """
    return arr - arr.mean()

# Subtract each group's mean from the rows of that group.
demeaned=people.groupby(key).transform(demean)
demeaned
• 1
• 2
• 3
• 4
• 5
# Sanity check: the group means of the demeaned data should be zero, up to
# floating-point error.
demeaned.groupby(key).mean()
• 1

# Load tips.csv from a hard-coded local Windows path and preview the first
# five rows.
# NOTE(review): the absolute path is machine-specific — adjust before running.
tips=pd.read_csv('C:\\Users\\ecaoyng\\Desktop\\work space\\Python\\py_for_analysis_code\\pydata-book-master\\ch08\\tips.csv')
tips[:5]
• 1
• 2

# Add tip_pct: the tip as a fraction of the total bill. Preview six rows.
tips['tip_pct']=tips['tip']/tips['total_bill']
tips[:6]
• 1
• 2

def top(df, n=5, column='tip_pct'):
    """Return the *n* rows of *df* with the largest values in *column*.

    Rows come back sorted ascending by *column*, so the largest value is
    last. If *df* has fewer than *n* rows, all of them are returned.
    """
    # sort_index(by=...) was removed from pandas; sort_values is the
    # replacement. tail(n) matches the old [-n:] slice except that n=0 now
    # gives an empty frame instead of every row.
    return df.sort_values(by=column).tail(n)
top(tips,n=6)
• 1
• 2
• 3

# Call top separately on each smoker group.
tips.groupby('smoker').apply(top)
• 1

# Arguments given after the function are forwarded to top: for each
# (smoker, day) group, request one row ranked by total_bill.
tips.groupby(['smoker','day']).apply(top,n=1,column='total_bill')
• 1

## 分位数和桶分析

cut 和 qcut 与 groupby 结合起来，能轻松地对数据集进行桶(bucket)分析或分位数(quantile)分析。

# 1000 rows with two columns of random values; preview the first five.
frame = pd.DataFrame(
    {
        'data1': np.random.randn(1000),
        'data2': np.random.randn(1000),
    }
)
frame[:5]
• 1
• 2
• 3

# Cut data1 into 4 bins; the result labels each row with its interval.
factor=pd.cut(frame.data1,4)
factor[:10]

0    (0.281, 2.00374]
1    (0.281, 2.00374]
2    (-3.172, -1.442]
3     (-1.442, 0.281]
4    (0.281, 2.00374]
5    (0.281, 2.00374]
6     (-1.442, 0.281]
7     (-1.442, 0.281]
8     (-1.442, 0.281]
9     (-1.442, 0.281]
Name: data1, dtype: category
Categories (4, object): [(-3.172, -1.442] < (-1.442, 0.281] < (0.281, 2.00374] < (2.00374, 3.727]]
• 1
• 2
• 3
• 4
• 5
• 6
• 7
• 8
• 9
• 10
• 11
• 12
• 13
• 14
• 15
def get_stats(group):
    """Summarize *group* as a dict holding its min, max, count and mean."""
    return {
        'min': group.min(),
        'max': group.max(),
        'count': group.count(),
        'mean': group.mean(),
    }
# Group data2 by the data1 bin of each row. get_stats returns one dict per
# group; unstack then turns the statistic names into columns.
grouped=frame.data2.groupby(factor)
grouped.apply(get_stats).unstack()
• 1
• 2
• 3
• 4

# Same summary, but bucketing data1 into 10 quantile-based bins.
grouping=pd.qcut(frame.data1,10,labels=False)# labels=False returns only the quantile bin numbers
grouped=frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()
• 1
• 2
• 3

©️2019 CSDN 皮肤主题: 大白 设计师: CSDN官方博客