pandas—数据聚合与分组运算

In [1]:
import pandas as pd
import numpy as np
from pandas import *
df=DataFrame({
         'key1':['a','a','b','b','a'],
             'key2':['one','two','one','two','one'],
             'data1':np.random.randn(5),
             'data2':np.random.randn(5)})
df
Out[1]:
  data1 data2 key1 key2
0 1.711100 1.265615 a one
1 -0.937672 0.505905 a two
2 0.332802 0.544956 b one
3 -0.888079 0.024554 b two
4 0.542374 -0.455125 a one
In [4]:
grouped=df['data1'].groupby(df['key1'])
print(grouped)   #分组对象
grouped.mean()
<pandas.core.groupby.SeriesGroupBy object at 0x00000223D6390208>
Out[4]:
key1
a    0.249353
b   -0.092150
Name: data1, dtype: float64
In [6]:
means=df['data1'].groupby([df['key1'],df['key2']]).mean()
means
Out[6]:
key1  key2
a     one     0.507231
      two    -0.266402
b     one    -1.247499
      two     1.063199
Name: data1, dtype: float64
In [7]:
means.unstack()    #层次化索引!
Out[7]:
key2 one two
key1    
a 0.507231 -0.266402
b -1.247499 1.063199
In [8]:
states=np.array(['Ohio','California','California','Ohio','Ohio'])
years=np.array([2005,2005,2006,2005,2006])
df['data1'].groupby([states,years]).mean()
Out[8]:
California  2005   -0.266402
            2006   -1.247499
Ohio        2005    0.607577
            2006    0.862507
Name: data1, dtype: float64
In [9]:
df
Out[9]:
  data1 data2 key1 key2
0 0.151954 -0.834709 a one
1 -0.266402 0.469696 a two
2 -1.247499 1.402765 b one
3 1.063199 0.650468 b two
4 0.862507 -1.519559 a one
In [14]:
print(df.groupby('key1').mean())
print(df.groupby(['key1','key2']).mean())
df.groupby(['key1','key2']).size()           #size()用法
         data1     data2
key1                    
a     0.249353 -0.628191
b    -0.092150  1.026616
              data1     data2
key1 key2                    
a    one   0.507231 -1.177134
     two  -0.266402  0.469696
b    one  -1.247499  1.402765
     two   1.063199  0.650468
Out[14]:
key1  key2
a     one     2
      two     1
b     one     1
      two     1
dtype: int64
In [16]:
for name,group in df.groupby('key1'):
    print (name)
    print (group)    #查看分组结果
a
      data1     data2 key1 key2
0  0.151954 -0.834709    a  one
1 -0.266402  0.469696    a  two
4  0.862507 -1.519559    a  one
b
      data1     data2 key1 key2
2 -1.247499  1.402765    b  one
3  1.063199  0.650468    b  two
In [18]:
for (k1,k2),group in df.groupby(['key1','key2']):    #根据两个键分组
    print (k1,k2)
    print (group) 
a one
      data1     data2 key1 key2
0  0.151954 -0.834709    a  one
4  0.862507 -1.519559    a  one
a two
      data1     data2 key1 key2
1 -0.266402  0.469696    a  two
b one
      data1     data2 key1 key2
2 -1.247499  1.402765    b  one
b two
      data1     data2 key1 key2
3  1.063199  0.650468    b  two
In [20]:
pieces=dict(list(df.groupby('key1')))    #可以将分组结果以字典的形式保存
print(pieces['a'])
print(pieces['b'])
      data1     data2 key1 key2
0  0.151954 -0.834709    a  one
1 -0.266402  0.469696    a  two
4  0.862507 -1.519559    a  one
      data1     data2 key1 key2
2 -1.247499  1.402765    b  one
3  1.063199  0.650468    b  two
In [21]:
df.groupby(['key1','key2'])[['data2']].mean()  #只看data2
Out[21]:
    data2
key1 key2  
a one -1.177134
two 0.469696
b one 1.402765
two 0.650468
In [22]:
people=DataFrame(np.random.randn(5,5),
         columns=['a','b','c','d','e'],
         index=['Joe','Steve','Wes','Jim','Travis'])
people.ix[2:3,['b','c']]=np.nan
people
D:\Anaconda3\lib\site-packages\ipykernel_launcher.py:4: DeprecationWarning: 
.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  after removing the cwd from sys.path.
Out[22]:
  a b c d e
Joe 0.523195 0.625608 -1.833872 0.102788 -0.080301
Steve -1.789058 1.342658 1.592765 -0.452598 -0.651140
Wes 0.217548 NaN NaN 2.802176 -1.738924
Jim -0.655526 -1.468537 -1.281012 0.162456 -0.558902
Travis 0.051424 -1.124781 1.375842 0.562162 -0.032887
In [24]:
mapping={
         'a':'red','b':'red','c':'blue','d':'blue','e':'red','f':'orange'}
by_column=people.groupby(mapping,axis=1)
by_column
by_column.sum()
Out[24]:
  blue red
Joe -1.731084 1.068502
Steve 1.140167 -1.097540
Wes 2.802176 -1.521376
Jim -1.118556 -2.682965
Travis 1.938004 -1.106244
In [25]:
#通过函数进行分组
people
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值