# GroupBy techniques, data aggregation, pivot tables and cross-tabulation
# (Groupby技术, 数据聚合, 透视表和交叉表)

from __future__ import division
from numpy.random import randn
import numpy as np
import os
import matplotlib.pyplot as plt
from io import StringIO

# StringIO provides an in-memory text buffer with a file-like interface;
# most of its methods mirror the ones on regular file objects.

np.random.seed(12345)
plt.rc('figure',figsize=(10,6))
from pandas import Series,DataFrame
import pandas as pd

np.set_printoptions(precision=4)

pd.options.display.notebook_repr_html=False #when True, IPython notebooks render pandas objects as HTML (if available)
get_ipython().magic(u'matplotlib inline')  # IPython/Jupyter only; raises NameError in a plain script
# GroupBy basics

df = pd.DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                   'key2': ['one', 'two', 'one', 'two', 'one'],
                   'data1': np.random.randn(5),
                   'data2': np.random.randn(5)})

# Group one column by another (aligned) Series of keys.
grouped = df['data1'].groupby(df['key1'])
sorted(grouped)  # (key, sub-Series) pairs, ordered by key

grouped.mean()  # per-group mean of data1

# Two grouping keys at once -> result indexed by a (key1, key2) MultiIndex.
means = df['data1'].groupby([df['key1'], df['key2']]).mean()
means

# Equivalent spelling: group the frame first, then select the column.
means_copy = df.groupby('key1')['data1'].mean()
means_copy

# Same idea, then sort descending and flatten back into a plain DataFrame.
means_copy1 = (df.groupby(['key1', 'key2'])['data1']
                 .mean()
                 .sort_values(ascending=False)
                 .to_frame()
                 .reset_index())
means_copy1


# Iterating over groups

# Each iteration yields (group name, sub-DataFrame).
for name,group in df.groupby('key1'):
    print({name:group})

# With multiple keys, the group name is a tuple of key values.
for (k1,k2),group in df.groupby(['key1','key2']):
    print((k1,k2))
    print(group)

# Materialize the groups as a dict of name -> sub-DataFrame.
pieces=dict(list(df.groupby('key1')))
pieces

# Group the COLUMNS by their dtype.
# NOTE(review): groupby(..., axis=1) is deprecated in pandas 2.x — confirm
# the target pandas version before relying on this.
grouped = df.groupby(df.dtypes, axis=1)
dict(list(grouped))

df.dtypes

# Selecting one column or a subset of columns

df.groupby('key1')['data1']#single column -> SeriesGroupBy
df.groupby('key1')[['data2']]#list of columns -> DataFrameGroupBy
# The two lines below are the long-hand equivalents of the two above.
df['data1'].groupby(df['key1'])
df[['data2']].groupby(df['key1'])

# Mean of data2 per (key1, key2) group, as a one-column DataFrame.
df.groupby(['key1','key2'])[['data2']].mean()

# Same aggregation, but selecting a single column yields a Series.
s_grouped=df.groupby(['key1','key2'])['data2'].mean()
s_grouped

# Grouping with a dict or a Series

# 5x5 frame of random values: people as rows, letters as columns.
people = DataFrame(np.random.randn(5, 5),
                   columns=['a', 'b', 'c', 'd', 'e'],
                   index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
# BUG FIX: the original `people.loc[0:1, ['a', 'b']]` raises a TypeError —
# .loc cannot take an integer slice against a string-labelled index (this
# was a bad conversion of the book's positional `.ix` call). Slice the
# index positionally instead: first row, columns a and b.
people.loc[people.index[0:1], ['a', 'b']] = np.nan
people

# Column -> group mapping; unused keys (such as 'f') are simply ignored.
mapping = {'a': 'red', 'b': 'red', 'c': 'blue',
           'd': 'blue', 'e': 'red', 'f': 'orange'}
# NOTE(review): groupby(..., axis=1) is deprecated in pandas 2.x; kept
# here for fidelity with the original example.
by_column = people.groupby(mapping, axis=1)
by_column.sum()

# A Series works the same way as the dict.
map_series = Series(mapping)
map_series

people.groupby(map_series, axis=1).count()  # count() skips the NaNs set above

dict(list(by_column))

# Grouping via a function: the function is called once per index label
# (here: len of each person's name) and its return value is the group key.

people.groupby(len).sum()

# Functions can be mixed with arrays/lists; everything is converted to
# arrays internally and the keys are matched element-wise.
key_list=['one','one','one','two','two']
people.groupby([len,key_list]).min()

key_list=['one','one','one','two','two']
people.groupby([len,key_list]).sum()

# Grouping by index level

# Two-level column index: country code (cty) over tenor.
columns = pd.MultiIndex.from_arrays(
    [['US', 'US', 'US', 'JP', 'JP'], [1, 3, 5, 1, 3]],
    names=['cty', 'tenor'])
hier_df = DataFrame(np.random.randn(4, 5), columns=columns)
hier_df

# Pass the level name (or number) via the `level` keyword; axis=1 groups
# the columns rather than the rows. Both keywords are required here.
hier_df.groupby(level='cty', axis=1).count()

# Data aggregation

df = DataFrame({'key1': ['a', 'a', 'b', 'b', 'a'],
                'key2': ['one', 'two', 'one', 'two', 'one'],
                'data1': np.random.randn(5),
                'data2': np.random.randn(5)})

# 90th percentile of data1 within each key1 group.
df.groupby('key1')['data1'].quantile(0.9)

grouped = df.groupby('key1')

def peak_to_peak(arr):
    """Return the range (max - min) of a group's values."""
    return arr.max() - arr.min()

# BUG FIX: aggregating every column would include the string column 'key2',
# where max - min raises a TypeError in modern pandas (older pandas silently
# dropped such "nuisance" columns). Aggregate the numeric columns only.
grouped[['data1', 'data2']].agg(peak_to_peak)

# Column-wise application of multiple functions via agg()
# NOTE(review): this section is copied from the book's `tips` example —
# `grouped_pct` and the tip_pct/total_bill/tip/size columns are NOT defined
# anywhere in this file, so these statements do not run as-is.
grouped_pct.agg([('foo','mean'),('bar',np.std)])#(name, function) tuples rename the result columns to foo / bar

# A list of functions produces one result column per function.
functions=['count','mean','max']
# NOTE(review): grouped['tip_pct','total_bill'] (tuple indexing) is
# deprecated; modern pandas expects a list: grouped[['tip_pct','total_bill']].
result=grouped['tip_pct','total_bill'].agg(functions)
result

# (name, function) tuples again, applied to two columns at once.
ftuples=[('Durchschnitt','mean'),('Abweichung',np.var)]
grouped['tip_pct','total_bill'].agg(ftuples)

# A dict maps each column to its own aggregation(s).
grouped.agg({'tip':np.max,'size':'sum'})

grouped.agg({'tip_pct':['min','max','mean','std'],'size':'sum'})


# Group-wise operations and transformations

# BUG FIX: DataFrame.mean() no longer silently drops non-numeric columns
# (key2 here would raise a TypeError); request numeric columns explicitly.
k1_means = df.groupby('key1').mean(numeric_only=True).add_prefix('mean_')
# Attach each row's group means by joining on key1 against the means' index.
pd.merge(df, k1_means, left_on='key1', right_index=True)


# Group by an external key list aligned positionally with the rows.
key=['one','two','one','two','one']
people.groupby(key).mean()

def demean(arr):
    """Subtract the mean from every value, making the result zero-mean."""
    return arr-arr.mean()

# apply and transform produce the same values here; transform additionally
# guarantees an output indexed exactly like the input.
demeaned=people.groupby(key).apply(demean)
demeaned

demeaned1=people.groupby(key).transform(demean)
demeaned1

#The apply method
# NOTE(review): `tips` is the book's restaurant-tips dataset and is never
# loaded in this file; the statements below assume it exists.
def top(df,n=5,column='tip_pct'):
    """Return the n rows of df with the largest values of `column`.

    Rows are sorted ascending by `column`, so the returned slice is in
    ascending order as well.
    """
    return df.sort_values(by=column)[-n:]
top(tips,n=6)

# apply runs `top` once per group and glues the results together.
tips.groupby('smoker').apply(top)
# Extra arguments after the function are forwarded to it.
tips.groupby(['sex','smoker']).apply(top,n=1,column='total_bill')

result=tips.groupby('smoker')['tip_pct'].describe()
result
result.unstack('smoker')
result.stack()

# Equivalent: describe() inside groupby is shorthand for this apply.
f=lambda x:x.describe()
tips.groupby('smoker').apply(f)

# Suppressing the group keys in the result index
tips.groupby('smoker',group_keys=False).apply(top)

# Quantile and bucket analysis

frame = DataFrame({'data1': np.random.randn(1000),
                   'data2': np.random.randn(1000)})

def get_stats(group):
    """Return summary statistics of one group as a dict of scalars."""
    return {'min': group.min(), 'max': group.max(),
            'count': group.count(), 'mean': group.mean()}

# BUG FIX: `factor` was never defined in the original; per the book's
# example, bucket data1 into 4 equal-width bins with pd.cut.
factor = pd.cut(frame.data1, 4)
grouped = frame.data2.groupby(factor)
grouped.apply(get_stats).unstack()

# Equal-size buckets (deciles) via qcut; labels=False yields bin numbers.
grouping = pd.qcut(frame.data1, 10, labels=False)
grouping[:10]
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()

# Filling missing values with group-specific values

s = Series(np.random.randn(6))
s[::2] = np.nan
s = s.fillna(s.mean())  # replace the NaNs with the overall mean

states = ['Ohio', 'New York', 'Vermont', 'Florida',
          'Oregon', 'Nevada', 'California', 'Idaho']
group_key = ['East'] * 4 + ['West'] * 4
data = Series(np.random.randn(8), index=states)
data[['Vermont', 'Nevada', 'Idaho']] = np.nan
data.groupby(group_key).mean()

def fill_mean(g):
    """Fill NaNs in a group with that group's own mean."""
    return g.fillna(g.mean())

data.groupby(group_key).apply(fill_mean)

# Preset per-group fill constants, looked up through the group's .name.
fill_values = {'East': 0.5, 'West': -1}

def fill_func(g):
    """Fill NaNs with the constant registered for this group."""
    return g.fillna(fill_values[g.name])

data.groupby(group_key).apply(fill_func)

# Random sampling and permutation (building a deck of cards)

suits = ['H', 'S', 'C', 'D']  # hearts, spades, clubs, diamonds
# Values per suit: A=1, 2..10 at face value, J/K/Q all worth 10.
card_val = (list(range(1, 11)) + [10] * 3) * 4
base_names = ['A'] + list(range(2, 11)) + ['J', 'K', 'Q']
# 52 labels such as 'AH', '2H', ..., 'QD' (rank joined with suit letter).
cards = [str(num) + suit for suit in suits for num in base_names]
deck = Series(card_val, index=cards)
deck[:13], '\n'

def draw(deck, n=5):
    """Draw n cards: take the first n positions of a random permutation."""
    return deck.take(np.random.permutation(len(deck))[:n])

draw(deck)

def get_suit(card):
    """The suit is the last character of the card label."""
    return card[-1]

deck.groupby(get_suit).apply(draw, n=2)  # two random cards from each suit

# Group weighted average

df = DataFrame({'category': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],
                'data': np.random.randn(8),
                'weights': np.random.rand(8)})
grouped = df.groupby('category')

def get_wavg(g):
    """Weighted average of g['data'] with g['weights'] as the weights."""
    return np.average(g['data'], weights=g['weights'])

# BUG FIX: the original also called grouped.agg(get_wavg), but
# DataFrameGroupBy.agg applies the function to each COLUMN separately, so
# get_wavg (which indexes g['data']) breaks there. apply passes the whole
# sub-frame and is the correct entry point for this kind of function.
grouped.apply(get_wavg)

# Yearly correlation with the S&P index.
# NOTE(review): this reads a machine-specific local CSV path; it will fail
# anywhere that file does not exist.
close_px=pd.read_csv('E:/python/wangyiPython/the eight week/spx.csv',parse_dates=True,index_col=0)
close_px.info()
rets=close_px.pct_change().dropna()
#pct_change computes each row's percent change relative to the previous row, so the first row is always NaN (dropped here)
# Correlate every column with the SPX column, one calendar year at a time.
spx_corr=lambda x:x.corrwith(x['SPX'])
by_year=rets.groupby(lambda x:x.year)#the grouping function receives each DatetimeIndex label; .year buckets rows by year
by_year.apply(spx_corr)

# Pivot tables
# NOTE(review): these assume the book's `tips` DataFrame, which is not
# defined anywhere in this file.
tips.pivot_table(index=['sex','smoker'])#default aggregation: mean of every numeric column
tips.pivot_table(['tip_pct','size'],index=['sex','day'],columns='smoker')
tips.pivot_table(['tip_pct','size'],index=['sex','day'],columns='smoker',margins=True)#margins=True adds 'All' subtotal row/column
tips.pivot_table('size',index=['sex','smoker'],columns='day',aggfunc=len,margins=True)#aggfunc=len yields group counts
tips.pivot_table('size',index=['time','sex','smoker'],columns='day',aggfunc='sum',fill_value=0)#fill_value replaces missing combinations with 0

# Cross-tabulation

data = """Sample    Gender    Handedness
1    Female    Right-handed
2    Male    Left-handed
3    Female    Right-handed
4    Male    Right-handed
5    Male    Left-handed
6    Male    Right-handed
7    Female    Right-handed
8    Female    Left-handed
9    Male    Right-handed
10    Female    Right-handed"""
# BUG FIX: the separator is a regex, so it must be a raw string —
# '\s+' in a plain literal is an invalid escape sequence.
data = pd.read_table(StringIO(data), sep=r'\s+')
data
# BUG FIX: the pasted interpreter output below was bare text in the
# original script (a syntax error); kept here as a comment for reference:
#    Sample  Gender    Handedness
# 0       1  Female  Right-handed
# 1       2    Male   Left-handed
# 2       3  Female  Right-handed
# 3       4    Male  Right-handed
# 4       5    Male   Left-handed
# 5       6    Male  Right-handed
# 6       7  Female  Right-handed
# 7       8  Female   Left-handed
# 8       9    Male  Right-handed
# 9      10  Female  Right-handed

pd.crosstab(data.Gender, data.Handedness, margins=True)  # Handedness counts per Gender, with 'All' subtotals
# NOTE(review): `tips` (the book's tips dataset) is not defined in this file.
pd.crosstab([tips.time,tips.day],tips.smoker,margins=True)

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值