from __future__ import division
from numpy.random import randn
import numpy as np
import os
import matplotlib.pyplot as plt
from io import StringIO
# StringIO模块主要用于在内存缓冲区中读写数据。模块是用类编写的,只有一个StringIO类,
# 所以它的可用方法都在类中。此类中的大部分函数都与对文件的操作方法类似。
np.random.seed(12345)
plt.rc('figure',figsize=(10,6))
from pandas import Series,DataFrame
import pandas as pd
np.set_printoptions(precision=4)
pd.options.display.notebook_repr_html=False #当为True时,IPython笔记本将为pandas对象使用html表示(如果可用)。
get_ipython().magic(u'matplotlib inline')
#GroupBy技术
df=DataFrame({'key1':['a', 'a', 'b', 'b', 'a'],
'key2' : ['one', 'two', 'one', 'two', 'one'],
'data1' : np.random.randn(5),
'data2' : np.random.randn(5) })
grouped=df['data1'].groupby(df['key1'])#根据key1进行分组,取data1的值
sorted(grouped)
grouped.mean() #a,b两组的均值
means=df['data1'].groupby([df['key1'],df['key2']]).mean() #根据key1,key2进行分组,求data1的均值
means
#另一种表示方式
means_copy=df.groupby('key1')['data1'].mean()
means_copy
means_copy1=df.groupby(['key1','key2'])['data1'].mean().sort_values(ascending=False).to_frame().reset_index()
means_copy1
#对分组进行迭代
for name,group in df.groupby('key1'):
print({name:group})
for (k1,k2),group in df.groupby(['key1','key2']):
print((k1,k2))
print(group)
pieces=dict(list(df.groupby('key1')))
pieces
grouped = df.groupby(df.dtypes, axis=1)
dict(list(grouped))
df.dtypes
#选择一个或一组列
df.groupby('key1')['data1']#产生的是Series
df.groupby('key1')[['data2']]#产生的是DataFrame
df['data1'].groupby(df['key1'])
df[['data2']].groupby(df['key1'])
df.groupby(['key1','key2'])[['data2']].mean()
s_grouped=df.groupby(['key1','key2'])['data2'].mean()
s_grouped
#通过字典或series进行分组
people=DataFrame(np.random.randn(5,5),
columns=['a', 'b', 'c', 'd', 'e'],
index=['Joe', 'Steve', 'Wes', 'Jim', 'Travis'])
people.loc[0:1,['a','b']]=np.nan
people
mapping={'a': 'red', 'b': 'red', 'c': 'blue',
'd': 'blue', 'e': 'red', 'f' : 'orange'}
by_column=people.groupby(mapping,axis=1)
by_column.sum()
map_series=Series(mapping)
map_series
people.groupby(map_series,axis=1).count()
dict(list(by_column))
#通过函数进行分组
people.groupby(len).sum()
key_list=['one','one','one','two','two']
people.groupby([len,key_list]).min()
key_list=['one','one','one','two','two']
people.groupby([len,key_list]).sum()
#通过索引进行分组
columns=pd.MultiIndex.from_arrays([['US','US','US','JP','JP'],[1,3,5,1,3]],names=['cty','tenor'])
hier_df=DataFrame(np.random.randn(4,5),columns=columns)
hier_df
hier_df.groupby(level='cty',axis=1).count()
#level,axis必下,否则出错
#通过level关键字出入级别编号或者名称即可:
#数据聚合
df=DataFrame({'key1':['a', 'a', 'b', 'b', 'a'],
'key2' : ['one', 'two', 'one', 'two', 'one'],
'data1' : np.random.randn(5),
'data2' : np.random.randn(5) })
df.groupby('key1')['data1'].quantile(0.9)
grouped=df.groupby('key1')
def peak_to_peak(arr):
return arr.max()-arr.min()
grouped.agg(peak_to_peak)#agg函数内调用的函数只能对分组进行聚合使用
#面向列的多函数应用agg()
grouped_pct.agg([('foo','mean'),('bar',np.std)])#为新列mean和std命名为foo,bar
functions=['count','mean','max']
result=grouped['tip_pct','total_bill'].agg(functions)
result
ftuples=[('Durchschnitt','mean'),('Abweichung',np.var)]
grouped['tip_pct','total_bill'].agg(ftuples)
grouped.agg({'tip':np.max,'size':'sum'})
grouped.agg({'tip_pct':['min','max','mean','std'],'size':'sum'})
#分组级运算和转换
k1_means=df.groupby('key1').mean().add_prefix('mean_')
pd.merge(df,k1_means,left_on='key1',right_index=True)
key=['one','two','one','two','one']
people.groupby(key).mean()
def demean(arr):
return arr-arr.mean()
demeaned=people.groupby(key).apply(demean)
demeaned
demeaned1=people.groupby(key).transform(demean)
demeaned1
#apply方法
def top(df,n=5,column='tip_pct'):
return df.sort_values(by=column)[-n:]
top(tips,n=6)
tips.groupby('smoker').apply(top)
tips.groupby(['sex','smoker']).apply(top,n=1,column='total_bill')
result=tips.groupby('smoker')['tip_pct'].describe()
result
result.unstack('smoker')
result.stack()
f=lambda x:x.describe()
tips.groupby('smoker').apply(f)
#禁止分组健
tips.groupby('smoker',group_keys=False).apply(top)
#分位数和桶分析
frame=DataFrame({'data1':np.random.randn(1000),
'data2':np.random.randn(1000)})
def get_stats(group):
return {'min':group.min(),'max': group.max(),
'count': group.count(), 'mean': group.mean()}
grouped = frame.data2.groupby(factor)
grouped.apply(get_stats).unstack()
grouping = pd.qcut(frame.data1, 10, labels=False)
grouping[:10]
grouped = frame.data2.groupby(grouping)
grouped.apply(get_stats).unstack()
#用特定于分组的值填充缺失值
s=Series(np.random.randn(6))
s[::2]=np.nan
s=s.fillna(s.mean())#平均值替换缺失值
states = ['Ohio', 'New York', 'Vermont', 'Florida',
'Oregon', 'Nevada', 'California', 'Idaho']
group_key=['East']*4+['West']*4
data=Series(np.random.randn(8),index=states)
data[['Vermont', 'Nevada', 'Idaho']]=np.nan
data.groupby(group_key).mean()
fill_mean=lambda g:g.fillna(g.mean())
data.groupby(group_key).apply(fill_mean)
fill_values={'East':0.5,'West':-1}
fill_func=lambda g:g.fillna(fill_values[g.name])
data.groupby(group_key).apply(fill_func)
#随机采样和排列(制作扑克牌)
suits=['H','S','C','D']#黑桃,红桃,方块,梅花
card_val=(list(range(1,11))+[10]*3)*4 #[1,2,3,4,5,6,7,8,9,10,10,10,10]*4
base_names=['A']+list(range(2,11))+['J','K','Q'] #[A,2,3,4,5,6,7,8,9,10,J,K,Q]
cards=[]#存储扑克牌,4种花色,共52张
for suit in ['H','S','C','D']:
cards.extend(str(num)+suit for num in base_names)#extend将一个列表添加到已有列表中,与append不同。将扑克牌号与花色连接
deck=Series(card_val,index=cards)#序列
deck[:13],'\n'
def draw(deck,n=5):
#从排中抽取5张,注意抽取方式,是一种随机选取5个的方式,即先选出一个排列,再从中拿出5个
return deck.take(np.random.permutation(len(deck))[:n])
draw(deck)
get_suit=lambda card:card[-1]#只要最后一个字母
deck.groupby(get_suit).apply(draw,n=2)
#分组加权平均数和相关系数
df=DataFrame({'category': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'],
'data': np.random.randn(8),
'weights': np.random.rand(8)})
grouped=df.groupby('category')
get_wavg=lambda g:np.average(g['data'],weights=g['weights'])
grouped.apply(get_wavg)
grouped.agg(get_wavg)
close_px=pd.read_csv('E:/python/wangyiPython/the eight week/spx.csv',parse_dates=True,index_col=0)
close_px.info()
rets=close_px.pct_change().dropna()
#pct_change是计算每列下一个数值相对于上一个值的百分比变化,所以,第一个肯定为NaN
spx_corr=lambda x:x.corrwith(x['SPX'])
by_year=rets.groupby(lambda x:x.year)
by_year.apply(spx_corr)
#透视表
tips.pivot_table(index=['sex','smoker'])
tips.pivot_table(['tip_pct','size'],index=['sex','day'],columns='smoker')
tips.pivot_table(['tip_pct','size'],index=['sex','day'],columns='smoker',margins=True)
tips.pivot_table('size',index=['sex','smoker'],columns='day',aggfunc=len,margins=True)
tips.pivot_table('size',index=['time','sex','smoker'],columns='day',aggfunc='sum',fill_value=0)
#交叉表
data = """Sample Gender Handedness
1 Female Right-handed
2 Male Left-handed
3 Female Right-handed
4 Male Right-handed
5 Male Left-handed
6 Male Right-handed
7 Female Right-handed
8 Female Left-handed
9 Male Right-handed
10 Female Right-handed"""
data=pd.read_table(StringIO(data),sep='\s+')
data
out:
Sample Gender Handedness
0 1 Female Right-handed
1 2 Male Left-handed
2 3 Female Right-handed
3 4 Male Right-handed
4 5 Male Left-handed
5 6 Male Right-handed
6 7 Female Right-handed
7 8 Female Left-handed
8 9 Male Right-handed
9 10 Female Right-handed
pd.crosstab(data.Gender,data.Handedness,margins=True) #按Gender分组,统计Handedness的频数
pd.crosstab([tips.time,tips.day],tips.smoker,margins=True)