Pandas_应用:通用分拆-应用-联合

1.应用

'''
选出小费比,最高的5组
'''
#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', None)
def top(df,n,column):
    return df.sort_values(by=column)[-n:]
tips = pd.read_csv('tips.csv')
grouped = tips.groupby(['day','smoker']).apply(top,5,'total_bill')
print(grouped)
'''
                 total_bill    tip     sex smoker   day    time  size
day  smoker                                                          
Fri  No     99        12.46   1.50    Male     No   Fri  Dinner     2
            223       15.98   3.00  Female     No   Fri   Lunch     3
            91        22.49   3.50    Male     No   Fri  Dinner     2
            94        22.75   3.25  Female     No   Fri  Dinner     2
     Yes    93        16.32   4.30  Female    Yes   Fri  Dinner     2
            98        21.01   3.00    Male    Yes   Fri  Dinner     2
            96        27.28   4.00    Male    Yes   Fri  Dinner     2
            90        28.97   3.00    Male    Yes   Fri  Dinner     2
            95        40.17   4.73    Male    Yes   Fri  Dinner     4
Sat  No     39        31.27   5.00    Male     No   Sat  Dinner     3
            238       35.83   4.67  Female     No   Sat  Dinner     3
            23        39.42   7.58    Male     No   Sat  Dinner     4
            59        48.27   6.73    Male     No   Sat  Dinner     4
            212       48.33   9.00    Male     No   Sat  Dinner     4
     Yes    237       32.83   1.17    Male    Yes   Sat  Dinner     2
            56        38.01   3.00    Male    Yes   Sat  Dinner     4
            207       38.73   3.00    Male    Yes   Sat  Dinner     4
            102       44.30   2.50  Female    Yes   Sat  Dinner     3
            170       50.81  10.00    Male    Yes   Sat  Dinner     3
'''

2.压缩分组键

#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', None)
def top(df,n,column):
    return df.sort_values(by=column)[-n:]
tips = pd.read_csv('tips.csv')
'''
group_keys=False不用分组键作为索引
'''
grouped = tips.groupby(['day','smoker'],group_keys=False).apply(top,5,'total_bill')
print(grouped)
'''
     total_bill    tip     sex smoker   day    time  size
99        12.46   1.50    Male     No   Fri  Dinner     2
223       15.98   3.00  Female     No   Fri   Lunch     3
91        22.49   3.50    Male     No   Fri  Dinner     2
94        22.75   3.25  Female     No   Fri  Dinner     2
93        16.32   4.30  Female    Yes   Fri  Dinner     2
98        21.01   3.00    Male    Yes   Fri  Dinner     2
96        27.28   4.00    Male    Yes   Fri  Dinner     2
'''

3.分位数与桶分析

frame=pd.DataFrame({
    'data1':np.random.randn(1000),
    'data2':np.random.rand(1000)})
quarriles=pd.cut(frame['data1'],4,labels=False)
def get(group):
    return {
        'min':group.min(),
        'max':group.max(),
        'count':group.count()
    }
groupd=frame['data1'].groupby(quarriles)
groupd_1=groupd.apply(get).unstack()
print(groupd_1)
'''
       count       max       min
data1                           
0      101.0 -1.163979 -2.651598
1      508.0  0.323614 -1.147695
2      352.0  1.805476  0.326078
3       39.0  3.301849  1.824265
'''

4.指定分组的均指填充缺失值

index=['a0','a1','a2','a3','b0','b1','b2','b3']
group_key=['a']*4+['b']*4
frame=pd.DataFrame(np.arange(8),index=index,columns=['num'])
frame[1:2]=np.nan
frame[6:7]=np.nan
frame_1=frame.groupby(group_key,group_keys=False).apply(lambda x:x.fillna(x.mean()))
print(frame_1)
'''
         num
a0  0.000000
a1  1.666667
a2  2.000000
a3  3.000000
b0  4.000000
b1  5.000000
b2  5.333333
b3  7.000000
'''

5.分组加权平均与相关性

df=pd.DataFrame({
    'category':['a']*4+['b']*4,
    'data':np.random.randn(8),
    'weights':np.random.rand(8)
})
grouped=df.groupby('category')
get_avg=lambda x :np.average(x['data'],weights=x['weights'])
result=grouped.apply(get_avg)
print(result)
'''
category
a   -0.012118
b    0.157650
'''
close_px=pd.read_csv('stock_px.csv',index_col=0,parse_dates=True)

#计算当前元素和先前元素之间的百分比变化,然后丢失缺失值
rets=close_px.pct_change().dropna()

# 计算每一列与'SPX'的相关系数
spx_corr = lambda x : x.corrwith(x['SPX'])

# 按年份分组
by_year = rets.groupby(lambda x:x.year)
# 按月分组
#by_year = rets.groupby(lambda x:x.month)

# 按年份计算各股票与SPX的相关系数
result=by_year.apply(spx_corr)
print(result)
'''
          AAPL      MSFT       XOM  SPX
2003  0.541124  0.745174  0.661265  1.0
2004  0.374283  0.588531  0.557742  1.0
2005  0.467540  0.562374  0.631010  1.0
2006  0.428267  0.406126  0.518514  1.0
2007  0.508118  0.658770  0.786264  1.0
2008  0.681434  0.804626  0.828303  1.0
2009  0.707103  0.654902  0.797921  1.0
2010  0.710105  0.730118  0.839057  1.0
2011  0.691931  0.800996  0.859975  1.0
'''

# 任意两股票的相关系数如下计算
result2=by_year.apply( lambda g : g['AAPL'].corr(g['MSFT']))
print(result2)
'''
2003    0.480868
2004    0.259024
2005    0.300093
2006    0.161735
2007    0.417738
2008    0.611901
2009    0.432738
2010    0.571946
2011    0.581987
'''

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值