聚合运算1

本文详细介绍使用Python的Pandas库进行数据分组与聚合操作的方法,包括基本的sum、mean、describe等统计函数,自定义函数的使用,以及如何通过groupby方法结合agg函数实现复杂的数据分析需求。此外,还探讨了如何将分组后的统计数据加入原始数据集,以及如何应用转换函数进行数据标准化。
摘要由CSDN通过智能技术生成
import pandas as pd
import numpy as np
# Demo frame: two label columns used as grouping keys plus two columns of
# random integers drawn from [1, 10).
df = pd.DataFrame(
    dict(
        key1=list("aabba"),
        key2=["one", "two"] * 2 + ["one"],
        data1=np.random.randint(low=1, high=10, size=5),
        data2=np.random.randint(low=1, high=10, size=5),
    )
)
df
   data1  data2 key1 key2
0      9      5    a  one
1      2      6    a  two
2      2      9    b  one
3      7      4    b  two
4      4      7    a  one
# Sum only the numeric columns per key1 group. Selecting them explicitly keeps
# the string column key2 out of the result on every pandas version (newer
# releases would otherwise concatenate its strings instead of dropping it).
df.groupby("key1")[["data1", "data2"]].sum()
      data1  data2
key1
a        15     18
b         9     13
df.groupby("key1").describe()
data1data2
key1
acount3.0000003.000000
mean5.0000006.000000
std3.6055511.000000
min2.0000005.000000
25%3.0000005.500000
50%4.0000006.000000
75%6.5000006.500000
max9.0000007.000000
bcount2.0000002.000000
mean4.5000006.500000
std3.5355343.535534
min2.0000004.000000
25%3.2500005.250000
50%4.5000006.500000
75%5.7500007.750000
max7.0000009.000000
# Group by key1 but keep only the numeric columns. The aggregations applied to
# `grouped` below (std, mean, a max-minus-min helper) are undefined for the
# string column key2 and raise TypeError on pandas >= 2.0, which no longer
# drops such columns silently.
grouped = df.groupby("key1")[["data1", "data2"]]
def peak_range(s):
    """Return the spread of ``s``: its largest value minus its smallest.

    The type of ``s`` is printed on every call, which makes it visible what
    kind of object the caller hands to this function.
    """
    print(type(s))
    highest = s.max()
    lowest = s.min()
    return highest - lowest
grouped.agg(peak_range)
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
data1data2
key1
a72
b55
grouped.agg(["std","mean","sum",peak_range])
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
         data1                         data2
           std mean sum peak_range       std mean sum peak_range
key1
a     3.605551  5.0  15          7  1.000000  6.0  18          2
b     3.535534  4.5   9          5  3.535534  6.5  13          5
grouped.agg(["std","mean","sum",("range", peak_range)])
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
data1data2
stdmeansumrangestdmeansumrange
key1
a3.6055515.01571.0000006.0182
b3.5355344.5953.5355346.5135
d = {"data1":"mean",
    "data2":"sum"}
grouped.agg(d)
data1data2
key1
a5.018
b4.513
d = {"data1":["mean",("range", peak_range)],
    "data2":"sum"}
grouped.agg(d)
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
data1data2
meanrangesum
key1
a5.0718
b4.5513
grouped.agg(d).reset_index()
<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
key1data1data2
meanrangesum
0a5.0718
1b4.5513
df = pd.DataFrame({"key1":["a","a","b","b","a"],
                  "key2":["one","two","one","two","one"],
                  "data1":np.random.randint(1,10,5),
                  "data2":np.random.randint(1,10,5)})
df
   data1  data2 key1 key2
0      6      2    a  one
1      1      4    a  two
2      1      7    b  one
3      6      4    b  two
4      4      7    a  one
# Per-group means, one row per key1 value, with every column renamed to
# "mean_<column>". The numeric columns are selected explicitly because
# averaging the string column key2 raises TypeError on pandas >= 2.0.
kl_mean = df.groupby("key1")[["data1", "data2"]].mean().add_prefix("mean_")
kl_mean
mean_data1mean_data2
key1
a3.6666674.333333
b3.5000005.500000
pd.merge(df,kl_mean, left_on="key1", right_index=True)
data1data2key1key2mean_data1mean_data2
062aone3.6666674.333333
114atwo3.6666674.333333
447aone3.6666674.333333
217bone3.5000005.500000
364btwo3.5000005.500000
# Broadcast each key1 group's mean back onto the original rows, so the result
# is indexed like df rather than by group. The numeric columns are selected
# explicitly (the string column key2 cannot be averaged), and the "mean"
# string alias replaces np.mean, which newer pandas deprecates as a
# transform argument.
kl_mean = df.groupby("key1")[["data1", "data2"]].transform("mean").add_prefix("mean_")
kl_mean
mean_data1mean_data2
03.6666674.333333
13.6666674.333333
23.5000005.500000
33.5000005.500000
43.6666674.333333
df[kl_mean.columns] = kl_mean
df
data1data2key1key2mean_data1mean_data2
062aone3.6666674.333333
114atwo3.6666674.333333
217bone3.5000005.500000
364btwo3.5000005.500000
447aone3.6666674.333333
# 5x5 frame of random integers in [1, 10); rows are labelled with names and
# columns with the letters a-e.
df = pd.DataFrame(
    data=np.random.randint(1, 10, size=(5, 5)),
    index=["alice", "bob", "candy", "dark", "emily"],
    columns=list("abcde"),
)
df
       a  b  c  d  e
alice  9  8  5  4  2
bob    2  8  3  3  6
candy  9  4  4  4  8
dark   7  6  8  1  7
emily  2  4  5  7  1
def demean(s):
    """Return ``s`` shifted so that its mean is zero (``s`` minus ``s.mean()``)."""
    center = s.mean()
    return s - center
key = ["one","one","two","one","two"]
demeaned = df.groupby(key).transform(demean)
demeaned
         a         b         c         d    e
alice  3.0  0.666667 -0.333333  1.333333 -3.0
bob   -4.0  0.666667 -2.333333  0.333333  1.0
candy  3.5  0.000000 -0.500000 -1.500000  3.5
dark   1.0 -1.333333  2.666667 -1.666667  2.0
emily -3.5  0.000000  0.500000  1.500000 -3.5
# Eight states; the first four are labelled "east" and the last four "west".
states = ["ohio", "new york", "vermont", "florida",
          "oregon", "nevada", "california", "idaho"]
group_key = ["east"] * 4 + ["west"] * 4
# NOTE(review): np.random.randint(8) is called without a size, so it returns a
# single integer in [0, 8) that pd.Series repeats for every state. If distinct
# values per state were intended, pass size=len(states) -- confirm intent.
# The Series is created as float64 up front so the NaN assignment below does
# not rely on an implicit int64 -> float64 upcast, which newer pandas
# deprecates.
data = pd.Series(np.random.randint(8), index=states, dtype="float64")
# Mark three states as missing so there is something to fill in later.
data[["vermont", "nevada", "idaho"]] = np.nan
data
ohio          7.0
new york      7.0
vermont       NaN
florida       7.0
oregon        7.0
nevada        NaN
california    7.0
idaho         NaN
dtype: float64
data.groupby(group_key).mean()
east    7.0
west    7.0
dtype: float64
# Replace each missing value with the mean of its own east/west group.
# transform() always returns a result aligned to the original index, whereas
# apply() on pandas >= 2.0 prepends the group key and yields a MultiIndex.
data.groupby(group_key).transform(lambda g: g.fillna(g.mean()))
ohio          7.0
new york      7.0
vermont       7.0
florida       7.0
oregon        7.0
nevada        7.0
california    7.0
idaho         7.0
dtype: float64
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值