分组聚合
分组
groupby原理
import numpy as np
import pandas as pd
# Demo frame: 300 rows with a 0/1 "sex" code, a "class" id in 1..8 and five
# integer score columns in 0..150 (randint's upper bound is exclusive).
# No seed is set, so the values differ from run to run.
df = pd.DataFrame(
    data={
        "sex": np.random.randint(0, 2, size=300),
        "class": np.random.randint(1, 9, size=300),
        "Python": np.random.randint(0, 151, size=300),
        "Keras": np.random.randint(0, 151, size=300),
        "Tensorflow": np.random.randint(0, 151, size=300),
        "Java": np.random.randint(0, 151, size=300),
        "C++": np.random.randint(0, 151, size=300),
    }
)
# Replace the numeric sex codes with their text labels.
df["sex"] = df["sex"].map({0: "男", 1: "女"})


def _print_groups(grouped, restore=None):
    """Print the key and the data of every group in ``grouped``.

    ``restore`` is an optional callable applied to each group before it is
    printed. The column-wise groupings below use it to turn a group of the
    transposed frame back into the matching columns of ``df``.
    """
    for name, group in grouped:
        print("group_name:", name)
        print("group_data:", group if restore is None else restore(group))


def _as_df_columns(group):
    """Return the columns of ``df`` whose labels form the index of ``group``."""
    return df.loc[:, group.index]


# Group the rows by a single column; each group is a sub-frame of ``df``.
a = df.groupby(by="sex")
_print_groups(a)

# Same grouping, but only the "Python" column is kept in each group.
b = df.groupby(by=["sex"])["Python"]
_print_groups(b)

# Group by two keys at once; the group name is a (sex, class) pair.
c = df.groupby(by=["sex", "class"])["Python"]
_print_groups(c)

# Group a single Series by the values of another column.
d = df.Python.groupby(by=df["class"])
_print_groups(d)

# Column-wise grouping by dtype. ``groupby(..., axis=1)`` is deprecated since
# pandas 2.1, so the transposed frame is grouped instead; ``e`` therefore
# holds groups of rows of ``df.T`` (one row per original column), and each
# group is mapped back to the original columns before printing so the
# displayed data keeps its dtypes.
e = df.T.groupby(df.dtypes)
_print_groups(e, restore=_as_df_columns)

# Column-wise grouping through an explicit column -> group-label mapping.
# Columns missing from the mapping ("sex", "class") belong to no group.
f_dict = {
    "Python": "IT1",
    "Keras": "IT1",
    "Tensorflow": "IT2",
    "Java": "IT2",
    "C++": "IT2",
}
f = df.T.groupby(f_dict)
_print_groups(f, restore=_as_df_columns)
分组聚合——groupby
# Per-group mean of the non-key columns, one result row per "sex" value.
# The result is not stored; as a bare expression it is only visible when an
# interactive front end echoes it.
df.groupby("sex").mean()
# Same aggregation with one result row per ("sex", "class") combination.
df.groupby(["sex", "class"]).mean()
分组聚合——apply&transform
apply原理
transform原理
# Four sales records for one fruit, spread over two "year" values.
data = {
    "fruit": ["苹果", "苹果", "苹果", "苹果"],
    "year": [202209, 202209, 202208, 202208],
    "sales": [202202, 202201, 202204, 202203],
}
df = pd.DataFrame(data)

# Reduction: one row per (fruit, year) group holding that group's largest
# "sales" value. The built-in ``max`` reduction replaces ``apply(np.max)``,
# which pandas >= 2.1 flags with a FutureWarning for NumPy callables.
max_sales_by_group = df.groupby(by=["fruit", "year"])["sales"].max().reset_index()
max_sales_by_group  # bare expression: echoed only by an interactive front end

# Broadcast: ``transform`` returns a value for every original row, so the
# group maximum can be stored next to each record. The string name "max"
# selects pandas' own implementation without the deprecated NumPy aliasing.
df["max_sales"] = df.groupby(by=["fruit", "year"])["sales"].transform("max")
df
分组聚合——agg
agg原理
# Four records for one fruit over two "year" values, with two numeric
# columns ("sales" and "amount") to aggregate.
data = {
    "fruit": ["苹果", "苹果", "苹果", "苹果"],
    "year": [202209, 202209, 202208, 202208],
    "sales": [202202, 202201, 202204, 202203],
    "amount": [212201, 212202, 212204, 212205],
}
df = pd.DataFrame(data)

# Several reductions of one column at once. String names select pandas' own
# reductions and label the result columns "min", "max" and "count".
sales_stats = (
    df.groupby(by=["fruit", "year"])["sales"]
    .agg(["min", "max", "count"])
    .reset_index()
)
sales_stats  # bare expression: echoed only by an interactive front end

# Different reductions per column, each given a custom label through a
# (label, function) pair. The two columns must be selected with a list:
# selecting them with a tuple (``["sales", "amount"]`` without the inner
# brackets) raises ValueError on pandas >= 2.0.
sales_amount_stats = (
    df.groupby(by=["fruit", "year"])[["sales", "amount"]]
    .agg(
        {
            "sales": [("最大值", "max"), ("最小值", "min")],
            "amount": [("计数", "count"), ("平均值", "mean")],
        }
    )
    .reset_index()
)
sales_amount_stats
def normalize(x):
    """Min-max scale ``x`` so its smallest value maps to 0 and its largest to 1.

    ``x`` is any object offering ``min()``, ``max()`` and element-wise
    arithmetic, such as a pandas Series. The result has the same shape as
    ``x``.

    When every value is equal (including a single-element input) the range
    is zero and the plain formula would compute 0/0, i.e. NaN. In that case
    every value is mapped to 0.0 instead.
    """
    low = x.min()
    span = x.max() - low
    # Only a scalar range is tested against zero; inputs whose min/max are
    # not scalars (e.g. a whole DataFrame) keep the plain formula.
    if np.ndim(span) == 0 and span == 0:
        return (x - low).astype(float)
    return (x - low) / span
# Apply ``normalize`` to the "sales" values of each (fruit, year) group and
# store the per-row results in a new "normalize" column.
sales_by_group = df.groupby(by=["fruit", "year"])["sales"]
df["normalize"] = sales_by_group.transform(normalize)
# Bare expression: echoed only by an interactive front end.
df