cut
cut(
x,
bins,
right=True,
labels=None,
retbins=False,
precision=3,
include_lowest=False,
duplicates="raise",
)
参数 | 说明 |
---|---|
x | 待分类数据 |
bins | 分类方式:int(等宽区间的个数)、数组(区间边界)或 IntervalIndex |
right | 区间是否包含右端点(左开右闭),默认True |
labels | 给分类标签 |
retbins | 是否返回分类区间,默认False |
precision | 精度 |
include_lowest | 第一个区间是否包含左端点(仅在right=True时有效果),默认False |
duplicates | bins的区间边界有重复值时的处理方式,'raise'抛出异常, 'drop'删除重复的边界 |
简单示例
import string

import numpy as np
import pandas as pd
from pandas import DataFrame

# Demo: bin a column of random ages with pd.cut, first into equal-width
# intervals and then into hand-picked, labelled age groups.
SIZE = 10
# Pool of characters for the random names (ASCII letters and digits).
NAME_CHARS = list(string.ascii_letters + string.digits)

np.random.seed(147258)  # fixed seed so the printed output is reproducible
df = DataFrame()
# Random integer ages from 1 to 30 inclusive (randint's upper bound is exclusive).
df['age'] = np.random.randint(1, 31, size=SIZE)
# Random 4-character names. Built with np.random.choice because
# pd.util.testing.rands was deprecated and is not available in pandas 2.x.
df['name'] = [''.join(np.random.choice(NAME_CHARS, 4)) for _ in range(SIZE)]
# Passing an int asks pd.cut for that many equal-width intervals.
df['category'] = pd.cut(df['age'], 3)
print(df)
# Explicit bin edges give 4 right-closed intervals:
# (0, 5], (5, 13], (13, 18], (18, 30].
bins = [0, 5, 13, 18, 30]
# Assign one label per interval instead of the default interval notation.
df['category'] = pd.cut(df['age'], bins, labels=['infant', 'child', 'teenager', 'adult'])
print(df)
综合实例
我们经常会遇到需要把数值型数据按区间分组统计的情况,例如按销售额所在的区间统计人数和总销售额
import string

import numpy as np
import pandas as pd
from pandas import DataFrame

# Demo: bucket random sales amounts into left-closed intervals with pd.cut,
# then aggregate per bucket with groupby.
SIZE = 20
# Pool of characters for the random names (ASCII letters and digits).
NAME_CHARS = list(string.ascii_letters + string.digits)

np.random.seed(147258)  # fixed seed so the printed output is reproducible
df = DataFrame()
# Random integer amounts from 1000 to 10000 inclusive (upper bound is exclusive).
df['amount'] = np.random.randint(1000, 10001, size=SIZE)
# Random 4-character names. Built with np.random.choice because
# pd.util.testing.rands was deprecated and is not available in pandas 2.x.
df['name'] = [''.join(np.random.choice(NAME_CHARS, 4)) for _ in range(SIZE)]
# With right=False every interval is [left, right). The last edge is 10001
# rather than 10000 so that an amount of exactly 10000 still lands in the last
# bucket instead of becoming NaN and dropping out of the statistics below.
bins = [0, 3000, 5000, 8000, 10001]
labels = ['[0,3000)', '[3000,5000)', '[5000,8000)', '[8000,10000]']
# Left-closed, right-open intervals. include_lowest is not passed: it only
# matters for the first interval when right=True.
df['category'] = pd.cut(df['amount'], bins, labels=labels, right=False)
print(df)
# Group by category and count the people in each bucket. observed=False keeps
# buckets that have no rows and spells out the default that newer pandas
# versions warn about when grouping on a categorical column.
result = df.groupby('category', observed=False)['name'].count()
print(result)
# Group by category and compute both the head count and the total sales
# amount of each bucket.
result = df.groupby('category', observed=False).agg({"name": "count", "amount": "sum"})
print(result)