目录
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
from numpy import nan as NA
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
数据拆分
pd.cut() 按给定的区间边界(bins)分箱,再结合 value_counts() 统计每个区间内的数量。
pd.qcut() 按分位数分箱,使每个区间内的数量大致相等,由此得到区间边界;同样可结合 value_counts() 查看。
pd.cut()
pd.cut(
x, # 被分割的一维数据(数组、列表或 Series)
bins, # 分箱依据:整数表示等宽分箱的个数,list-like 表示各区间的边界
right: bool = True, # 区间是否右闭合,默认右闭合,即 (a, b]
labels=None, # 给每个区间取别名;传 False 时只返回区间的整数编号
retbins: bool = False, # 是否同时返回分箱边界数组
precision: int = 3, # 区间边界存储和显示的小数精度,默认3位
include_lowest: bool = False, # 第一个区间是否包含左端点(最小值),默认不包含,True则包含
duplicates: str = 'raise', # 分箱边界有重复时的处理方式:'raise' 报错,'drop' 去掉重复的边界
ordered: bool = True, # 返回的分类(labels)是否有序
)
# 准备一组示例数据:年龄(ages)以及分箱边界(bins)
bins=[18,40,60,100,801]
ages = [16,20,24,28,30,38,40,44,47,54,56,61,77,88,99,800]
# 先用之前学过的 value_counts(bins=...) 按区间统计数量
# 注意:value_counts 的 bins 会包含最左端点,所以第一个区间显示为 (17.999, 40.0];不落在任何区间内的 16 不计入结果
Series(ages).value_counts(bins=bins)
(17.999, 40.0] 6
(60.0, 100.0] 4
(40.0, 60.0] 4
(100.0, 801.0] 1
dtype: int64
# pd.cut():返回每个值所属的区间;不落在任何区间内的值(如 16)结果为 NaN
pd.cut(ages,bins=bins)
[NaN, (18.0, 40.0], (18.0, 40.0], (18.0, 40.0], (18.0, 40.0], ..., (60, 100], (60, 100], (60, 100], (60, 100], (100, 801]]
Length: 16
Categories (4, interval[int64]): [(18, 40] < (40, 60] < (60, 100] < (100, 801]]
pd.cut(ages,bins=bins).value_counts(dropna=False)
(18.0, 40.0] 6
(40.0, 60.0] 4
(60.0, 100.0] 4
(100.0, 801.0] 1
NaN 1
dtype: int64
pd.cut(ages,bins=bins,right=False)
[NaN, [18.0, 40.0), [18.0, 40.0), [18.0, 40.0), [18.0, 40.0), ..., [60, 100), [60, 100), [60, 100), [60, 100), [100, 801)]
Length: 16
Categories (4, interval[int64]): [[18, 40) < [40, 60) < [60, 100) < [100, 801)]
# labels 参数,给每个区间取别名
pd.cut(ages,bins=bins,right=False,labels=['青年','中年','老年','神仙'])
[NaN, '青年', '青年', '青年', '青年', ..., '老年', '老年', '老年', '老年', '神仙']
Length: 16
Categories (4, object): ['青年' < '中年' < '老年' < '神仙']
# 取别名的好处:结合 value_counts() 做值统计时,结果显示更直观
# 仍然用 labels 参数给每个区间取别名,再统计各别名下的数量
pd.cut(ages,bins=bins,right=False,labels=['青年','中年','老年','神仙']).value_counts()
青年 5
中年 5
老年 4
神仙 1
dtype: int64
pd.cut(ages,bins=bins,right=False,labels=['青年'