11.pandas高级--数据拆分

最新推荐文章于 2024-08-04 19:55:05 发布

ge小琦

最新推荐文章于 2024-08-04 19:55:05 发布

阅读量1.7k

点赞数

本文链接：https://blog.csdn.net/weixin_48622025/article/details/108164423

版权

数据拆分

pd.cut()：按指定的区间（值域）分箱，再结合 value_counts() 统计每个区间内的数量。
pd.qcut()：按分位数分箱，使每个箱内的数量大致相等，由此求出区间边界；同样可结合 value_counts() 查看数量。

pd.cut()

pd.cut(
x, //被分割的值的对象
bins, //分箱可以是数字也可以是list-like的分箱
right: bool = True, //默认右边闭合
labels=None, //给每个区间取别名
retbins: bool = False, //是否同时返回分箱边界数组；为 True 时返回 (分箱结果, bins) 元组
precision: int = 3, //默认精确小数点3位
include_lowest: bool = False, //分割区间默认不包含最小值，True则包含
duplicates: str = 'raise' / 'drop', //bins 边界有重复值时：'raise' 抛出 ValueError，'drop' 去掉重复的边界
ordered: bool = True, //返回的分类是否有序；ordered=False 时必须提供 labels
)

# 准备一个数据，加年龄
bins=[18,40,60,100,801]
ages = [16,20,24,28,30,38,40,44,47,54,56,61,77,88,99,800]
# 按照学过的value_counts()
Series(ages).value_counts(bins=bins)

(17.999, 40.0]    6
(60.0, 100.0]     4
(40.0, 60.0]      4
(100.0, 801.0]    1
dtype: int64

# pd.cut()
pd.cut(ages,bins=bins)

[NaN, (18.0, 40.0], (18.0, 40.0], (18.0, 40.0], (18.0, 40.0], ..., (60, 100], (60, 100], (60, 100], (60, 100], (100, 801]]
Length: 16
Categories (4, interval[int64]): [(18, 40] < (40, 60] < (60, 100] < (100, 801]]

pd.cut(ages,bins=bins).value_counts(dropna=False)

(18.0, 40.0]      6
(40.0, 60.0]      4
(60.0, 100.0]     4
(100.0, 801.0]    1
NaN               1
dtype: int64

pd.cut(ages,bins=bins,right=False)

[NaN, [18.0, 40.0), [18.0, 40.0), [18.0, 40.0), [18.0, 40.0), ..., [60, 100), [60, 100), [60, 100), [60, 100), [100, 801)]
Length: 16
Categories (4, interval[int64]): [[18, 40) < [40, 60) < [60, 100) < [100, 801)]

# labels 参数，给每个区间取别名
pd.cut(ages,bins=bins,right=False,labels=['青年','中年','老年','神仙'])

[NaN, '青年', '青年', '青年', '青年', ..., '老年', '老年', '老年', '老年', '神仙']
Length: 16
Categories (4, object): ['青年' < '中年' < '老年' < '神仙']

# 好处用值统计显示更直观
# labels 参数，给每个区间取别名
pd.cut(ages,bins=bins,right=False,labels=['青年','中年','老年','神仙']).value_counts()

青年    5
中年    5
老年    4
神仙    1
dtype: int64

pd.cut(ages,bins=bins,right=False,labels=['青年','中年','老年','神仙']).value_counts().plot(kind='bar')

在这里插入图片描述

# retbins 参数：为 True 时返回 (分箱结果, 分箱边界数组) 的元组
pd.cut(ages,4,retbins=True)

([(15.216, 212.0], (15.216, 212.0], (15.216, 212.0], (15.216, 212.0], (15.216, 212.0], ..., (15.216, 212.0], (15.216, 212.0], (15.216, 212.0], (15.216, 212.0], (604.0, 800.0]]
 Length: 16
 Categories (4, interval[float64]): [(15.216, 212.0] < (212.0, 408.0] < (408.0, 604.0] < (604.0, 800.0]],
 array([ 15.216, 212.   , 408.   , 604.   , 800.   ]))

## 使用随机数进行分割
arr=np.random.randn(20)
arr

array([ 0.4079678 ,  0.95231295,  0.96809027, -0.2305546 ,  1.66093645,
        0.45033524,  0.6203811 , -0.31276884,  1.16308805, -0.06057636,
       -0.12025284, -0.1152462 , -0.4632314 , -1.23661508,  0.6007289 ,
        0.18734886,  1.68391586,  0.23560162, -2.73195893, -1.38384434])

# precision:int=3,//默认精确小数点后三位
pd.cut(arr,4,precision=3)

[(-0.524, 0.58], (0.58, 1.684], (0.58, 1.684], (-0.524, 0.58], (0.58, 1.684], ..., (-0.524, 0.58], (0.58, 1.684], (-0.524, 0.58], (-2.736, -1.628], (-1.628, -0.524]]
Length: 20
Categories (4, interval[float64]): [(-2.736, -1.628] < (-1.628, -0.524] < (-0.524, 0.58] < (0.58, 1.684]]

# 精确5位
pd.cut(arr,4,precision=5)

[(-0.52402, 0.57995], (0.57995, 1.68392], (0.57995, 1.68392], (-0.52402, 0.57995], (0.57995, 1.68392], ..., (-0.52402, 0.57995], (0.57995, 1.68392], (-0.52402, 0.57995], (-2.73637, -1.62799], (-1.62799, -0.52402]]
Length: 20
Categories (4, interval[float64]): [(-2.73637, -1.62799] < (-1.62799, -0.52402] < (-0.52402, 0.57995] < (0.57995, 1.68392]]

# include_lowest: bool = False
bins

[18, 40, 60, 100, 801]

ages

[16, 20, 24, 28, 30, 38, 40, 44, 47, 54, 56, 61, 77, 88, 99, 800]

pd.cut(ages,bins)

[NaN, (18.0, 40.0], (18.0, 40.0], (18.0, 40.0], (18.0, 40.0], ..., (60, 100], (60, 100], (60, 100], (60, 100], (100, 801]]
Length: 16
Categories (4, interval[int64]): [(18, 40] < (40, 60] < (60, 100] < (100, 801]]

pd.cut(ages,bins,include_lowest = True)

[NaN, (17.999, 40.0], (17.999, 40.0], (17.999, 40.0], (17.999, 40.0], ..., (60.0, 100.0], (60.0, 100.0], (60.0, 100.0], (60.0, 100.0], (100.0, 801.0]]
Length: 16
Categories (4, interval[float64]): [(17.999, 40.0] < (40.0, 60.0] < (60.0, 100.0] < (100.0, 801.0]]

duplicates: str = 'raise' / 'drop'

# 造一个特殊数据来分割
arr2=[1,2,3,4,5]*4
pd.cut(arr2,bins=[1,2,2,3,4,5],duplicates='drop',include_lowest = True)

[(0.999, 2.0], (0.999, 2.0], (2.0, 3.0], (3.0, 4.0], (4.0, 5.0], ..., (0.999, 2.0], (0.999, 2.0], (2.0, 3.0], (3.0, 4.0], (4.0, 5.0]]
Length: 20
Categories (4, interval[float64]): [(0.999, 2.0] < (2.0, 3.0] < (3.0, 4.0] < (4.0, 5.0]]

pd.qcut()

pd.qcut(
x,
q, //可以是整数等分，也可以是[0, 0.1, 0.3, 0.6, 1] 比例
labels=None,
retbins: bool = False,
precision: int = 3,
duplicates: str = 'raise',
)

# 随机100个人的年龄
array=np.random.randint(1,101,100)
print(array.max(),array.min())
pd.qcut(array,4)

100 1
[(0.999, 29.5], (59.5, 74.25], (0.999, 29.5], (0.999, 29.5], (59.5, 74.25], ..., (59.5, 74.25], (59.5, 74.25], (74.25, 100.0], (74.25, 100.0], (29.5, 59.5]]
Length: 100
Categories (4, interval[float64]): [(0.999, 29.5] < (29.5, 59.5] < (59.5, 74.25] < (74.25, 100.0]]

pd.qcut(array,4).value_counts()

(0.999, 29.5]     25
(29.5, 59.5]      25
(59.5, 74.25]     25
(74.25, 100.0]    25
dtype: int64

pd.qcut(array,4,retbins=True)[-1]

array([  1.  ,  29.5 ,  59.5 ,  74.25, 100.  ])

pd.qcut(array,q=[0, 0.1, 0.3, 0.6, 1]).value_counts()

(0.999, 12.8]    10
(12.8, 34.0]     21
(34.0, 68.0]     30
(68.0, 100.0]    39
dtype: int64

检查和过滤异常值

# 假设统计马拉松选手的年龄 18-60
age=Series(np.random.randint(10,70,3000))
age

0       33
1       59
2       11
3       63
4       27
        ..
2995    33
2996    56
2997    14
2998    23
2999    12
Length: 3000, dtype: int32

age.describe()

count    3000.000000
mean       39.600333
std        17.397610
min        10.000000
25%        25.000000
50%        40.000000
75%        55.000000
max        69.000000
dtype: float64

18-60有多少人

pd.cut(age,[18,60]).count()

pd.cut(age,[18,60],include_lowest=False)

0                NaN
1                NaN
2       (18.0, 60.0]
3       (18.0, 60.0]
4       (18.0, 60.0]
            ...     
2995    (18.0, 60.0]
2996    (18.0, 60.0]
2997    (18.0, 60.0]
2998             NaN
2999    (18.0, 60.0]
Length: 3000, dtype: category
Categories (1, interval[int64]): [(18, 60]]

pd.cut(age,[18,60],include_lowest=True).value_counts()

(17.999, 60.0]    2138
dtype: int64

age.apply(lambda x:x if x>=18 and x<=60 else NA).count()

age.apply(lambda x:x if 18<=x<=60 else NA).count()

age.value_counts(bins=[18,60])

(17.999, 60.0]    2138
dtype: int64

# age[age>=18][age[age>=18]<=60]

# Step 1: keep only the ages that are at least 18.
a18 = age[age >= 18]
# Step 2: from that subset, keep the ages that are at most 60.
# a18 already holds age[age >= 18], so the mask is built from it directly.
a60 = a18[a18 <= 60]
a60

2       57
3       53
4       32
5       44
7       28
        ..
2993    52
2995    44
2996    49
2997    23
2999    45
Length: 2138, dtype: int32

# 假设参赛人数就是3000人，小于18替换成18，大于60替换成60
age2=age.copy()
def a(x):
    """Clamp a single age into the inclusive range 18..60.

    Values below 18 come back as 18, values above 60 come back as 60, and
    anything else is returned unchanged.
    """
    if x > 60:
        return 60
    return 18 if x < 18 else x
age2.apply(a)

0       60
1       60
2       57
3       53
4       32
        ..
2995    44
2996    49
2997    23
2998    18
2999    45
Length: 3000, dtype: int64

age2[age2<18]=18
age2[age2>60]=60
age2

0       60
1       60
2       57
3       53
4       32
        ..
2995    44
2996    49
2997    23
2998    18
2999    45
Length: 3000, dtype: int32

age.replace(np.arange(10,18),18).replace(np.arange(60,70),60)

0       33
1       59
2       18
3       60
4       27
        ..
2995    33
2996    56
2997    18
2998    23
2999    18
Length: 3000, dtype: int32

age.apply(lambda x:18 if x<18 else x).apply(lambda x:60 if x>60 else x)

0       33
1       59
2       18
3       60
4       27
        ..
2995    33
2996    56
2997    18
2998    23
2999    18
Length: 3000, dtype: int64

# 创建一个数据
data=DataFrame(np.random.randn(1000,4),columns=list('ABCD'))
data

A	B	C	D
0	1.248690	0.590385	1.110154	0.464644
1	0.861221	0.636173	-0.275387	-0.837645
2	-0.346336	0.656188	-0.896620	-1.685180
3	0.616638	1.568438	0.118372	0.461475
4	-0.683593	-1.541776	-0.739948	-0.813597
...	...	...	...	...
995	1.002076	0.170804	-0.657350	-0.280548
996	0.223570	0.572386	0.440916	0.227563
997	0.300424	-0.297029	-0.853053	-0.221811
998	0.882274	-0.295493	-0.028093	-0.586357
999	0.602469	-1.618662	-0.361619	1.454978
1000 rows × 4 columns

# 假设绝对值大于3的都是异常值
data.describe()

A	B	C	D
count	1000.000000	1000.000000	1000.000000	1000.000000
mean	0.029923	0.016552	0.028702	-0.013518
std	0.972445	0.985388	1.004089	1.004416
min	-2.875057	-2.812322	-2.887780	-3.249605
25%	-0.590531	-0.703905	-0.626383	-0.723978
50%	0.025684	0.002545	0.016045	-0.005166
75%	0.689000	0.702462	0.692086	0.629799
max	2.841240	3.260667	3.115521	3.876988

# 取出一列的数据
data.A[data.A.abs()>3]

Series([], Name: A, dtype: float64)

data.B[data.B.abs()>3]

213    3.260667
Name: B, dtype: float64

data.C[data.C.abs()>3]

152    3.115521
974    3.035219
Name: C, dtype: float64

# 我们还可以直接用 DataFrame 与 3 比较
np.abs(data)>3

A	B	C	D
0	False	False	False	False
1	False	False	False	False
2	False	False	False	False
3	False	False	False	False
4	False	False	False	False
...	...	...	...	...
995	False	False	False	False
996	False	False	False	False
997	False	False	False	False
998	False	False	False	False
999	False	False	False	False
1000 rows × 4 columns

# axis=0 按列判断：某一列中只要有一个 True，该列的结果就是 True
(np.abs(data)>3).any(axis=0)

A    False
B     True
C     True
D     True
dtype: bool

data[(np.abs(data)>3).any(axis=1)]

A	B	C	D
47	0.242115	-0.370446	-1.052771	3.087065
73	0.642316	0.177110	0.461608	-3.249605
152	-0.375821	0.734654	3.115521	2.056550
156	0.553986	-2.763215	-1.834051	3.224172
171	-0.629950	-1.143192	-1.018008	3.876988
213	-0.046400	3.260667	-0.944473	-1.954977
974	1.922177	-0.843981	3.035219	1.146074

# Cap outliers: any value whose absolute value exceeds 3 becomes +3 or -3,
# with np.sign() supplying the sign; every other value is kept unchanged.
# The multiplier must be 3 -- the earlier `np.sign(x)*1` turned outliers into
# +1/-1, which contradicts the stated goal.
# DataFrame.where keeps entries where the condition holds and takes the
# replacement elsewhere, so this is vectorised (no per-element lambda).
# NOTE(review): the outputs printed after this snippet appear to come from the
# old `*1` run (max/min stay inside the ±3 bounds instead of hitting them) --
# re-run to refresh them.
result = data.where(np.abs(data) <= 3, np.sign(data) * 3)
result

A	B	C	D
0	1.248690	0.590385	1.110154	0.464644
1	0.861221	0.636173	-0.275387	-0.837645
2	-0.346336	0.656188	-0.896620	-1.685180
3	0.616638	1.568438	0.118372	0.461475
4	-0.683593	-1.541776	-0.739948	-0.813597
...	...	...	...	...
995	1.002076	0.170804	-0.657350	-0.280548
996	0.223570	0.572386	0.440916	0.227563
997	0.300424	-0.297029	-0.853053	-0.221811
998	0.882274	-0.295493	-0.028093	-0.586357
999	0.602469	-1.618662	-0.361619	1.454978
1000 rows × 4 columns

result.describe()

A	B	C	D
count	1000.000000	1000.000000	1000.000000	1000.000000
mean	0.029923	0.014292	0.024551	-0.018457
std	0.972445	0.980519	0.995731	0.983431
min	-2.875057	-2.812322	-2.887780	-2.893340
25%	-0.590531	-0.703905	-0.626383	-0.723978
50%	0.025684	0.002545	0.016045	-0.005166
75%	0.689000	0.702462	0.692086	0.629799
max	2.841240	2.974342	2.996673	2.944671

# 如果某行中存在绝对值大于3的值，则把该行所有列的值都换成 3 或 -3（符号与原值相同）
# 注意：这会改动整行；若只想替换异常值本身，可用 data[np.abs(data)>3]=np.sign(data)*3
data[(np.abs(data)>3).any(axis=1)]

A	B	C	D
156	1.390349	3.574180	1.051485	0.802359
260	0.177381	3.059101	0.126408	-0.333252
349	-0.312206	-0.783507	-3.268029	-0.580108
420	-0.736174	-1.469933	-3.500177	0.985998
441	-3.101349	0.088352	0.639862	-0.947059
564	-1.361672	1.209893	0.745126	3.201756
708	-4.582029	0.253172	-0.349547	-0.766278
725	-0.501132	3.040466	0.041103	1.447682
764	-2.024533	-2.890702	3.176310	-0.678607
864	2.620795	0.434764	-3.037960	0.038708
920	0.813636	-0.899566	0.510295	-3.106799

# 每个数的符号位*3
np.sign(data)*3

A	B	C	D
0	3.0	3.0	3.0	3.0
1	3.0	3.0	-3.0	-3.0
2	-3.0	3.0	-3.0	-3.0
3	3.0	3.0	3.0	3.0
4	-3.0	-3.0	-3.0	-3.0
...	...	...	...	...
995	3.0	3.0	-3.0	-3.0
996	3.0	3.0	3.0	3.0
997	3.0	-3.0	-3.0	-3.0
998	3.0	-3.0	-3.0	-3.0
999	3.0	-3.0	-3.0	3.0
1000 rows × 4 columns

# 对筛选的数据进行替换，会只替换掉那些行索引匹配的值
data[(np.abs(data)>3).any(axis=1)]=np.sign(data)*3
data

A	B	C	D
0	1.248690	0.590385	1.110154	0.464644
1	0.861221	0.636173	-0.275387	-0.837645
2	-0.346336	0.656188	-0.896620	-1.685180
3	0.616638	1.568438	0.118372	0.461475
4	-0.683593	-1.541776	-0.739948	-0.813597
...	...	...	...	...
995	1.002076	0.170804	-0.657350	-0.280548
996	0.223570	0.572386	0.440916	0.227563
997	0.300424	-0.297029	-0.853053	-0.221811
998	0.882274	-0.295493	-0.028093	-0.586357
999	0.602469	-1.618662	-0.361619	1.454978
1000 rows × 4 columns

data.loc[512:515]

A	B	C	D
512	0.720544	0.043279	-2.819262	-2.893340
513	-1.476251	1.031785	0.206463	-0.874016
514	0.663384	-0.357441	-0.215252	-1.523059
515	0.058805	1.085078	1.090659	-0.840972