11.pandas高级--数据拆分

import numpy as np
import pandas as pd
from pandas import Series,DataFrame
from numpy import nan as NA
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']

数据拆分

pd.cut() 根据区间,求数量。结合value_counts()
pd.qcut() 根据数量,求区间。结合value_counts()

pd.cut()

pd.cut(
x, //被分割的值的对象
bins, //分箱可以是数字也可以是list-like的分箱
right: bool = True, //默认右边闭合
labels=None, //给每个区间取别名
retbins: bool = False, //返回一个区间数组
precision: int = 3, //默认精确小数点3位
include_lowest: bool = False, //分割区间默认不包含最小值,True则包含
duplicates: str = 'raise' / 'drop', //分箱边界有重复值时:'raise'(默认)报错,'drop' 去掉重复的边界
ordered: bool = True, //返回的分类是否有序;设为 False 时必须同时提供 labels
)

# 准备一个数据,加年龄
bins=[18,40,60,100,801]
ages = [16,20,24,28,30,38,40,44,47,54,56,61,77,88,99,800]
# 按照学过的value_counts()
Series(ages).value_counts(bins=bins)
(17.999, 40.0]    6
(60.0, 100.0]     4
(40.0, 60.0]      4
(100.0, 801.0]    1
dtype: int64
# pd.cut()
pd.cut(ages,bins=bins)
[NaN, (18.0, 40.0], (18.0, 40.0], (18.0, 40.0], (18.0, 40.0], ..., (60, 100], (60, 100], (60, 100], (60, 100], (100, 801]]
Length: 16
Categories (4, interval[int64]): [(18, 40] < (40, 60] < (60, 100] < (100, 801]]
pd.cut(ages,bins=bins).value_counts(dropna=False)
(18.0, 40.0]      6
(40.0, 60.0]      4
(60.0, 100.0]     4
(100.0, 801.0]    1
NaN               1
dtype: int64
pd.cut(ages,bins=bins,right=False)
[NaN, [18.0, 40.0), [18.0, 40.0), [18.0, 40.0), [18.0, 40.0), ..., [60, 100), [60, 100), [60, 100), [60, 100), [100, 801)]
Length: 16
Categories (4, interval[int64]): [[18, 40) < [40, 60) < [60, 100) < [100, 801)]
# labels 参数,给每个区间取别名
pd.cut(ages,bins=bins,right=False,labels=['青年','中年','老年','神仙'])
[NaN, '青年', '青年', '青年', '青年', ..., '老年', '老年', '老年', '老年', '神仙']
Length: 16
Categories (4, object): ['青年' < '中年' < '老年' < '神仙']
# 好处用值统计显示更直观
# labels 参数,给每个区间取别名
pd.cut(ages,bins=bins,right=False,labels=['青年','中年','老年','神仙']).value_counts()
青年    5
中年    5
老年    4
神仙    1
dtype: int64
pd.cut(ages,bins=bins,right=False,labels=['青年','中年','老年','神仙']).value_counts().plot(kind='bar')

在这里插入图片描述

# retbins 参数,True 时返回一个元组:(分箱结果, 区间边界数组)
pd.cut(ages,4,retbins=True)
([(15.216, 212.0], (15.216, 212.0], (15.216, 212.0], (15.216, 212.0], (15.216, 212.0], ..., (15.216, 212.0], (15.216, 212.0], (15.216, 212.0], (15.216, 212.0], (604.0, 800.0]]
 Length: 16
 Categories (4, interval[float64]): [(15.216, 212.0] < (212.0, 408.0] < (408.0, 604.0] < (604.0, 800.0]],
 array([ 15.216, 212.   , 408.   , 604.   , 800.   ]))
## 使用随机数进行分割
arr=np.random.randn(20)
arr
array([ 0.4079678 ,  0.95231295,  0.96809027, -0.2305546 ,  1.66093645,
        0.45033524,  0.6203811 , -0.31276884,  1.16308805, -0.06057636,
       -0.12025284, -0.1152462 , -0.4632314 , -1.23661508,  0.6007289 ,
        0.18734886,  1.68391586,  0.23560162, -2.73195893, -1.38384434])
# precision:int=3,//默认精确小数点后三位
pd.cut(arr,4,precision=3)
[(-0.524, 0.58], (0.58, 1.684], (0.58, 1.684], (-0.524, 0.58], (0.58, 1.684], ..., (-0.524, 0.58], (0.58, 1.684], (-0.524, 0.58], (-2.736, -1.628], (-1.628, -0.524]]
Length: 20
Categories (4, interval[float64]): [(-2.736, -1.628] < (-1.628, -0.524] < (-0.524, 0.58] < (0.58, 1.684]]
# 精确5位
pd.cut(arr,4,precision=5)
[(-0.52402, 0.57995], (0.57995, 1.68392], (0.57995, 1.68392], (-0.52402, 0.57995], (0.57995, 1.68392], ..., (-0.52402, 0.57995], (0.57995, 1.68392], (-0.52402, 0.57995], (-2.73637, -1.62799], (-1.62799, -0.52402]]
Length: 20
Categories (4, interval[float64]): [(-2.73637, -1.62799] < (-1.62799, -0.52402] < (-0.52402, 0.57995] < (0.57995, 1.68392]]
# include_lowest: bool = False
bins
[18, 40, 60, 100, 801]
ages
[16, 20, 24, 28, 30, 38, 40, 44, 47, 54, 56, 61, 77, 88, 99, 800]
pd.cut(ages,bins)
[NaN, (18.0, 40.0], (18.0, 40.0], (18.0, 40.0], (18.0, 40.0], ..., (60, 100], (60, 100], (60, 100], (60, 100], (100, 801]]
Length: 16
Categories (4, interval[int64]): [(18, 40] < (40, 60] < (60, 100] < (100, 801]]
pd.cut(ages,bins,include_lowest = True)
[NaN, (17.999, 40.0], (17.999, 40.0], (17.999, 40.0], (17.999, 40.0], ..., (60.0, 100.0], (60.0, 100.0], (60.0, 100.0], (60.0, 100.0], (100.0, 801.0]]
Length: 16
Categories (4, interval[float64]): [(17.999, 40.0] < (40.0, 60.0] < (60.0, 100.0] < (100.0, 801.0]]

duplicates: str = ‘raise’ / ‘drop’

# 造一个特殊数据来分割
arr2=[1,2,3,4,5]*4
pd.cut(arr2,bins=[1,2,2,3,4,5],duplicates='drop',include_lowest = True)
[(0.999, 2.0], (0.999, 2.0], (2.0, 3.0], (3.0, 4.0], (4.0, 5.0], ..., (0.999, 2.0], (0.999, 2.0], (2.0, 3.0], (3.0, 4.0], (4.0, 5.0]]
Length: 20
Categories (4, interval[float64]): [(0.999, 2.0] < (2.0, 3.0] < (3.0, 4.0] < (4.0, 5.0]]

pd.qcut()

pd.qcut(
x,
q, //可以是整数等分,也可以是[0, 0.1, 0.3, 0.6, 1] 比例
labels=None,
retbins: bool = False,
precision: int = 3,
duplicates: str = ‘raise’,
)

# 随机100个人的年龄
array=np.random.randint(1,101,100)
print(array.max(),array.min())
pd.qcut(array,4)
100 1
[(0.999, 29.5], (59.5, 74.25], (0.999, 29.5], (0.999, 29.5], (59.5, 74.25], ..., (59.5, 74.25], (59.5, 74.25], (74.25, 100.0], (74.25, 100.0], (29.5, 59.5]]
Length: 100
Categories (4, interval[float64]): [(0.999, 29.5] < (29.5, 59.5] < (59.5, 74.25] < (74.25, 100.0]]
pd.qcut(array,4).value_counts()
(0.999, 29.5]     25
(29.5, 59.5]      25
(59.5, 74.25]     25
(74.25, 100.0]    25
dtype: int64
pd.qcut(array,4,retbins=True)[-1]
array([  1.  ,  29.5 ,  59.5 ,  74.25, 100.  ])
pd.qcut(array,q=[0, 0.1, 0.3, 0.6, 1]).value_counts()
(0.999, 12.8]    10
(12.8, 34.0]     21
(34.0, 68.0]     30
(68.0, 100.0]    39
dtype: int64

检查和过滤异常值

# 假设统计马拉松选手的年龄 18-60
age=Series(np.random.randint(10,70,3000))
age
0       33
1       59
2       11
3       63
4       27
        ..
2995    33
2996    56
2997    14
2998    23
2999    12
Length: 3000, dtype: int32
age.describe()
count    3000.000000
mean       39.600333
std        17.397610
min        10.000000
25%        25.000000
50%        40.000000
75%        55.000000
max        69.000000
dtype: float64
18-60有多少人
pd.cut(age,[18,60]).count()
2098
pd.cut(age,[18,60],include_lowest=False)
0                NaN
1                NaN
2       (18.0, 60.0]
3       (18.0, 60.0]
4       (18.0, 60.0]
            ...     
2995    (18.0, 60.0]
2996    (18.0, 60.0]
2997    (18.0, 60.0]
2998             NaN
2999    (18.0, 60.0]
Length: 3000, dtype: category
Categories (1, interval[int64]): [(18, 60]]
pd.cut(age,[18,60],include_lowest=True).value_counts()
(17.999, 60.0]    2138
dtype: int64
age.apply(lambda x:x if x>=18 and x<=60 else NA).count()
2138
age.apply(lambda x:x if 18<=x<=60 else NA).count()
2138
age.value_counts(bins=[18,60])
(17.999, 60.0]    2138
dtype: int64
# 等价的一行写法: age[age>=18][age[age>=18]<=60]
# 先取大于等于18
a18=age[age>=18]
# 取小于等于60
a60=a18[age[age>=18]<=60]
a60
2       57
3       53
4       32
5       44
7       28
        ..
2993    52
2995    44
2996    49
2997    23
2999    45
Length: 2138, dtype: int32
# 假设参赛人数就是3000人,小于18替换成18,大于60替换成60
age2=age.copy()
def a(x, low=18, high=60):
    """Clamp ``x`` to the closed range ``[low, high]``.

    Written for element-wise use with ``Series.apply``. The default bounds
    keep the original 18-60 behaviour, so ``a(x)`` is unchanged for callers.

    Args:
        x: Value to clamp.
        low: Lower bound; anything below it is replaced by ``low``.
        high: Upper bound; anything above it is replaced by ``high``.

    Returns:
        ``low`` if ``x < low``, ``high`` if ``x > high``, otherwise ``x``
        itself, unchanged.
    """
    if x < low:
        return low
    if x > high:
        return high
    # Neither bound was violated, so the value passes through untouched.
    return x
age2.apply(a)
0       60
1       60
2       57
3       53
4       32
        ..
2995    44
2996    49
2997    23
2998    18
2999    45
Length: 3000, dtype: int64
age2[age2<18]=18
age2[age2>60]=60
age2
0       60
1       60
2       57
3       53
4       32
        ..
2995    44
2996    49
2997    23
2998    18
2999    45
Length: 3000, dtype: int32
age.replace(np.arange(10,18),18).replace(np.arange(60,70),60)
0       33
1       59
2       18
3       60
4       27
        ..
2995    33
2996    56
2997    18
2998    23
2999    18
Length: 3000, dtype: int32
age.apply(lambda x:18 if x<18 else x).apply(lambda x:60 if x>60 else x)
0       33
1       59
2       18
3       60
4       27
        ..
2995    33
2996    56
2997    18
2998    23
2999    18
Length: 3000, dtype: int64
# 创建一个数据
data=DataFrame(np.random.randn(1000,4),columns=list('ABCD'))
data
A	B	C	D
0	1.248690	0.590385	1.110154	0.464644
1	0.861221	0.636173	-0.275387	-0.837645
2	-0.346336	0.656188	-0.896620	-1.685180
3	0.616638	1.568438	0.118372	0.461475
4	-0.683593	-1.541776	-0.739948	-0.813597
...	...	...	...	...
995	1.002076	0.170804	-0.657350	-0.280548
996	0.223570	0.572386	0.440916	0.227563
997	0.300424	-0.297029	-0.853053	-0.221811
998	0.882274	-0.295493	-0.028093	-0.586357
999	0.602469	-1.618662	-0.361619	1.454978
1000 rows × 4 columns
# 假设绝对值大于3的都是异常值
data.describe()
A	B	C	D
count	1000.000000	1000.000000	1000.000000	1000.000000
mean	0.029923	0.016552	0.028702	-0.013518
std	0.972445	0.985388	1.004089	1.004416
min	-2.875057	-2.812322	-2.887780	-3.249605
25%	-0.590531	-0.703905	-0.626383	-0.723978
50%	0.025684	0.002545	0.016045	-0.005166
75%	0.689000	0.702462	0.692086	0.629799
max	2.841240	3.260667	3.115521	3.876988
# 取出一列的数据
data.A[data.A.abs()>3]
Series([], Name: A, dtype: float64)
data.B[data.B.abs()>3]
213    3.260667
Name: B, dtype: float64
data.C[data.C.abs()>3]
152    3.115521
974    3.035219
Name: C, dtype: float64
# 我们还可以直接用 DataFrame 与 3 比较
np.abs(data)>3
A	B	C	D
0	False	False	False	False
1	False	False	False	False
2	False	False	False	False
3	False	False	False	False
4	False	False	False	False
...	...	...	...	...
995	False	False	False	False
996	False	False	False	False
997	False	False	False	False
998	False	False	False	False
999	False	False	False	False
1000 rows × 4 columns
# any(axis=0) 按列判断:某一列只要有一个 True,该列的结果就是 True
(np.abs(data)>3).any(axis=0)
A    False
B     True
C     True
D     True
dtype: bool
data[(np.abs(data)>3).any(axis=1)]
A	B	C	D
47	0.242115	-0.370446	-1.052771	3.087065
73	0.642316	0.177110	0.461608	-3.249605
152	-0.375821	0.734654	3.115521	2.056550
156	0.553986	-2.763215	-1.834051	3.224172
171	-0.629950	-1.143192	-1.018008	3.876988
213	-0.046400	3.260667	-0.944473	-1.954977
974	1.922177	-0.843981	3.035219	1.146074
# Cap every outlier cell: a value with |x| > 3 becomes +3 or -3, keeping its
# sign via np.sign(); every other cell is left unchanged.
# NOTE(review): the result / describe() output printed after this statement
# appears to have been captured with the earlier `*1` multiplier (its min/max
# lie strictly inside +/-3) -- re-run to refresh it.
result=data.applymap(lambda x:np.sign(x)*3 if np.abs(x)>3 else x)
result
A	B	C	D
0	1.248690	0.590385	1.110154	0.464644
1	0.861221	0.636173	-0.275387	-0.837645
2	-0.346336	0.656188	-0.896620	-1.685180
3	0.616638	1.568438	0.118372	0.461475
4	-0.683593	-1.541776	-0.739948	-0.813597
...	...	...	...	...
995	1.002076	0.170804	-0.657350	-0.280548
996	0.223570	0.572386	0.440916	0.227563
997	0.300424	-0.297029	-0.853053	-0.221811
998	0.882274	-0.295493	-0.028093	-0.586357
999	0.602469	-1.618662	-0.361619	1.454978
1000 rows × 4 columns
result.describe()
A	B	C	D
count	1000.000000	1000.000000	1000.000000	1000.000000
mean	0.029923	0.014292	0.024551	-0.018457
std	0.972445	0.980519	0.995731	0.983431
min	-2.875057	-2.812322	-2.887780	-2.893340
25%	-0.590531	-0.703905	-0.626383	-0.723978
50%	0.025684	0.002545	0.016045	-0.005166
75%	0.689000	0.702462	0.692086	0.629799
max	2.841240	2.974342	2.996673	2.944671
# 如果某行有绝对值大于3的,全部换成3或-3
data[(np.abs(data)>3).any(axis=1)]
A	B	C	D
156	1.390349	3.574180	1.051485	0.802359
260	0.177381	3.059101	0.126408	-0.333252
349	-0.312206	-0.783507	-3.268029	-0.580108
420	-0.736174	-1.469933	-3.500177	0.985998
441	-3.101349	0.088352	0.639862	-0.947059
564	-1.361672	1.209893	0.745126	3.201756
708	-4.582029	0.253172	-0.349547	-0.766278
725	-0.501132	3.040466	0.041103	1.447682
764	-2.024533	-2.890702	3.176310	-0.678607
864	2.620795	0.434764	-3.037960	0.038708
920	0.813636	-0.899566	0.510295	-3.106799
# 每个数的符号位*3
np.sign(data)*3
A	B	C	D
0	3.0	3.0	3.0	3.0
1	3.0	3.0	-3.0	-3.0
2	-3.0	3.0	-3.0	-3.0
3	3.0	3.0	3.0	3.0
4	-3.0	-3.0	-3.0	-3.0
...	...	...	...	...
995	3.0	3.0	-3.0	-3.0
996	3.0	3.0	3.0	3.0
997	3.0	-3.0	-3.0	-3.0
998	3.0	-3.0	-3.0	-3.0
999	3.0	-3.0	-3.0	3.0
1000 rows × 4 columns
# 对筛选的数据进行替换,会只替换掉那些行索引匹配的值
data[(np.abs(data)>3).any(axis=1)]=np.sign(data)*3
data
A	B	C	D
0	1.248690	0.590385	1.110154	0.464644
1	0.861221	0.636173	-0.275387	-0.837645
2	-0.346336	0.656188	-0.896620	-1.685180
3	0.616638	1.568438	0.118372	0.461475
4	-0.683593	-1.541776	-0.739948	-0.813597
...	...	...	...	...
995	1.002076	0.170804	-0.657350	-0.280548
996	0.223570	0.572386	0.440916	0.227563
997	0.300424	-0.297029	-0.853053	-0.221811
998	0.882274	-0.295493	-0.028093	-0.586357
999	0.602469	-1.618662	-0.361619	1.454978
1000 rows × 4 columns
data.loc[512:515]
A	B	C	D
512	0.720544	0.043279	-2.819262	-2.893340
513	-1.476251	1.031785	0.206463	-0.874016
514	0.663384	-0.357441	-0.215252	-1.523059
515	0.058805	1.085078	1.090659	-0.840972

Series.apply(某个函数)

DataFrame.applymap(某个函数)(pandas 2.1 起已弃用,改名为 DataFrame.map)

都是对单个值处理的

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值