目录
import numpy as np
import pandas as pd
from pandas import Series,DataFrame
from numpy import nan as NA
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
数据拆分
pd.cut():按指定的区间(边界)分箱,各区间内的数量不一定相同;可结合 value_counts() 统计每个区间的数量
pd.qcut():按分位数分箱,使各区间内的数量尽量相同,区间边界由数据决定;可结合 value_counts() 查看每个区间的数量
pd.cut()
pd.cut(
x, //被分割的一维数组或Series
bins, //分箱:可以是整数(等宽分箱的个数),也可以是list-like的区间边界
right: bool = True, //区间默认右边闭合
labels=None, //给每个区间取别名
retbins: bool = False, //为True时同时返回区间边界数组
precision: int = 3, //区间边界的显示精度,默认保留小数点后3位
include_lowest: bool = False, //第一个区间默认不包含左端点(最小边界),True则包含
duplicates: str = 'raise' / 'drop', //区间边界有重复值时:'raise'报错,'drop'去掉重复的边界
ordered: bool = True, //生成的分类(labels)是否有序
)
# 准备一个数据,加年龄
bins=[18,40,60,100,801]
ages = [16,20,24,28,30,38,40,44,47,54,56,61,77,88,99,800]
# 按照学过的value_counts()
Series(ages).value_counts(bins=bins)
(17.999, 40.0] 6
(60.0, 100.0] 4
(40.0, 60.0] 4
(100.0, 801.0] 1
dtype: int64
# pd.cut()
pd.cut(ages,bins=bins)
[NaN, (18.0, 40.0], (18.0, 40.0], (18.0, 40.0], (18.0, 40.0], ..., (60, 100], (60, 100], (60, 100], (60, 100], (100, 801]]
Length: 16
Categories (4, interval[int64]): [(18, 40] < (40, 60] < (60, 100] < (100, 801]]
pd.cut(ages,bins=bins).value_counts(dropna=False)
(18.0, 40.0] 6
(40.0, 60.0] 4
(60.0, 100.0] 4
(100.0, 801.0] 1
NaN 1
dtype: int64
pd.cut(ages,bins=bins,right=False)
[NaN, [18.0, 40.0), [18.0, 40.0), [18.0, 40.0), [18.0, 40.0), ..., [60, 100), [60, 100), [60, 100), [60, 100), [100, 801)]
Length: 16
Categories (4, interval[int64]): [[18, 40) < [40, 60) < [60, 100) < [100, 801)]
# labels 参数,给每个区间取别名
pd.cut(ages,bins=bins,right=False,labels=['青年','中年','老年','神仙'])
[NaN, '青年', '青年', '青年', '青年', ..., '老年', '老年', '老年', '老年', '神仙']
Length: 16
Categories (4, object): ['青年' < '中年' < '老年' < '神仙']
# 好处用值统计显示更直观
# labels 参数,给每个区间取别名
pd.cut(ages,bins=bins,right=False,labels=['青年','中年','老年','神仙']).value_counts()
青年 5
中年 5
老年 4
神仙 1
dtype: int64
pd.cut(ages,bins=bins,right=False,labels=['青年','中年','老年','神仙']).value_counts().plot(kind='bar')
# retbins 参数,True 返回一个数组
pd.cut(ages,4,retbins=True)
([(15.216, 212.0], (15.216, 212.0], (15.216, 212.0], (15.216, 212.0], (15.216, 212.0], ..., (15.216, 212.0], (15.216, 212.0], (15.216, 212.0], (15.216, 212.0], (604.0, 800.0]]
Length: 16
Categories (4, interval[float64]): [(15.216, 212.0] < (212.0, 408.0] < (408.0, 604.0] < (604.0, 800.0]],
array([ 15.216, 212. , 408. , 604. , 800. ]))
## 使用随机数进行分割
arr=np.random.randn(20)
arr
array([ 0.4079678 , 0.95231295, 0.96809027, -0.2305546 , 1.66093645,
0.45033524, 0.6203811 , -0.31276884, 1.16308805, -0.06057636,
-0.12025284, -0.1152462 , -0.4632314 , -1.23661508, 0.6007289 ,
0.18734886, 1.68391586, 0.23560162, -2.73195893, -1.38384434])
# precision:int=3,//默认精确小数点后三位
pd.cut(arr,4,precision=3)
[(-0.524, 0.58], (0.58, 1.684], (0.58, 1.684], (-0.524, 0.58], (0.58, 1.684], ..., (-0.524, 0.58], (0.58, 1.684], (-0.524, 0.58], (-2.736, -1.628], (-1.628, -0.524]]
Length: 20
Categories (4, interval[float64]): [(-2.736, -1.628] < (-1.628, -0.524] < (-0.524, 0.58] < (0.58, 1.684]]
# 精确5位
pd.cut(arr,4,precision=5)
[(-0.52402, 0.57995], (0.57995, 1.68392], (0.57995, 1.68392], (-0.52402, 0.57995], (0.57995, 1.68392], ..., (-0.52402, 0.57995], (0.57995, 1.68392], (-0.52402, 0.57995], (-2.73637, -1.62799], (-1.62799, -0.52402]]
Length: 20
Categories (4, interval[float64]): [(-2.73637, -1.62799] < (-1.62799, -0.52402] < (-0.52402, 0.57995] < (0.57995, 1.68392]]
# include_lowest: bool = False
bins
[18, 40, 60, 100, 801]
ages
[16, 20, 24, 28, 30, 38, 40, 44, 47, 54, 56, 61, 77, 88, 99, 800]
pd.cut(ages,bins)
[NaN, (18.0, 40.0], (18.0, 40.0], (18.0, 40.0], (18.0, 40.0], ..., (60, 100], (60, 100], (60, 100], (60, 100], (100, 801]]
Length: 16
Categories (4, interval[int64]): [(18, 40] < (40, 60] < (60, 100] < (100, 801]]
pd.cut(ages,bins,include_lowest = True)
[NaN, (17.999, 40.0], (17.999, 40.0], (17.999, 40.0], (17.999, 40.0], ..., (60.0, 100.0], (60.0, 100.0], (60.0, 100.0], (60.0, 100.0], (100.0, 801.0]]
Length: 16
Categories (4, interval[float64]): [(17.999, 40.0] < (40.0, 60.0] < (60.0, 100.0] < (100.0, 801.0]]
duplicates: str = 'raise' / 'drop'
# Build sample data (1..5 repeated four times) and cut it with bin edges that
# contain a duplicate (2 appears twice); duplicates='drop' removes the repeat.
arr2 = list(range(1, 6)) * 4
pd.cut(
    arr2,
    bins=[1, 2, 2, 3, 4, 5],
    include_lowest=True,
    duplicates='drop',
)
[(0.999, 2.0], (0.999, 2.0], (2.0, 3.0], (3.0, 4.0], (4.0, 5.0], ..., (0.999, 2.0], (0.999, 2.0], (2.0, 3.0], (3.0, 4.0], (4.0, 5.0]]
Length: 20
Categories (4, interval[float64]): [(0.999, 2.0] < (2.0, 3.0] < (3.0, 4.0] < (4.0, 5.0]]
pd.qcut()
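pd.qcut(
x, //被分割的一维数组或Series
q, //可以是整数(等分的份数),也可以是分位点列表,如[0, 0.1, 0.3, 0.6, 1]
labels=None, //给每个区间取别名
retbins: bool = False, //为True时同时返回区间边界数组
precision: int = 3, //区间边界的显示精度,默认保留小数点后3位
duplicates: str = 'raise', //区间边界有重复值时:'raise'报错,'drop'去掉重复的边界
)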
# 随机100个人的年龄
array=np.random.randint(1,101,100)
print(array.max(),array.min())
pd.qcut(array,4)
100 1
[(0.999, 29.5], (59.5, 74.25], (0.999, 29.5], (0.999, 29.5], (59.5, 74.25], ..., (59.5, 74.25], (59.5, 74.25], (74.25, 100.0], (74.25, 100.0], (29.5, 59.5]]
Length: 100
Categories (4, interval[float64]): [(0.999, 29.5] < (29.5, 59.5] < (59.5, 74.25] < (74.25, 100.0]]
pd.qcut(array,4).value_counts()
(0.999, 29.5] 25
(29.5, 59.5] 25
(59.5, 74.25] 25
(74.25, 100.0] 25
dtype: int64
pd.qcut(array,4,retbins=True)[-1]
array([ 1. , 29.5 , 59.5 , 74.25, 100. ])
pd.qcut(array,q=[0, 0.1, 0.3, 0.6, 1]).value_counts()
(0.999, 12.8] 10
(12.8, 34.0] 21
(34.0, 68.0] 30
(68.0, 100.0] 39
dtype: int64
检查和过滤异常值
# 假设统计马拉松选手的年龄 18-60
age=Series(np.random.randint(10,70,3000))
age
0 33
1 59
2 11
3 63
4 27
..
2995 33
2996 56
2997 14
2998 23
2999 12
Length: 3000, dtype: int32
age.describe()
count 3000.000000
mean 39.600333
std 17.397610
min 10.000000
25% 25.000000
50% 40.000000
75% 55.000000
max 69.000000
dtype: float64
18-60有多少人
pd.cut(age,[18,60]).count()
2098
pd.cut(age,[18,60],include_lowest=False)
0 NaN
1 NaN
2 (18.0, 60.0]
3 (18.0, 60.0]
4 (18.0, 60.0]
...
2995 (18.0, 60.0]
2996 (18.0, 60.0]
2997 (18.0, 60.0]
2998 NaN
2999 (18.0, 60.0]
Length: 3000, dtype: category
Categories (1, interval[int64]): [(18, 60]]
pd.cut(age,[18,60],include_lowest=True).value_counts()
(17.999, 60.0] 2138
dtype: int64
age.apply(lambda x:x if x>=18 and x<=60 else NA).count()
2138
age.apply(lambda x:x if 18<=x<=60 else NA).count()
2138
age.value_counts(bins=[18,60])
(17.999, 60.0] 2138
dtype: int64
# One-liner equivalent: age[age >= 18][age[age >= 18] <= 60]
# Step 1: keep only the ages that are at least 18.
a18 = age[age >= 18]
# Step 2: from those, keep the ages that are at most 60. The mask is built
# from a18 itself rather than recomputing age[age >= 18] a second time.
a60 = a18[a18 <= 60]
a60
2 57
3 53
4 32
5 44
7 28
..
2993 52
2995 44
2996 49
2997 23
2999 45
Length: 2138, dtype: int32
# 假设参赛人数就是3000人,小于18替换成18,大于60替换成60
age2=age.copy()
def a(x):
    """Clamp a single age to the allowed range of 18 to 60 (inclusive).

    Args:
        x: One age value (the function is applied element-wise via
            Series.apply).

    Returns:
        18 if x is below 18, 60 if x is above 60, otherwise x unchanged.
    """
    if x < 18:
        return 18
    if x > 60:
        return 60
    return x
age2.apply(a)
0 60
1 60
2 57
3 53
4 32
..
2995 44
2996 49
2997 23
2998 18
2999 45
Length: 3000, dtype: int64
age2[age2<18]=18
age2[age2>60]=60
age2
0 60
1 60
2 57
3 53
4 32
..
2995 44
2996 49
2997 23
2998 18
2999 45
Length: 3000, dtype: int32
age.replace(np.arange(10,18),18).replace(np.arange(60,70),60)
0 33
1 59
2 18
3 60
4 27
..
2995 33
2996 56
2997 18
2998 23
2999 18
Length: 3000, dtype: int32
age.apply(lambda x:18 if x<18 else x).apply(lambda x:60 if x>60 else x)
0 33
1 59
2 18
3 60
4 27
..
2995 33
2996 56
2997 18
2998 23
2999 18
Length: 3000, dtype: int64
# 创建一个数据
data=DataFrame(np.random.randn(1000,4),columns=list('ABCD'))
data
A B C D
0 1.248690 0.590385 1.110154 0.464644
1 0.861221 0.636173 -0.275387 -0.837645
2 -0.346336 0.656188 -0.896620 -1.685180
3 0.616638 1.568438 0.118372 0.461475
4 -0.683593 -1.541776 -0.739948 -0.813597
... ... ... ... ...
995 1.002076 0.170804 -0.657350 -0.280548
996 0.223570 0.572386 0.440916 0.227563
997 0.300424 -0.297029 -0.853053 -0.221811
998 0.882274 -0.295493 -0.028093 -0.586357
999 0.602469 -1.618662 -0.361619 1.454978
1000 rows × 4 columns
# 假设绝对值大于3的都是异常值
data.describe()
A B C D
count 1000.000000 1000.000000 1000.000000 1000.000000
mean 0.029923 0.016552 0.028702 -0.013518
std 0.972445 0.985388 1.004089 1.004416
min -2.875057 -2.812322 -2.887780 -3.249605
25% -0.590531 -0.703905 -0.626383 -0.723978
50% 0.025684 0.002545 0.016045 -0.005166
75% 0.689000 0.702462 0.692086 0.629799
max 2.841240 3.260667 3.115521 3.876988
# 取出一列的数据
data.A[data.A.abs()>3]
Series([], Name: A, dtype: float64)
data.B[data.B.abs()>3]
213 3.260667
Name: B, dtype: float64
data.C[data.C.abs()>3]
152 3.115521
974 3.035219
Name: C, dtype: float64
# 我们还可以直接用 DataFrame 与3比较
np.abs(data)>3
A B C D
0 False False False False
1 False False False False
2 False False False False
3 False False False False
4 False False False False
... ... ... ... ...
995 False False False False
996 False False False False
997 False False False False
998 False False False False
999 False False False False
1000 rows × 4 columns
# 按列检查(axis=0):某一列中只要有一个True,该列的结果就是True
(np.abs(data)>3).any(axis=0)
A False
B True
C True
D True
dtype: bool
data[(np.abs(data)>3).any(axis=1)]
A B C D
47 0.242115 -0.370446 -1.052771 3.087065
73 0.642316 0.177110 0.461608 -3.249605
152 -0.375821 0.734654 3.115521 2.056550
156 0.553986 -2.763215 -1.834051 3.224172
171 -0.629950 -1.143192 -1.018008 3.876988
213 -0.046400 3.260667 -0.944473 -1.954977
974 1.922177 -0.843981 3.035219 1.146074
# Cap every value whose absolute value exceeds 3 at +3 or -3; np.sign()
# supplies the sign, so the multiplier must be 3 (multiplying by 1 would turn
# outliers into +1/-1 instead of clipping them to the threshold).
# After this, result.describe() should report min >= -3 and max <= 3.
result = data.applymap(lambda x: np.sign(x) * 3 if np.abs(x) > 3 else x)
result
A B C D
0 1.248690 0.590385 1.110154 0.464644
1 0.861221 0.636173 -0.275387 -0.837645
2 -0.346336 0.656188 -0.896620 -1.685180
3 0.616638 1.568438 0.118372 0.461475
4 -0.683593 -1.541776 -0.739948 -0.813597
... ... ... ... ...
995 1.002076 0.170804 -0.657350 -0.280548
996 0.223570 0.572386 0.440916 0.227563
997 0.300424 -0.297029 -0.853053 -0.221811
998 0.882274 -0.295493 -0.028093 -0.586357
999 0.602469 -1.618662 -0.361619 1.454978
1000 rows × 4 columns
result.describe()
A B C D
count 1000.000000 1000.000000 1000.000000 1000.000000
mean 0.029923 0.014292 0.024551 -0.018457
std 0.972445 0.980519 0.995731 0.983431
min -2.875057 -2.812322 -2.887780 -2.893340
25% -0.590531 -0.703905 -0.626383 -0.723978
50% 0.025684 0.002545 0.016045 -0.005166
75% 0.689000 0.702462 0.692086 0.629799
max 2.841240 2.974342 2.996673 2.944671
# 如果某行中有绝对值大于3的值,就把这一整行的值全部换成3或-3(按各自的符号)
data[(np.abs(data)>3).any(axis=1)]
A B C D
156 1.390349 3.574180 1.051485 0.802359
260 0.177381 3.059101 0.126408 -0.333252
349 -0.312206 -0.783507 -3.268029 -0.580108
420 -0.736174 -1.469933 -3.500177 0.985998
441 -3.101349 0.088352 0.639862 -0.947059
564 -1.361672 1.209893 0.745126 3.201756
708 -4.582029 0.253172 -0.349547 -0.766278
725 -0.501132 3.040466 0.041103 1.447682
764 -2.024533 -2.890702 3.176310 -0.678607
864 2.620795 0.434764 -3.037960 0.038708
920 0.813636 -0.899566 0.510295 -3.106799
# 每个数的符号位*3
np.sign(data)*3
A B C D
0 3.0 3.0 3.0 3.0
1 3.0 3.0 -3.0 -3.0
2 -3.0 3.0 -3.0 -3.0
3 3.0 3.0 3.0 3.0
4 -3.0 -3.0 -3.0 -3.0
... ... ... ... ...
995 3.0 3.0 -3.0 -3.0
996 3.0 3.0 3.0 3.0
997 3.0 -3.0 -3.0 -3.0
998 3.0 -3.0 -3.0 -3.0
999 3.0 -3.0 -3.0 3.0
1000 rows × 4 columns
# Assign into the rows selected by the boolean row mask: only rows whose index
# matches the mask are replaced, taking their values from np.sign(data)*3.
# NOTE(review): every column of a selected row is overwritten with +3/-3, not
# only the cells whose absolute value exceeds 3 — confirm this is intended.
data[(np.abs(data)>3).any(axis=1)]=np.sign(data)*3
data
A B C D
0 1.248690 0.590385 1.110154 0.464644
1 0.861221 0.636173 -0.275387 -0.837645
2 -0.346336 0.656188 -0.896620 -1.685180
3 0.616638 1.568438 0.118372 0.461475
4 -0.683593 -1.541776 -0.739948 -0.813597
... ... ... ... ...
995 1.002076 0.170804 -0.657350 -0.280548
996 0.223570 0.572386 0.440916 0.227563
997 0.300424 -0.297029 -0.853053 -0.221811
998 0.882274 -0.295493 -0.028093 -0.586357
999 0.602469 -1.618662 -0.361619 1.454978
1000 rows × 4 columns
data.loc[512:515]
A B C D
512 0.720544 0.043279 -2.819262 -2.893340
513 -1.476251 1.031785 0.206463 -0.874016
514 0.663384 -0.357441 -0.215252 -1.523059
515 0.058805 1.085078 1.090659 -0.840972
Series.apply(某个函数)
DataFrame.applymap(某个函数)
都是对单个值(逐元素)处理的;注意:pandas 2.1 起 applymap 已被弃用,推荐改用 DataFrame.map