导入模块
import pandas as pd
import numpy as np
1.离散化和面元的划分
不指定面元的界限
cut()函数 根据数据的范围等分为指定数量面元
# Sample values to discretise.
data = [12, 30, 34, 66, 43, 56, 55, 28, 97, 78, 24, 88, 87, 80, 5]
# With an integer `bins`, cut() splits the value range of `data` into that
# many equal-width bins.
pd.cut(data, bins=5)
'''返回
[(4.908, 23.4], (23.4, 41.8], (23.4, 41.8], (60.2, 78.6], (41.8, 60.2], ..., (23.4, 41.8], (78.6, 97.0], (78.6, 97.0], (78.6, 97.0], (4.908, 23.4]]
Length: 15
Categories (5, interval[float64]): [(4.908, 23.4] < (23.4, 41.8] < (41.8, 60.2] < (60.2, 78.6] < (78.6, 97.0]]
'''
qcut()函数 根据数据的分布情况,确保每个面元个体数相同,划分为指定数量面元
# Sample values to discretise.
data = [12, 30, 34, 66, 43, 56, 55, 28, 97, 78, 24, 88, 87, 80, 5]
# qcut() places the bin edges at sample quantiles, so each of the 5 bins
# receives the same number of elements for this data.
result = pd.qcut(data, q=5)
result
'''返回
[(4.999, 27.2], (27.2, 39.4], (27.2, 39.4], (60.0, 81.4], (39.4, 60.0], ..., (4.999, 27.2], (81.4, 97.0], (81.4, 97.0], (60.0, 81.4], (4.999, 27.2]]
Length: 15
Categories (5, interval[float64]): [(4.999, 27.2] < (27.2, 39.4] < (39.4, 60.0] < (60.0, 81.4] < (81.4, 97.0]]
'''
统计每个面元内的个体数量
# Count how many elements fall into each bin.
# Series.value_counts() is used because the top-level pd.value_counts()
# helper is deprecated in recent pandas releases (2.1+).
pd.Series(result).value_counts()
'''
(4.999, 27.2] 3
(27.2, 39.4] 3
(39.4, 60.0] 3
(60.0, 81.4] 3
(81.4, 97.0] 3
'''
指定面元划分区间
# Sample values to discretise.
data = [12, 30, 34, 66, 43, 56, 55, 28, 97, 78, 24, 88, 87, 80, 5]
# Explicit bin edges: (0, 25], (25, 50], (50, 75], (75, 100].
bins = [0, 25, 50, 75, 100]
# Assign every value to the interval it falls into.
result = pd.cut(data, bins=bins)
result
'''
[(0, 25], (25, 50], (25, 50], (50, 75], (25, 50], ..., (0, 25], (75, 100], (75, 100], (75, 100], (0, 25]]
'''
查看面元
# Show the interval bins that the values in `result` were divided into.
result.categories
'''
IntervalIndex([(0, 25], (25, 50], (50, 75], (75, 100]],
closed='right',
dtype='interval[int64]')
'''
查看每个元素所属的面元的索引
# Show, for each element, the integer index of the bin it was assigned to.
result.codes
'''
array([0, 1, 1, 2, 1, 2, 2, 1, 3, 3, 0, 3, 3, 3, 0], dtype=int8)
'''
统计每个面元内元素的数量
# Count how many elements fall into each bin.
# Series.value_counts() is used because the top-level pd.value_counts()
# helper is deprecated in recent pandas releases (2.1+).
pd.Series(result).value_counts()
'''
(75, 100] 5
(25, 50] 4
(0, 25] 3
(50, 75] 3
'''
为每个面元赋指定的标签
# Sample values to discretise.
data = [12, 30, 34, 66, 43, 56, 55, 28, 97, 78, 24, 88, 87, 80, 5]
# One label per bin, in the same order as the intervals defined by `bins`.
bin_names = ['D', 'C', 'B', 'A']
# Same binning as before, but each bin is reported by its label instead of
# its interval.
pd.cut(data, bins=bins, labels=bin_names)
'''
['D', 'C', 'C', 'B', 'C', ..., 'D', 'A', 'A', 'A', 'D']
'''
2.异常值检测和过滤
创建一个包含3列、每一列1000个随机数的DataFrame对象,并查看其描述性统计量
# Build a DataFrame with 1000 rows and 3 columns of random numbers drawn
# with np.random.randn.
randframe = pd.DataFrame(data=np.random.randn(1000, 3))
# Show the descriptive statistics of every column.
randframe.describe()
假设把绝对值大于3倍标准差的元素视为异常值
randframe.std() # standard deviation of each column
'''
0 1.014321
1 0.997436
2 1.033215
'''
借助any()函数筛选出含有异常值的行
any(1) 中的 1 是 axis 参数(即 axis=1),表示按行判断:一行中只要有任意一个元素为 True,该行就会被选出;较新版本的 pandas 需要写成 any(axis=1)
# Flag every cell whose absolute value exceeds three times the standard
# deviation of its column, then keep the rows containing at least one
# flagged cell.
# `axis` is passed by keyword: pandas 2.0+ accepts the arguments of
# DataFrame.any() as keywords only, so the positional form any(1) fails there.
randframe[(np.abs(randframe) > 3 * randframe.std()).any(axis=1)]
参考:
法比奥·内利. Python数据分析实战:第2版.北京:人民邮电出版社, 2019.11.