# python_异常值_EllipticEnvelope法和四分位差法
# 加载库
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs
# 创建模拟数据
# sklearn 中 make_blobs模块使用
# sklearn.datasets.make_blobs(n_samples=100, n_features=2, centers=3, cluster_std=1.0, center_box=(-10.0, 10.0), shuffle=True, random_state=None)
# 属性含义:
# n_samples: int, optional (default=100)
# The total number of points equally divided among clusters.
# 待生成的样本的总数。
# n_features: int, optional (default=2)
# The number of features for each sample.
# 每个样本的特征数。
# centers: int or array of shape [n_centers, n_features], optional (default=3)
# The number of centers to generate, or the fixed center locations.
# 要生成的样本中心(类别)数,或者是确定的中心点。
# cluster_std: float or sequence of floats, optional (default=1.0)
# The standard deviation of the clusters.
# 每个类别的标准差,例如我们希望生成2类数据,其中一类比另一类具有更大的标准差,可以将cluster_std设置为[1.0,3.0]。
# Generate 10 two-feature samples around a single center; random_state is
# fixed so the generated values are reproducible.
features, _ = make_blobs(
    n_samples=10,
    n_features=2,
    centers=1,
    random_state=1,
)
features
# Output recorded in the original session for the expression above
# (kept as a comment so the file can run as a script):
# array([[-1.83198811, 3.52863145],
#        [-2.76017908, 5.55121358],
#        [-1.61734616, 4.98930508],
#        [-0.52579046, 3.3065986 ],
#        [ 0.08525186, 3.64528297],
#        [-0.79415228, 2.10495117],
#        [-1.34052081, 4.15711949],
#        [-1.98197711, 4.02243551],
#        [-2.18773166, 3.33352125],
#        [-0.19745197, 2.34634916]])

# Overwrite the first sample with an extreme value so that it becomes an
# obvious outlier.
features[0, 0] = 10000
features[0, 1] = 10000
features

# contamination is the assumed proportion of outliers in the data set
# (here 10%, i.e. one of the ten samples).
outlier_detector = EllipticEnvelope(contamination=0.1)
# Fit the detector on the data.
outlier_detector.fit(features)
# Predict a label per sample; in the recorded output below the replaced
# first sample is labelled -1 and all others 1.
outlier_detector.predict(features)
# Output recorded in the original session:
# array([-1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

# Extract the first feature column; it is the input to the IQR-based
# detector called further down.
feature = features[:, 0]
feature
#四分位差法
# # 创建一个函数返回异常值下标
def indicies_of_outliers(x, *, multiplier=1.5):
    """Return the indices of the values in ``x`` outside the IQR fences.

    A value is reported as an outlier when it is strictly greater than
    ``Q3 + multiplier * IQR`` or strictly less than ``Q1 - multiplier * IQR``,
    where ``Q1``/``Q3`` are the 25th/75th percentiles and ``IQR = Q3 - Q1``.

    Args:
        x: Array-like of numbers (NumPy array, list, tuple, ...).
        multiplier: Width of the fences in units of the IQR. Defaults to
            1.5, the value that was previously hard-coded.

    Returns:
        The tuple produced by ``np.where``: one index array per dimension
        of ``x`` (a 1-tuple for one-dimensional input).
    """
    # Convert up front so plain Python sequences support the element-wise
    # comparisons below.
    values = np.asarray(x)
    if values.size == 0:
        # Percentiles are undefined for empty input; report no outliers
        # while keeping the np.where return shape.
        return np.where(np.zeros(values.shape, dtype=bool))

    q1, q3 = np.percentile(values, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr * multiplier)
    upper_bound = q3 + (iqr * multiplier)
    return np.where((values > upper_bound) | (values < lower_bound))
# Run the IQR-based detector on the first feature column.
indicies_of_outliers(feature)
# Output recorded in the original session (kept as a comment so the file
# can run as a script):
# (array([0], dtype=int64),)