一、MAD介绍
MAD(median absolute deviation)绝对中位差
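在统计学中,绝对中位差(MAD)是对单变量数值型数据离散程度的一种鲁棒度量,用于刻画样本的变异性:$\mathrm{MAD} = \operatorname{median}\left(\lvert X_i - \operatorname{median}(X) \rvert\right)$。由于基于中位数而非均值,它比标准差更不易受异常值影响。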
from __future__ import division
from __future__ import print_function
import numpy as np
from sklearn.utils import check_array
from .base import BaseDetector
# Check the dimensionality of X (the MAD algorithm only handles univariate data)
def _check_dim(X):
    """Ensure that ``X`` holds univariate data.

    Parameters
    ----------
    X : numpy array
        Either 1-D of shape (n_samples,) or 2-D of shape (n_samples, 1).

    Raises
    ------
    ValueError
        If ``X`` has a second axis whose length is not 1.
    """
    # A 1-D array is a single feature by construction and has no second
    # axis to inspect; indexing shape[1] on it would raise IndexError.
    if X.ndim == 1:
        return
    n_features = X.shape[1]
    if n_features != 1:
        raise ValueError('MAD algorithm is just for univariate data. '
                         'Got Data with {} Dimensions.'.format(n_features))
class MAD(BaseDetector):
    """Univariate outlier detector based on the median absolute deviation.

    Every sample is scored with the modified z-score
    ``0.6745 * |x - median| / MAD``, where ``median`` is the median of the
    training data and ``MAD`` is the median of the absolute deviations from
    that median. Samples whose score exceeds ``threshold`` are labelled 1.

    Parameters
    ----------
    threshold : float or int, optional (default=3.5)
        Cut-off applied to the modified z-score.

    Attributes
    ----------
    threshold_ : float or int
        Copy of ``threshold`` stored by ``fit``.
    median_ : float
        Median of the training data, NaNs ignored.
    median_diff_ : float
        Median absolute deviation of the training data, NaNs ignored.
    decision_scores_ : numpy array of shape (n_samples,)
        Modified z-scores of the training data.
    labels_ : numpy array of shape (n_samples,)
        1 where the training score exceeds the threshold, otherwise 0.
    """

    def __init__(self, threshold=3.5):
        # contamination is unneeded since threshold must be
        # decided manually by the user
        super(MAD, self).__init__()
        if not isinstance(threshold, (float, int)):
            raise TypeError(
                'threshold must be a number. Got {}'.format(type(threshold)))
        self.threshold = threshold

    def fit(self, X, y=None):
        """Estimate the median and MAD from ``X`` and score the training set.

        Parameters
        ----------
        X : array-like
            Univariate training samples; validated by ``check_array`` and
            ``_check_dim``. NaN/inf values are let through validation
            (``force_all_finite=False``).
        y : ignored by the scoring itself
            Forwarded to ``self._set_n_classes`` (defined outside this class).

        Returns
        -------
        self : MAD
            The fitted detector.
        """
        X = check_array(X, ensure_2d=False, force_all_finite=False)
        _check_dim(X)
        self._set_n_classes(y)
        self.threshold_ = self.threshold
        # Clear statistics from any previous fit so `_mad` re-estimates them
        # from this training set instead of reusing stale values.
        self.median_ = None
        self.median_diff_ = None
        # X is already validated, so score it directly; going through
        # decision_function() would only validate it a second time.
        self.decision_scores_ = self._mad(X)
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Return the modified z-score of each sample in ``X``.

        Scores are computed against the median and MAD learned in ``fit``.

        Parameters
        ----------
        X : array-like
            Univariate samples; validated the same way as in ``fit``.

        Returns
        -------
        scores : numpy array of shape (n_samples,)
            Larger values are further from the training median.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called. This subclasses AttributeError,
            which is what an unfitted call raised previously.
        """
        X = check_array(X, ensure_2d=False, force_all_finite=False)
        _check_dim(X)
        if (getattr(self, 'median_', None) is None
                or getattr(self, 'median_diff_', None) is None):
            # Imported here so the check needs no change to the module imports.
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                "This {} instance is not fitted yet. Call 'fit' before "
                "using 'decision_function'.".format(type(self).__name__))
        return self._mad(X)

    def _mad(self, X):
        """Compute modified z-scores, estimating the statistics if unset.

        ``median_`` and ``median_diff_`` are estimated from ``X`` only when
        they are None (``fit`` resets them to None right before calling
        this); otherwise the stored values are reused.
        """
        obs = np.reshape(X, (-1, 1))
        if self.median_ is None:
            self.median_ = np.nanmedian(obs)
        diff = np.abs(obs - self.median_)
        if self.median_diff_ is None:
            self.median_diff_ = np.nanmedian(diff)
        # A zero MAD makes this a division by zero. The resulting inf/NaN are
        # mapped to finite numbers by nan_to_num below, so the floating-point
        # warnings are expected here and silenced.
        with np.errstate(divide='ignore', invalid='ignore'):
            scores = 0.6745 * diff / self.median_diff_
        # NaN (NaN input or 0/0) becomes 0; +inf becomes the largest float.
        return np.nan_to_num(np.ravel(scores))

    def _process_decision_scores(self):
        """Derive ``labels_`` and summary statistics from the training scores.

        Returns
        -------
        self : MAD
        """
        # Use the fitted copy of the threshold stored by fit().
        self.labels_ = (self.decision_scores_ > self.threshold_).astype('int').ravel()
        # Mean and standard deviation of the scores. The original comment
        # says these are for predict_proba(), which is not defined in this
        # class (presumably provided by BaseDetector; confirm).
        self._mu = np.nanmean(self.decision_scores_)
        self._sigma = np.nanstd(self.decision_scores_)
        return self