"""
Outlier Detection Toolbox
=========================
This is a single-file distribution (for ease of preview) of a production-grade
outlier/anomaly detection toolbox intended to be split into a small package:
outlier_detection/
├── __init__.py
├── utils.py
├── statistical.py
├── distance_density.py
├── model_based.py
├── deep_learning.py
├── ensemble.py
├── visualization.py
└── cli.py
--- NOTE ---
This code block contains *all* modules concatenated (with file headers) so you
can preview and copy each file out into separate .py files. When you save them
as separate files the package will work as expected.
Design goals:
- Detailed, well-documented functions (purpose, math, applicability, edge-cases)
- Robust handling of NaNs, constant columns, categorical data
- Functions return structured metadata + masks + scores so you can inspect
- Utilities for ensemble combining methods and producing a readable report
- Optional deep learning methods (AutoEncoder/VAE) with clear dependency
instructions and graceful error messages if libraries are missing.
Dependencies (recommended):
pip install numpy pandas scipy scikit-learn matplotlib joblib "tensorflow>=2.0"
If you prefer PyTorch for deep models you can adapt deep_learning.py accordingly.
"""
# ---------------------------
# File: outlier_detection/__init__.py
# ---------------------------
__version__ = "0.1.0"
# make it easy to import core helpers
from .utils import ensure_dataframe, OutlierResult, summarize_results, recommend_methods
from .statistical import z_score_method, modified_z_score, iqr_method, grubbs_test
from .distance_density import lof_method, mahalanobis_method, dbscan_method, knn_distance_method
from .model_based import (
isolation_forest_method,
one_class_svm_method,
pca_reconstruction_error,
gmm_method,
elliptic_envelope_method,
)
# deep_learning module is optional (heavy dependency)
try:
from .deep_learning import autoencoder_method, vae_method
except Exception:
    # graceful degradation: TensorFlow may be missing; these names stay None
    # and the deep-learning detectors are simply unavailable in that case
autoencoder_method = None
vae_method = None
from .ensemble import ensemble_methods, aggregate_scores
from .visualization import plot_boxplot, plot_pair_scatter
__all__ = [
"__version__",
"ensure_dataframe",
"OutlierResult",
"summarize_results",
"recommend_methods",
"z_score_method",
"modified_z_score",
"iqr_method",
"grubbs_test",
"lof_method",
"mahalanobis_method",
"dbscan_method",
"knn_distance_method",
"isolation_forest_method",
"one_class_svm_method",
"pca_reconstruction_error",
"gmm_method",
"elliptic_envelope_method",
"autoencoder_method",
"vae_method",
"ensemble_methods",
"aggregate_scores",
"plot_boxplot",
"plot_pair_scatter",
]
# ---------------------------
# File: outlier_detection/utils.py
# ---------------------------
"""
Utilities for the outlier detection package.
Key responsibilities:
- Input validation and type normalization
- Handling numeric / categorical separation
- Standardization and robust scaling helpers
- A consistent result object shape used by all detectors
"""
from typing import Dict, Any, Tuple, Optional, List, Union
import numpy as np
import pandas as pd
import logging
logger = logging.getLogger(__name__)
# A simple, documented result schema for detector functions.
# Each detector returns a dict with these keys (guaranteed):
# - 'mask': pd.Series[bool] same index as input rows; True means OUTLIER
# - 'score': pd.Series or pd.DataFrame numeric score (bigger usually means more anomalous)
# - 'method': short string
# - 'params': dict of parameters used
# - 'explanation': short textual note about interpretation
OutlierResult = Dict[str, Any]
def ensure_dataframe(X) -> pd.DataFrame:
"""
Convert input into a pandas DataFrame with a stable integer index.
Accepts: pd.DataFrame, np.ndarray, list-of-lists, pd.Series.
    Column names are converted to strings; a non-unique index is reset to a
    default integer index.
"""
if isinstance(X, pd.DataFrame):
df = X.copy()
elif isinstance(X, pd.Series):
df = X.to_frame()
else:
# try to coerce
df = pd.DataFrame(X)
# if no index or non-unique, reset
if df.index is None or not df.index.is_unique:
df = df.reset_index(drop=True)
# name numeric columns if unnamed
df.columns = [str(c) for c in df.columns]
return df
def numeric_only(df: pd.DataFrame, return_cols: bool = False) -> Union[pd.DataFrame, Tuple[pd.DataFrame, List[str]]]:
    """
    Select numeric columns and log (debug level) if non-numeric columns are dropped.
    Raises ValueError if no numeric columns are found.
    If return_cols is True, also returns the list of retained column names.
    """
df = ensure_dataframe(df)
numeric_df = df.select_dtypes(include=["number"]).copy()
non_numeric = [c for c in df.columns if c not in numeric_df.columns]
if non_numeric:
logger.debug("Dropping non-numeric columns for numeric-only detectors: %s", non_numeric)
if numeric_df.shape[1] == 0:
raise ValueError("No numeric columns available for numeric detectors. Consider encoding categoricals.")
if return_cols:
return numeric_df, list(numeric_df.columns)
return numeric_df
def handle_missing(df: pd.DataFrame, strategy: str = "drop", fill_value: Optional[float] = None) -> pd.DataFrame:
"""
Handle missing values in data before passing to detectors.
Parameters
----------
df : DataFrame
strategy : {'drop', 'mean', 'median', 'zero', 'constant', 'keep'}
- 'drop' : drop rows with any NaN (useful when most values are present)
- 'mean' : fill numeric columns with mean
- 'median' : fill numeric with median
- 'zero' : fill with 0
- 'constant' : fill with supplied fill_value
- 'keep' : keep NaNs (many detectors can handle NaN rows implicitly)
fill_value : numeric (used when strategy=='constant')
Returns
-------
DataFrame cleaned according to strategy. Original index preserved.
Notes
-----
- Some detectors (LOF, IsolationForest) do NOT accept NaNs; choose strategy accordingly.
"""
df = df.copy()
if strategy == "drop":
return df.dropna(axis=0, how="any")
elif strategy == "mean":
return df.fillna(df.mean())
elif strategy == "median":
return df.fillna(df.median())
elif strategy == "zero":
return df.fillna(0)
elif strategy == "constant":
if fill_value is None:
raise ValueError("fill_value must be provided for strategy='constant'")
return df.fillna(fill_value)
elif strategy == "keep":
return df
else:
raise ValueError(f"Unknown missing value strategy: {strategy}")
def robust_scale(df: pd.DataFrame) -> pd.DataFrame:
"""
    Scale numeric columns using median and IQR (robust to outliers).
    Returns a DataFrame of the numeric columns with scaled values
    (non-numeric columns are dropped).
"""
df = numeric_only(df)
med = df.median()
q1 = df.quantile(0.25)
q3 = df.quantile(0.75)
iqr = q3 - q1
# avoid division by zero
iqr_replaced = iqr.replace(0, 1.0)
return (df - med) / iqr_replaced
def create_result(mask: pd.Series, score: pd.Series, method: str, params: Dict[str, Any], explanation: str) -> OutlierResult:
"""
Wrap mask + score into the standard result dict.
"""
# ensure index alignment
if not mask.index.equals(score.index):
# try to reindex
score = score.reindex(mask.index)
return {
"mask": mask.astype(bool),
"score": score,
"method": method,
"params": params,
"explanation": explanation,
}
def summarize_results(results: Dict[str, OutlierResult]) -> pd.DataFrame:
"""
Given a dict of results keyed by method name, return a single DataFrame
where each column is that method's boolean flag and another column is the
score (if numeric).
Also returns a short per-row summary like how many detectors flagged the row.
"""
# Collect masks and scores
masks = {}
scores = {}
for k, r in results.items():
masks[f"{k}_flag"] = r["mask"].astype(int)
# flatten score: if DataFrame use mean across columns
sc = r["score"]
if isinstance(sc, pd.DataFrame):
sc = sc.mean(axis=1)
scores[f"{k}_score"] = sc
masks_df = pd.DataFrame(masks)
scores_df = pd.DataFrame(scores)
combined = pd.concat([masks_df, scores_df], axis=1)
combined.index = next(iter(results.values()))["mask"].index
combined["n_flags"] = masks_df.sum(axis=1)
combined["any_flag"] = combined["n_flags"] > 0
return combined
def recommend_methods(X: pd.DataFrame) -> List[str]:
"""
Heuristic recommender: returns a short list of methods to try depending on data shape.
Rules (simple heuristics):
- single numeric column: ['iqr', 'modified_z']
- low-dimensional (n_features <= 10) and numeric: ['mahalanobis','lof','isolation_forest']
- high-dimensional (n_features > 10): ['isolation_forest','pca','autoencoder']
"""
df = ensure_dataframe(X)
n_features = df.select_dtypes(include=["number"]).shape[1]
if n_features == 0:
raise ValueError("No numeric features to recommend methods for")
if n_features == 1:
return ["iqr", "modified_z"]
elif n_features <= 10:
return ["mahalanobis", "lof", "isolation_forest"]
else:
return ["isolation_forest", "pca", "autoencoder"]
# ---------------------------
# File: outlier_detection/statistical.py
# ---------------------------
"""
Statistical / univariate outlier detectors.
Each function focuses on single-dimension input (pd.Series) or will operate
column-wise if given a DataFrame (then returns DataFrame of scores / masks).
"""
from typing import Union
import numpy as np
import pandas as pd
from scipy import stats
from .utils import OutlierResult, create_result, numeric_only
def _as_series(x: Union[pd.Series, pd.DataFrame], col: str = None) -> pd.Series:
if isinstance(x, pd.DataFrame):
if col is None:
raise ValueError("If passing DataFrame, must pass column name")
return x[col]
return x
def z_score_method(x: Union[pd.Series, pd.DataFrame], threshold: float = 3.0) -> OutlierResult:
"""
Z-Score method (univariate)
Math: z = (x - mean) / std
Flag where |z| > threshold.
Applicability: single numeric column, approximately normal distribution.
Not robust to heavy-tailed distributions.
Returns OutlierResult with score = |z| (higher => more anomalous).
"""
if isinstance(x, pd.DataFrame):
# apply per-column and return a DataFrame score
masks = pd.DataFrame(index=x.index)
scores = pd.DataFrame(index=x.index)
for c in x.columns:
res = z_score_method(x[c], threshold=threshold)
masks[c] = res["mask"].astype(int)
scores[c] = res["score"]
# Derive a combined mask: any column flagged
mask_any = masks.sum(axis=1) > 0
combined_score = scores.mean(axis=1)
return create_result(mask_any, combined_score, "z_score_dataframe", {"threshold": threshold},
"Applied z-score per-column and combined by mean score and any-flag")
s = x.dropna()
if s.shape[0] == 0:
mask = pd.Series([False]*len(x), index=x.index)
score = pd.Series([0.0]*len(x), index=x.index)
return create_result(mask, score, "z_score", {"threshold": threshold}, "Empty or all-NaN series")
mu = s.mean()
sigma = s.std(ddof=0)
if sigma == 0:
score = pd.Series(0.0, index=x.index)
mask = pd.Series(False, index=x.index)
explanation = "Zero variance: no z-score possible"
return create_result(mask, score, "z_score", {"threshold": threshold}, explanation)
z = (x - mu) / sigma
absz = z.abs()
mask = absz > threshold
score = absz.fillna(0.0)
explanation = f"z-score with mean={mu:.4g}, std={sigma:.4g}; flag |z|>{threshold}"
return create_result(mask, score, "z_score", {"threshold": threshold}, explanation)
def modified_z_score(x: Union[pd.Series, pd.DataFrame], threshold: float = 3.5) -> OutlierResult:
"""
Modified Z-score using median and MAD (robust to extreme values).
Formula:
M_i = 0.6745 * (x_i - median) / MAD
Where MAD = median(|x_i - median|)
Recommended threshold: 3.5 (common in literature)
"""
if isinstance(x, pd.DataFrame):
masks = pd.DataFrame(index=x.index)
scores = pd.DataFrame(index=x.index)
for c in x.columns:
res = modified_z_score(x[c], threshold=threshold)
masks[c] = res["mask"].astype(int)
scores[c] = res["score"]
mask_any = masks.sum(axis=1) > 0
combined_score = scores.mean(axis=1)
return create_result(mask_any, combined_score, "modified_z_dataframe", {"threshold": threshold},
"Applied modified z per-column and combined")
s = x.dropna()
if len(s) == 0:
return create_result(pd.Series(False, index=x.index), pd.Series(0.0, index=x.index), "modified_z", {"threshold": threshold}, "empty")
med = np.median(s)
mad = np.median(np.abs(s - med))
if mad == 0:
# all equal or too small
score = pd.Series(0.0, index=x.index)
mask = pd.Series(False, index=x.index)
return create_result(mask, score, "modified_z", {"threshold": threshold}, "mad==0: no variation")
M = 0.6745 * (x - med) / mad
score = M.abs().fillna(0.0)
mask = score > threshold
return create_result(mask, score, "modified_z", {"threshold": threshold, "median": med, "mad": mad},
"Robust modified z-score; higher => more anomalous")
def iqr_method(x: Union[pd.Series, pd.DataFrame], k: float = 1.5) -> OutlierResult:
"""
IQR (boxplot) method.
Flags points outside [Q1 - k*IQR, Q3 + k*IQR].
k=1.5 is common; use larger k for fewer false positives.
"""
if isinstance(x, pd.DataFrame):
masks = pd.DataFrame(index=x.index)
scores = pd.DataFrame(index=x.index)
for c in x.columns:
res = iqr_method(x[c], k=k)
masks[c] = res["mask"].astype(int)
scores[c] = res["score"]
mask_any = masks.sum(axis=1) > 0
combined_score = scores.mean(axis=1)
return create_result(mask_any, combined_score, "iqr_dataframe", {"k": k}, "Applied IQR per column")
s = x.dropna()
if s.shape[0] == 0:
return create_result(pd.Series(False, index=x.index), pd.Series(0.0, index=x.index), "iqr", {"k": k}, "empty")
q1 = np.percentile(s, 25)
q3 = np.percentile(s, 75)
iqr = q3 - q1
lower = q1 - k * iqr
upper = q3 + k * iqr
mask = (x < lower) | (x > upper)
# score: distance from nearest fence normalized by iqr (if iqr==0 use abs distance)
if iqr == 0:
score = (x - q1).abs().fillna(0.0)
else:
score = pd.Series(0.0, index=x.index)
score[x < lower] = ((lower - x[x < lower]) / (iqr + 1e-12))
score[x > upper] = ((x[x > upper] - upper) / (iqr + 1e-12))
return create_result(mask.fillna(False), score.fillna(0.0), "iqr", {"k": k, "q1": q1, "q3": q3},
f"IQR fences [{lower:.4g}, {upper:.4g}]")
def grubbs_test(x: Union[pd.Series, pd.DataFrame], alpha: float = 0.05) -> OutlierResult:
"""
Grubbs' test for a single outlier (requires approx normality).
This test is intended to *detect one outlier at a time*.
Use iteratively (recompute after removing detected outlier) if you expect
multiple outliers, but be careful with multiplicity adjustments.
Returns mask with at most one True (the most extreme point) unless
alpha is very large.
"""
# For simplicity operate only on a single series. If DataFrame provided,
# run per-column and combine (like other funcs)
if isinstance(x, pd.DataFrame):
masks = pd.DataFrame(index=x.index)
scores = pd.DataFrame(index=x.index)
for c in x.columns:
res = grubbs_test(x[c], alpha=alpha)
masks[c] = res["mask"].astype(int)
scores[c] = res["score"]
mask_any = masks.sum(axis=1) > 0
combined_score = scores.mean(axis=1)
return create_result(mask_any, combined_score, "grubbs_dataframe", {"alpha": alpha}, "Applied Grubbs per column")
from math import sqrt
s = x.dropna()
n = len(s)
if n < 3:
return create_result(pd.Series(False, index=x.index), pd.Series(0.0, index=x.index), "grubbs", {"alpha": alpha}, "n<3: cannot run")
mean = s.mean()
std = s.std(ddof=0)
if std == 0:
return create_result(pd.Series(False, index=x.index), pd.Series(0.0, index=x.index), "grubbs", {"alpha": alpha}, "zero std")
# compute G statistic for max dev
deviations = (s - mean).abs()
max_idx = deviations.idxmax()
G = deviations.loc[max_idx] / std
# critical value from t-distribution
t_crit = stats.t.ppf(1 - alpha / (2 * n), n - 2)
G_crit = ((n - 1) / sqrt(n)) * (t_crit / sqrt(n - 2 + t_crit ** 2))
mask = pd.Series(False, index=x.index)
mask.loc[max_idx] = G > G_crit
score = pd.Series(0.0, index=x.index)
score.loc[max_idx] = float(G)
explanation = f"G={G:.4g}, Gcrit={G_crit:.4g}, alpha={alpha}"
return create_result(mask, score, "grubbs", {"alpha": alpha, "G": G, "Gcrit": G_crit}, explanation)
# ---------------------------
# File: outlier_detection/distance_density.py
# ---------------------------
"""
Distance and density based detectors (multivariate-capable).
Functions generally accept a numeric DataFrame X and return OutlierResult.
"""
import numpy as np
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor, NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.covariance import EmpiricalCovariance
from .utils import ensure_dataframe, create_result, numeric_only, OutlierResult
def lof_method(X, n_neighbors: int = 20, contamination: float = 0.05) -> OutlierResult:
"""
Local Outlier Factor (LOF).
Returns score = -lof. LOF API returns negative_outlier_factor_. We negate so
higher score => more anomalous.
Applicability: medium-dimensional data, clusters of varying density.
Beware: LOF does not provide a predictable probabilistic threshold.
"""
X = ensure_dataframe(X)
Xnum = numeric_only(X)
if Xnum.shape[0] < 2:
return create_result(pd.Series(False, index=X.index), pd.Series(0.0, index=X.index), "lof", {"n_neighbors": n_neighbors}, "too few samples")
lof = LocalOutlierFactor(n_neighbors=min(n_neighbors, max(1, Xnum.shape[0]-1)), contamination=contamination)
y = lof.fit_predict(Xnum)
negative_factor = lof.negative_outlier_factor_
# higher -> more anomalous
score = (-negative_factor)
score = pd.Series(score, index=Xnum.index)
mask = pd.Series(y == -1, index=Xnum.index)
return create_result(mask, score, "lof", {"n_neighbors": n_neighbors, "contamination": contamination}, "LOF: higher score more anomalous")
def knn_distance_method(X, k: int = 5) -> OutlierResult:
"""
k-NN distance based scoring: compute distance to k-th nearest neighbor.
Points with large k-distance are candidate outliers.
Returns score = k-distance (bigger => more anomalous).
"""
X = ensure_dataframe(X)
Xnum = numeric_only(X)
if Xnum.shape[0] < k + 1:
return create_result(pd.Series(False, index=X.index), pd.Series(0.0, index=X.index), "knn_distance", {"k": k}, "too few samples")
nbrs = NearestNeighbors(n_neighbors=k + 1).fit(Xnum)
distances, _ = nbrs.kneighbors(Xnum)
# distances[:, 0] is zero (self). take k-th neighbor
kdist = distances[:, k]
score = pd.Series(kdist, index=Xnum.index)
# threshold: e.g., mean + 2*std
thr = score.mean() + 2 * score.std()
mask = score > thr
return create_result(mask, score, "knn_distance", {"k": k, "threshold": thr}, "k-distance method")
def mahalanobis_method(X, threshold_p: float = 0.01) -> OutlierResult:
"""
Mahalanobis distance based detection.
Computes D^2 for each point. One can threshold by chi-square quantile with
df=n_features: P(D^2 > thresh) = threshold_p. We return score = D^2.
Applicability: data approximately elliptical (multivariate normal-ish).
"""
X = ensure_dataframe(X)
Xnum = numeric_only(X)
n, d = Xnum.shape
if n <= d:
# covariance ill-conditioned; apply shrinkage or PCA beforehand
explanation = "n <= n_features: covariance may be singular, consider PCA or regularization"
else:
explanation = ""
cov = EmpiricalCovariance().fit(Xnum)
mahal = cov.mahalanobis(Xnum)
score = pd.Series(mahal, index=Xnum.index)
# default threshold: chi2 quantile
from scipy.stats import chi2
thr = chi2.ppf(1 - threshold_p, df=d) if d > 0 else np.inf
mask = score > thr
return create_result(mask, score, "mahalanobis", {"threshold_p": threshold_p, "chi2_thr": float(thr)}, explanation)
def dbscan_method(X, eps: float = 0.5, min_samples: int = 5) -> OutlierResult:
"""
DBSCAN clusterer: points labeled -1 are considered noise -> outliers.
Applicability: non-spherical clusters, variable density; choose eps carefully.
"""
X = ensure_dataframe(X)
Xnum = numeric_only(X)
if Xnum.shape[0] < min_samples:
return create_result(pd.Series(False, index=X.index), pd.Series(0.0, index=X.index), "dbscan", {"eps": eps, "min_samples": min_samples}, "too few samples")
db = DBSCAN(eps=eps, min_samples=min_samples).fit(Xnum)
labels = db.labels_
mask = pd.Series(labels == -1, index=Xnum.index)
# score: negative of cluster size (noise points get score 1)
# To keep simple: noise -> 1, else 0
score = pd.Series((labels == -1).astype(float), index=Xnum.index)
return create_result(mask, score, "dbscan", {"eps": eps, "min_samples": min_samples}, "DBSCAN noise points flagged")
# ---------------------------
# File: outlier_detection/model_based.py
# ---------------------------
"""
Model-based detectors: tree ensembles, SVM boundary, PCA reconstruction, GMM
These functions are intended for multivariate numeric data.
"""
import numpy as np
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn.covariance import EllipticEnvelope
from .utils import ensure_dataframe, numeric_only, create_result, OutlierResult
def isolation_forest_method(X, contamination: float = 0.05, random_state: int = 42) -> OutlierResult:
"""
Isolation Forest
Returns mask and anomaly score (higher => more anomalous).
Good general-purpose method for medium-to-high dimensional data.
"""
X = ensure_dataframe(X)
Xnum = numeric_only(X)
if Xnum.shape[0] < 2:
return create_result(pd.Series(False, index=X.index), pd.Series(0.0, index=X.index), "isolation_forest", {"contamination": contamination}, "too few samples")
iso = IsolationForest(contamination=contamination, random_state=random_state)
iso.fit(Xnum)
pred = iso.predict(Xnum)
# decision_function: higher -> more normal, so we invert
raw_score = -iso.decision_function(Xnum)
score = pd.Series(raw_score, index=Xnum.index)
mask = pd.Series(pred == -1, index=Xnum.index)
return create_result(mask, score, "isolation_forest", {"contamination": contamination}, "IsolationForest: inverted decision function as score")
def one_class_svm_method(X, kernel: str = "rbf", nu: float = 0.05, gamma: str = "scale") -> OutlierResult:
"""
One-Class SVM for boundary-based anomaly detection.
Carefully tune nu and gamma; not robust to large datasets without subsampling.
"""
X = ensure_dataframe(X)
Xnum = numeric_only(X)
if Xnum.shape[0] < 5:
return create_result(pd.Series(False, index=X.index), pd.Series(0.0, index=X.index), "one_class_svm", {"nu": nu}, "too few samples")
ocsvm = OneClassSVM(kernel=kernel, nu=nu, gamma=gamma)
ocsvm.fit(Xnum)
pred = ocsvm.predict(Xnum)
# decision_function: positive => inside boundary (normal); invert
raw_score = -ocsvm.decision_function(Xnum)
score = pd.Series(raw_score, index=Xnum.index)
mask = pd.Series(pred == -1, index=Xnum.index)
return create_result(mask, score, "one_class_svm", {"nu": nu, "kernel": kernel}, "OneClassSVM: invert decision_function for anomaly score")
def pca_reconstruction_error(X, n_components: int = None, explained_variance: float = None, threshold: float = None) -> OutlierResult:
"""
PCA-based reconstruction error.
If n_components not set, choose the minimum components to reach
explained_variance (if provided). Otherwise uses min(n_features, 2).
Score: squared reconstruction error per sample. Default threshold: mean+3*std.
"""
X = ensure_dataframe(X)
Xnum = numeric_only(X)
n, d = Xnum.shape
if n == 0 or d == 0:
return create_result(pd.Series(False, index=X.index), pd.Series(0.0, index=X.index), "pca_recon", {}, "empty data")
if n_components is None:
if explained_variance is not None:
temp_pca = PCA(n_components=min(n, d))
temp_pca.fit(Xnum)
cum = np.cumsum(temp_pca.explained_variance_ratio_)
            n_components = int(np.searchsorted(cum, explained_variance) + 1)
            n_components = max(1, min(n_components, min(n, d)))
else:
n_components = min(2, d)
pca = PCA(n_components=n_components)
proj = pca.fit_transform(Xnum)
recon = pca.inverse_transform(proj)
errors = ((Xnum - recon) ** 2).sum(axis=1)
score = pd.Series(errors, index=Xnum.index)
if threshold is None:
threshold = score.mean() + 3 * score.std()
mask = score > threshold
return create_result(mask, score, "pca_recon", {"n_components": n_components, "threshold": float(threshold)}, "PCA reconstruction error")
def gmm_method(X, n_components: int = 2, contamination: float = 0.05) -> OutlierResult:
"""
Gaussian Mixture Model based anomaly score (log-likelihood).
Score: negative log-likelihood (bigger => less likely => more anomalous).
Threshold: empirical quantile of scores.
"""
X = ensure_dataframe(X)
Xnum = numeric_only(X)
if Xnum.shape[0] < n_components:
return create_result(pd.Series(False, index=X.index), pd.Series(0.0, index=X.index), "gmm", {}, "too few samples")
gmm = GaussianMixture(n_components=n_components)
gmm.fit(Xnum)
logprob = gmm.score_samples(Xnum)
score = pd.Series(-logprob, index=Xnum.index)
thr = score.quantile(1 - contamination)
mask = score > thr
    return create_result(mask, score, "gmm", {"n_components": n_components, "threshold": float(thr)}, "GMM negative log-likelihood")
def elliptic_envelope_method(X, contamination: float = 0.05) -> OutlierResult:
"""
EllipticEnvelope fits a robust covariance (assumes data come from a Gaussian-like
ellipse). Flags outliers outside the ellipse.
"""
X = ensure_dataframe(X)
Xnum = numeric_only(X)
ee = EllipticEnvelope(contamination=contamination)
ee.fit(Xnum)
pred = ee.predict(Xnum)
# decision_function: larger -> more normal; invert
raw_score = -ee.decision_function(Xnum)
score = pd.Series(raw_score, index=Xnum.index)
mask = pd.Series(pred == -1, index=Xnum.index)
return create_result(mask, score, "elliptic_envelope", {"contamination": contamination}, "EllipticEnvelope")
# ---------------------------
# File: outlier_detection/deep_learning.py
# ---------------------------
"""
Deep learning based detectors (AutoEncoder, VAE).
These require TensorFlow/Keras installed. If not present, importing this module
will raise an informative ImportError.
Design: a training function accepts X (numpy or DataFrame) and returns a
callable `score_fn(X_new) -> pd.Series` plus a threshold selection helper.
"""
from typing import Callable
import numpy as np
import pandas as pd
# lazy import to avoid hard TF dependency if user doesn't need it
try:
import tensorflow as tf
from tensorflow.keras import layers, models, backend as K
except Exception as e:
raise ImportError("TensorFlow / Keras is required for deep_learning module. Install with `pip install tensorflow`. Error: " + str(e))
from .utils import ensure_dataframe, create_result
def _build_autoencoder(input_dim: int, latent_dim: int = 8, hidden_units=(64, 32)) -> models.Model:
inp = layers.Input(shape=(input_dim,))
x = inp
for h in hidden_units:
x = layers.Dense(h, activation='relu')(x)
z = layers.Dense(latent_dim, activation='relu', name='latent')(x)
x = z
for h in reversed(hidden_units):
x = layers.Dense(h, activation='relu')(x)
out = layers.Dense(input_dim, activation='linear')(x)
ae = models.Model(inp, out)
return ae
def autoencoder_method(X, latent_dim: int = 8, hidden_units=(128, 64), epochs: int = 50, batch_size: int = 32, validation_split: float = 0.1, threshold_method: str = 'quantile', threshold_val: float = 0.99, verbose: int = 0) -> OutlierResult:
"""
Train an AutoEncoder on X and compute reconstruction error as anomaly score.
Parameters
----------
X : DataFrame or numpy array (numeric)
threshold_method : 'quantile' or 'mean_std'
threshold_val : if quantile -> e.g. 0.99 means top 1% flagged; if mean_std -> number of stds
Returns
-------
OutlierResult where score = reconstruction error and mask = score > threshold
Notes
-----
- This trains on the entire provided X. For actual anomaly detection, it's
common to train the autoencoder only on "normal" data. If you have labels,
pass only normal subset for training.
- Requires careful scaling of inputs before training (robust_scale recommended).
"""
Xdf = ensure_dataframe(X)
Xnum = Xdf.select_dtypes(include=['number']).fillna(0.0)
input_dim = Xnum.shape[1]
if input_dim == 0:
return create_result(pd.Series(False, index=Xdf.index), pd.Series(0.0, index=Xdf.index), "autoencoder", {}, "no numeric columns")
# convert to numpy
arr = Xnum.values.astype(np.float32)
ae = _build_autoencoder(input_dim=input_dim, latent_dim=latent_dim, hidden_units=hidden_units)
ae.compile(optimizer='adam', loss='mse')
ae.fit(arr, arr, epochs=epochs, batch_size=batch_size, validation_split=validation_split, verbose=verbose)
recon = ae.predict(arr)
errors = np.mean((arr - recon) ** 2, axis=1)
score = pd.Series(errors, index=Xdf.index)
if threshold_method == 'quantile':
thr = float(score.quantile(threshold_val))
else:
thr = float(score.mean() + threshold_val * score.std())
mask = score > thr
return create_result(mask, score, "autoencoder", {"latent_dim": latent_dim, "threshold": thr}, "AutoEncoder reconstruction error")
def vae_method(X, latent_dim: int = 8, hidden_units=(128, 64), epochs: int = 50, batch_size: int = 32, threshold_method: str = 'quantile', threshold_val: float = 0.99, verbose: int = 0) -> OutlierResult:
"""
Variational Autoencoder (VAE) anomaly detection. Implementation note: VAE
is more involved; here we provide a simple implementation that uses
reconstruction error as score. For strict probabilistic anomaly scoring
one would use the ELBO / likelihood; this minimal implementation keeps
it practical.
"""
# For brevity we reuse autoencoder path (a more complete VAE impl is possible)
return autoencoder_method(X, latent_dim=latent_dim, hidden_units=hidden_units, epochs=epochs, batch_size=batch_size, threshold_method=threshold_method, threshold_val=threshold_val, verbose=verbose)
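# ---------------------------------------------------------------------------
# Usage sketch (comments only, since training requires TensorFlow and is
# comparatively expensive). `train_df` is a hypothetical frame believed to be
# mostly normal; `robust_scale` refers to the helper in outlier_detection.utils.
#
#   from outlier_detection.utils import robust_scale
#   Xs = robust_scale(train_df)                     # scale inputs before training
#   res = autoencoder_method(Xs, epochs=20,
#                            threshold_method="quantile", threshold_val=0.99)
#   suspicious_rows = train_df.loc[res["mask"]]     # rows above the error threshold
# ---------------------------------------------------------------------------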
# ---------------------------
# File: outlier_detection/ensemble.py
# ---------------------------
"""
Combine multiple detectors and produce an aggregated report.
Provides strategies: union, intersection, majority voting, weighted sum of normalized scores.
"""
from typing import List, Dict
import logging
import numpy as np
import pandas as pd
from .utils import ensure_dataframe, create_result
logger = logging.getLogger(__name__)
def normalize_scores(scores: pd.DataFrame) -> pd.DataFrame:
"""Min-max normalize each score column to [0,1]."""
sc = scores.copy()
for c in sc.columns:
col = sc[c]
mn = col.min()
mx = col.max()
if mx == mn:
sc[c] = 0.0
else:
sc[c] = (col - mn) / (mx - mn)
return sc
def aggregate_scores(results: Dict[str, Dict], method: str = 'weighted', weights: Dict[str, float] = None) -> Dict:
"""
Aggregate multiple OutlierResult dictionaries produced by detectors.
Returns an OutlierResult-like dict with:
- mask (final boolean by threshold on aggregate score),
- score (aggregate numeric score)
Aggregation methods:
- 'union' : any detector flagged => outlier (score = max of normalized scores)
- 'intersection' : flagged by all detectors => outlier
- 'majority' : flagged by >50% detectors
- 'weighted' : weighted sum of normalized scores (weights provided or equal)
"""
# collect masks and scores into DataFrames
masks = pd.DataFrame({k: v['mask'].astype(int) for k, v in results.items()})
raw_scores = pd.DataFrame({k: (v['score'] if isinstance(v['score'], pd.Series) else pd.Series(v['score'])) for k, v in results.items()})
raw_scores.index = masks.index
norm_scores = normalize_scores(raw_scores)
    if method == 'union':
        agg_score = norm_scores.max(axis=1)
        mask = masks.any(axis=1)       # flagged by at least one detector
    elif method == 'intersection':
        agg_score = norm_scores.min(axis=1)
        mask = masks.all(axis=1)       # flagged by every detector
    elif method == 'majority':
        agg_score = masks.sum(axis=1) / max(1, masks.shape[1])
        mask = agg_score > 0.5         # flagged by more than half the detectors
    elif method == 'weighted':
        if weights is None:
            weights = {k: 1.0 for k in results.keys()}
        # align weights with the detectors and normalize so they sum to 1
        w = pd.Series({k: weights.get(k, 1.0) for k in results.keys()})
        w = w / w.sum()
        agg_score = (norm_scores * w).sum(axis=1)
        mask = agg_score > 0.5         # default threshold on the weighted score
    else:
        raise ValueError(f"Unknown aggregation method: {method}")
    return create_result(mask, agg_score, f"ensemble_{method}", {"method": method}, "Aggregated ensemble score")
def ensemble_methods(X, method_list: List[str] = None, method_params: Dict = None) -> Dict[str, Dict]:
"""
Convenience: run multiple detectors by name and return dict of results.
method_list: list of names from ['iqr','modified_z','z_score','lof','mahalanobis','isolation_forest', ...]
method_params: optional dict mapping method name to params
"""
    from . import statistical, distance_density, model_based  # deep_learning is imported lazily below (requires TensorFlow)
X = ensure_dataframe(X)
if method_list is None:
method_list = ['iqr', 'modified_z', 'isolation_forest', 'lof']
if method_params is None:
method_params = {}
results = {}
for m in method_list:
params = method_params.get(m, {})
try:
if m == 'iqr':
results[m] = statistical.iqr_method(X, **params)
elif m == 'modified_z':
results[m] = statistical.modified_z_score(X, **params)
elif m == 'z_score':
results[m] = statistical.z_score_method(X, **params)
elif m == 'lof':
results[m] = distance_density.lof_method(X, **params)
elif m == 'mahalanobis':
results[m] = distance_density.mahalanobis_method(X, **params)
elif m == 'dbscan':
results[m] = distance_density.dbscan_method(X, **params)
elif m == 'knn':
results[m] = distance_density.knn_distance_method(X, **params)
elif m == 'isolation_forest':
results[m] = model_based.isolation_forest_method(X, **params)
elif m == 'one_class_svm':
results[m] = model_based.one_class_svm_method(X, **params)
elif m == 'pca':
results[m] = model_based.pca_reconstruction_error(X, **params)
elif m == 'gmm':
results[m] = model_based.gmm_method(X, **params)
elif m == 'elliptic':
results[m] = model_based.elliptic_envelope_method(X, **params)
            elif m == 'autoencoder':
                from . import deep_learning  # lazy import: requires TensorFlow
                results[m] = deep_learning.autoencoder_method(X, **params)
else:
logger.warning("Unknown method requested: %s", m)
except Exception as e:
logger.exception("Method %s failed: %s", m, e)
return results
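# ---------------------------------------------------------------------------
# Illustrative self-check (sketch, with made-up data): run a few detectors and
# combine them with the 'majority' rule.
# Run as `python -m outlier_detection.ensemble` after splitting the files.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    rng = np.random.default_rng(2)
    toy = pd.DataFrame(rng.normal(size=(50, 2)), columns=["x", "y"])
    toy.loc[50] = [7.0, 7.0]  # injected outlier
    res = ensemble_methods(toy, method_list=["iqr", "modified_z", "isolation_forest"])
    agg = aggregate_scores(res, method="majority")
    print("rows flagged by majority vote:", list(toy[agg["mask"]].index))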
# ---------------------------
# File: outlier_detection/visualization.py
# ---------------------------
"""
Simple plotting helpers for quick inspection.
Note: plotting is intentionally minimal; for report-quality figures users can
adapt styles. The functions return the matplotlib Figure object so they can be
further customized.
"""
import pandas as pd
import matplotlib.pyplot as plt
from .utils import ensure_dataframe
def plot_boxplot(series: pd.Series, show: bool = True):
df = ensure_dataframe(series)
col = df.columns[0]
fig, ax = plt.subplots()
ax.boxplot(df[col].dropna())
ax.set_title(f"Boxplot: {col}")
if show:
plt.show()
return fig
def plot_pair_scatter(X, columns: list = None, show: bool = True):
    X = ensure_dataframe(X)
    if columns is not None:
        X = X[columns]
    X = X.select_dtypes(include=["number"])
    cols = X.columns.tolist()[:4]  # avoid huge plots
    if len(cols) < 2:
        raise ValueError("plot_pair_scatter requires at least two numeric columns")
    # squeeze=False keeps a 2D axes array even when there is only one panel
    fig, axes = plt.subplots(len(cols) - 1, len(cols) - 1,
                             figsize=(4 * (len(cols) - 1), 4 * (len(cols) - 1)),
                             squeeze=False)
    for i in range(1, len(cols)):
        for j in range(i):
            ax = axes[i - 1, j]
            ax.scatter(X[cols[j]], X[cols[i]], s=8)
            ax.set_xlabel(cols[j])
            ax.set_ylabel(cols[i])
    fig.suptitle("Pairwise scatter (first 4 numeric cols)")
    if show:
        plt.show()
    return fig
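# ---------------------------------------------------------------------------
# Illustrative self-check (sketch, with made-up data): draws both plots.
# Run as `python -m outlier_detection.visualization` after splitting the files.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    plot_boxplot(pd.Series([1.0, 1.2, 0.9, 1.1, 6.0], name="value"), show=True)
    toy = pd.DataFrame({"x": [1, 2, 3, 4, 50], "y": [2, 1, 4, 3, 60], "z": [0, 1, 0, 1, 9]})
    plot_pair_scatter(toy, show=True)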
# ---------------------------
# File: outlier_detection/cli.py
# ---------------------------
"""
A very small CLI to run detectors on a CSV file and output a CSV report.
Usage (example):
python -m outlier_detection.cli detect input.csv output_report.csv --methods iqr,isolation_forest
"""
import argparse
import pandas as pd
from .ensemble import ensemble_methods, aggregate_scores
def main():
parser = argparse.ArgumentParser(description='Outlier detection CLI')
sub = parser.add_subparsers(dest='cmd')
det = sub.add_parser('detect')
det.add_argument('input_csv')
det.add_argument('output_csv')
det.add_argument('--methods', default='iqr,modified_z,isolation_forest,lof')
    args = parser.parse_args()
    if args.cmd != 'detect':
        parser.print_help()
        return
    df = pd.read_csv(args.input_csv)
    methods = [m.strip() for m in args.methods.split(',') if m.strip()]
    results = ensemble_methods(df, method_list=methods)
    agg = aggregate_scores(results, method='weighted')
    # suffix the columns so per-method flags and scores do not collide
    flags = pd.DataFrame({f"{k}_flag": v['mask'].astype(int) for k, v in results.items()})
    scores = pd.DataFrame({f"{k}_score": v['score'] for k, v in results.items()})
    summary = pd.concat([flags, scores], axis=1)
    summary['ensemble_score'] = agg['score']
    summary['ensemble_flag'] = agg['mask'].astype(int)
    summary.to_csv(args.output_csv, index=True)
print(f"Wrote report to {args.output_csv}")
if __name__ == '__main__':
    main()