一、孤立森林(Isolation Forest)算法
算法见C:\Program Files\Anaconda3\Lib\site-packages\sklearn\ensemble\iforest.py:
#n_estimators : int, optional (default=100) 森林中树的棵数
#max_samples : int or float, optional (default="auto") 每棵树的样本个数或比例
#contamination : float in (0., 0.5), optional (default=0.1) 关键参数,用户设置样本中异常点的比例(构造函数签名中的默认值"legacy"等价于0.1)
#max_features : int or float, optional (default=1.0) 每棵树的特征个数或比例
class IsolationForest(BaseBagging, OutlierMixin):
def __init__(self, n_estimators=100, max_samples="auto", contamination="legacy",
max_features=1., bootstrap=False, n_jobs=None, behaviour='old',
random_state=None, verbose=0):
def _set_oob_score(self, X, y):
#无监督拟合
def fit(self, X, y=None, sample_weight=None):
#返回值:+1 表示正常样本,-1表示异常样本。
def predict(self, X):
#返回样本的异常评分。值越小表示越有可能是异常样本。
def decision_function(self, X):
def score_samples(self, X):
示例:
from sklearn.ensemble import IsolationForest
#X为样本集
algorithm = IsolationForest(contamination=0.05, random_state=42)
y_pred = algorithm.fit(X).predict(X)
二、DBSCAN密度聚类
算法见C:\Program Files\Anaconda3\Lib\site-packages\sklearn\cluster\dbscan.py:
def dbscan(X, eps=0.5, min_samples=5, metric='minkowski', metric_params=None,
algorithm='auto', leaf_size=30, p=2, sample_weight=None,
n_jobs=None):
class DBSCAN(BaseEstimator, ClusterMixin):
def __init__(self, eps=0.5, min_samples=5, metric='euclidean',
metric_params=None, algorithm='auto', leaf_size=30, p=None,
n_jobs=None):
#拟合
def fit(self, X, y=None, sample_weight=None):
...
clust = dbscan(X, sample_weight=sample_weight,**self.get_params())
#拟合并返回标签(每个样本所属簇的编号;噪声样本即异常点的标签为-1)
def fit_predict(self, X, y=None, sample_weight=None):
self.fit(X, sample_weight=sample_weight)
return self.labels_
示例:
from sklearn.cluster import DBSCAN
#X为样本集
clustering = DBSCAN(eps=0.3, min_samples=2)
y_pred = clustering.fit_predict(X)