Anomaly Detection on Data with the Isolation Forest (IsolationForest) Algorithm


1. Summary

This article explains how to use the Isolation Forest (IsolationForest) algorithm to detect anomalies in data.
The main steps are (a minimal sketch of steps 1-4 on synthetic data follows this list):

  1. Process the data into the two-dimensional form the algorithm can work with
  2. Apply the Isolation Forest (IsolationForest) algorithm to label the data into two classes: anomalous and normal
  3. Build a grid of coordinate points (meshgrid)
  4. Fill the contour plot
  5. Compare several anomaly detection algorithms
  6. Implement 3D clustering
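Before touching the real dataset, here is a minimal, self-contained sketch of steps 1-4 on synthetic two-dimensional data (the data below is invented purely for illustration and is not the dataset used later):

import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)
# step 1: two-dimensional data the algorithm can consume (200 clustered points + 10 scattered points)
X = np.r_[rng.normal(0, 1, (200, 2)), rng.uniform(-6, 6, (10, 2))]

# step 2: fit the model and label every point as normal (1) or anomalous (-1)
clf = IsolationForest(contamination=0.05, random_state=rng)
labels = clf.fit(X).predict(X)

# step 3: grid of coordinate points covering the feature space
xx, yy = np.meshgrid(np.linspace(-7, 7, 100), np.linspace(-7, 7, 100))

# step 4: filled contours of the anomaly score over the grid, plus the labelled points
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
plt.scatter(X[labels == 1, 0], X[labels == 1, 1], c='green', s=20, edgecolor='k', label='normal')
plt.scatter(X[labels == -1, 0], X[labels == -1, 1], c='red', s=20, edgecolor='k', label='abnormal')
plt.legend(loc='upper left')
plt.show()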

2. Data Description

The column names are as follows:

3. Related Techniques

In an isolation forest (iForest), an anomaly is defined as a point that is "more likely to be separated": a point lying in a sparse region, far away from any high-density cluster. In feature space, a sparsely populated region means events rarely occur there, so data falling into such regions can be considered anomalous. Isolation Forest is an unsupervised anomaly detection method for continuous numerical data: it needs no labelled samples for training, but the features must be continuous. To find points that are easy to isolate, iForest uses a very efficient strategy: it recursively and randomly partitions the dataset until every sample is isolated. Under this random-partitioning strategy, anomalous points tend to end up with much shorter paths.

Intuitively, points inside a dense cluster need many splits before they are isolated, whereas points in low-density regions can be isolated after only a few splits.
Source: the passage above is adapted from the CSDN post 孤立森林(IsolationForest)算法 by 「extremebingo」, licensed under CC 4.0 BY-SA.
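A quick way to see this intuition numerically: scikit-learn's IsolationForest exposes score_samples (and decision_function), and points that are isolated after only a few splits receive noticeably lower scores. A small check on made-up data:

import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(0)
cluster = rng.normal(0, 0.5, (300, 2))   # a dense cluster around the origin
lone_point = np.array([[6.0, 6.0]])      # a single point far away from the cluster

clf = IsolationForest(random_state=0).fit(cluster)
# score_samples is higher (closer to 0) for points that are hard to isolate,
# and clearly lower for the far-away point, whose average path length is short
print(clf.score_samples(cluster).mean())
print(clf.score_samples(lone_point))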

4. Complete Code and Steps

The code output is one contour plot per width group (original screenshot omitted). The full script is below:

# _*_coding:utf-8_*_
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import IsolationForest
import pandas as pd


src1 = r'D:\data\\'
src = r'D:\data\\异常检测\\'
params_adapter_6 = pd.read_csv(src1 + 'params_adapter_6.csv', usecols=[0, 2, 7, 8, 9, 10, 11])
params_adapter_6 = params_adapter_6.set_index('日期')
params_adapter_6.dropna(inplace=True)
groups = params_adapter_6.groupby('宽')  # group key is a single scalar (the width value)
rng = np.random.RandomState(42)

# the two feature columns used for fitting and plotting; the second name ('OA') is
# only assumed from the y-axis label of the plot -- replace it with the real column name
feature_cols = ['车速', 'OA']


def Generate_train_data(name, group):
    float_data = group.copy()  # copy so the labels below do not modify the groupby slice
    # fit the model on the two plotted feature columns
    clf = IsolationForest(max_samples=100,
                          random_state=rng, contamination=0.01)
    clf.fit(float_data[feature_cols].values)
    y_pred = clf.predict(float_data[feature_cols].values)
    float_data.loc[:, 'category'] = y_pred.tolist()
    inlier = float_data[float_data['category'] == 1][feature_cols].values
    outliers = float_data[float_data['category'] == -1][feature_cols].values
    # build the grid of coordinate points
    xx, yy = np.meshgrid(np.linspace(51, 187, 10), np.linspace(0, 10, 10))
    x1 = xx.ravel()
    # np.c_ concatenates arrays column-wise; they must have the same number of rows
    pre_z = np.c_[x1, yy.ravel()]
    Z = clf.decision_function(pre_z)
    Z = Z.reshape(xx.shape)
    title = str(name) + '-Width'  # the group key is the 宽 (width) value
    plt.title(title)
    # contourf() fills the contour regions with the anomaly score
    plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)
    b1 = plt.scatter(inlier[:, 0], inlier[:, 1], c='green',
                     s=20, edgecolor='k')
    c = plt.scatter(outliers[:, 0], outliers[:, 1], c='red',
                    s=20, edgecolor='k')
    plt.axis('tight')
    plt.xlim((51, 187))
    plt.ylim((0, 10))
    plt.xlabel('Speed')  # x and y axis labels
    plt.ylabel('OA')
    plt.legend([b1, c],
               ["normal", "abnormal"],
               loc="upper left")

    plt.show()


for name, group in groups:
    print(name)
    shape = group.shape[0]
    print(shape)
    if shape > 110:
        Generate_train_data(name, group)
    else:
        print(0)

3D clustering. The result is not very clear with matplotlib; if you need a more intuitive plot, draw it with pyecharts instead (a minimal pyecharts sketch follows the script below).

# _*_coding:utf-8_*_
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from mpl_toolkits.mplot3d import axes3d
from sklearn.ensemble import IsolationForest
import pandas as pd


src1 = r'D:\data\\'

params_adapter_6 = pd.read_csv(src1 + 'params_adapter_6.csv', usecols=[0, 2, 7, 8, 9, 10, 11])
params_adapter_6 = params_adapter_6.set_index('日期')
params_adapter_6.dropna(inplace=True)
groups = params_adapter_6.groupby('宽')  # group key is a single scalar (the width value)
rng = np.random.RandomState(42)

# three feature columns for the 3D view; only '车速' appears in the original snippet --
# the other two names are placeholders and must be replaced with real columns from the CSV
feature_cols_3d = ['车速', 'OA', 'OA']


def Generate_train_data(name, group):
    float_data = group.copy()  # copy so the labels below do not modify the groupby slice
    # fit the model on the three plotted feature columns
    clf = IsolationForest(max_samples=100,
                          random_state=rng, contamination=0.05)
    clf.fit(float_data[feature_cols_3d].values)
    y_pred = clf.predict(float_data[feature_cols_3d].values)
    float_data.loc[:, 'category'] = y_pred.tolist()
    inlier = float_data[float_data['category'] == 1][feature_cols_3d].values
    outliers = float_data[float_data['category'] == -1][feature_cols_3d].values
    # build the grid of coordinate points
    xx, yy = np.meshgrid(np.linspace(51, 187, 10), np.linspace(0, 12, 10))
    x1 = xx.ravel()
    # np.c_ concatenates arrays column-wise; the third grid coordinate simply
    # repeats the second, mirroring the duplicated placeholder column above
    pre_z = np.c_[x1, yy.ravel(), yy.ravel()]
    Z = clf.decision_function(pre_z)
    Z = Z.reshape(xx.shape)
    title = str(name)  # the group key is the 宽 (width) value
    fig = plt.figure()
    ax = fig.add_subplot(projection='3d')  # fig.gca(projection='3d') was removed in matplotlib 3.6
    b1 = ax.scatter(inlier[:, 0], inlier[:, 1], inlier[:, 2], c='green',
                    edgecolor='face')
    c = ax.scatter(outliers[:, 0], outliers[:, 1], outliers[:, 2], c='red',
                   edgecolor='face')
    ax.plot_surface(xx, yy, Z, rstride=8, cstride=8, alpha=0.3)
    # project filled contours of the anomaly score onto the bounding planes
    ax.contourf(xx, yy, Z, zdir='x', offset=187)
    ax.contourf(xx, yy, Z, zdir='y', offset=12)
    ax.contourf(xx, yy, Z, zdir='z', offset=12)

    ax.set_xlabel('X')
    ax.set_xlim(51, 187)
    ax.set_ylabel('Y')
    ax.set_ylim(0, 12)
    ax.set_zlabel('Z')
    ax.set_zlim(0, 12)

    plt.title(title)
    plt.axis('tight')
    plt.legend([b1, c],
               ["inliers", "outliers"],
               loc="upper left")
    plt.show()


for name, group in groups:
    print(name)
    shape = group.shape[0]
    print(shape)
    if shape > 110:
        Generate_train_data(name, group)
    else:
        print(0)
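As mentioned above, the static matplotlib surface is hard to read; pyecharts renders an interactive 3D scatter in the browser instead. A minimal sketch, assuming pyecharts (v1 or later) is installed; the dummy points below stand in for the inlier/outlier arrays built above:

from pyecharts import options as opts
from pyecharts.charts import Scatter3D

# each entry is an [x, y, z] triple; replace the dummy values with the real samples
points = [[60, 3, 2], [120, 5, 4], [180, 11, 10]]

chart = Scatter3D()
chart.add("samples", points)
chart.set_global_opts(title_opts=opts.TitleOpts(title="3D anomaly view"))
chart.render("scatter3d.html")  # writes an interactive HTML file to open in a browser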

Use Isolation Forest, One-Class SVM, robust covariance estimation (EllipticEnvelope), and Local Outlier Factor (LOF) to detect anomalies, and compare the time each one takes and its results:

import time

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
import pandas as pd
matplotlib.rcParams['contour.negative_linestyle'] = 'solid'
src1 = r'D:\data\xiexu\\'

params_adapter_6 = pd.read_csv(src1 + 'params_adapter_6.csv', usecols=[0, 2, 7, 8, 9, 10, 11])
params_adapter_6 = params_adapter_6.set_index('日期')
params_adapter_6.dropna(inplace=True)  # the estimators below cannot handle NaN values
groups = params_adapter_6.groupby('宽')
rng = np.random.RandomState(42)

# the two feature columns used for fitting and plotting; the second name ('OA') is
# only an assumption carried over from the first script -- replace with the real column name
feature_cols = ['车速', 'OA']

# Example settings
n_samples = 300
outliers_fraction = 0.15
n_outliers = int(outliers_fraction * n_samples)
n_inliers = n_samples - n_outliers

# define outlier/ anomaly detection methods to be compared
anomaly_algorithms = [
    ("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)),
    ("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel='rbf', gamma=0.1)),
    ("Isolation Forest", IsolationForest(contamination=outliers_fraction, random_state=42)),
    ("Local Outlier Factor", LocalOutlierFactor(n_neighbors=35, contamination=outliers_fraction))
]


def Generate_train_data(group):
    # define datasets: keep only the two plotted feature columns so that the
    # 2-D grid passed to predict() below matches the number of fitted features
    float_data = group[feature_cols].values
    X_train = float_data[:300]
    # X_test = float_data[300:]
    datasets = [X_train]

    xx, yy = np.meshgrid(np.linspace(51, 187, 150), np.linspace(0, 12, 150))

    plt.figure(figsize=(len(anomaly_algorithms) * 2 + 3, 12.5))
    plt.subplots_adjust(left=0.02, right=0.98, bottom=0.001, top=0.96, wspace=0.05, hspace=0.01)

    # rng = np.random.RandomState(42)
    plot_num = 1
    for i_dataset, X in enumerate(datasets):
        # add outliers
        # outliers = rng.uniform(low=-6, high=6, size=(n_outliers, 2))
        # X = np.concatenate([X, outliers], axis=0)
        for name, algorithm in anomaly_algorithms:
            print(name, algorithm)
            t0 = time.time()
            algorithm.fit(X)
            t1 = time.time()
            plt.subplot(2, 2, plot_num)
            if i_dataset == 0:
                plt.title(name, size=18)

            # fit the data and tag outliers
            if name == 'Local Outlier Factor':
                y_pred = algorithm.fit_predict(X)
            else:
                y_pred = algorithm.fit(X).predict(X)

            # plot the levels lines and the points
            if name != "Local Outlier Factor":
                Z = algorithm.predict(np.c_[xx.ravel(), yy.ravel()])
                Z = Z.reshape(xx.shape)
                plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='black')

            colors = np.array(["#377eb8", '#ff7f00'])
            plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[(y_pred + 1) // 2])

            plt.xlim(51, 187)
            plt.ylim(0, 12)
            plt.xticks(())
            plt.yticks(())
            plt.text(0.99, 0.01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                     transform=plt.gca().transAxes, size=15,
                     horizontalalignment='right')
            plot_num += 1

    plt.show()

for name, group in groups:
    print(name)
    if group.shape[0] > 300:
        Generate_train_data(group)
    else:
        print(0)

5. Further Reading

Comparison of four scikit-learn anomaly detection algorithms and how to implement outlier detection
