简单的基于孤立森林的时间序列数据异常检测（Python）

哥廷根数学学派

于 2024-09-03 09:25:33 发布

阅读量512

点赞数 8

分类专栏：深度学习、机器学习、信号处理　文章标签：python、开发语言、人工智能、机器学习、cnn

本文链接：https://blog.csdn.net/weixin_39402231/article/details/141848792

版权

信号处理同时被 3 个专栏收录

350 篇文章 34 订阅

订阅专栏

机器学习

194 篇文章 1 订阅

订阅专栏

深度学习

155 篇文章 5 订阅

订阅专栏

import matplotlib.pyplot as plt
import pandas as pd
from dataset import load_data, filter_data
from datetime import datetime
from sklearn.ensemble import IsolationForest


def iqr_bounds(scores,k=1.5):
    """Return the interquartile-range fences for a collection of scores.

    The lower fence lies ``k`` interquartile ranges below the first quartile
    and the upper fence ``k`` interquartile ranges above the third quartile.
    Both fences are printed before being returned.

    Args:
        scores: Object exposing ``quantile(q)``; this file passes a pandas
            Series of decision scores.
        k: Multiplier applied to the interquartile range.

    Returns:
        A ``(lower_bound, upper_bound)`` tuple.
    """
    first_quartile = scores.quantile(0.25)
    third_quartile = scores.quantile(0.75)
    spread = third_quartile - first_quartile
    fences = (first_quartile - k * spread, third_quartile + k * spread)
    print("Lower bound:{} \nUpper bound:{}".format(*fences))
    return fences


def view_anomalies(df):
    """Flag anomalies in ``df['value']`` twice and plot each result.

    Pass 1 fits an IsolationForest with a fixed contamination of 0.04 on the
    ``value`` column, stores its decision scores in ``df['scores']`` and its
    labels in ``df['anomaly']`` (0 = inlier, -1 = anomaly), prints the label
    counts, plots the series with anomalies highlighted and shows a histogram
    of the scores.

    Pass 2 replaces the labels using IQR fences (k=2) on those scores
    (1 = anomaly, 0 = normal), plots again and prints the anomaly percentage.

    Args:
        df: DataFrame with ``timestamp`` and ``value`` columns. It is
            modified in place: ``scores`` and ``anomaly`` columns are written.

    Returns:
        The same DataFrame, carrying the IQR-based ``anomaly`` labels.
    """

    def _plot_flagged(flag, title=None):
        # Line plot of the whole series with the rows whose label equals
        # `flag` overlaid as red markers.
        fig, ax = plt.subplots(figsize=(10,6))
        flagged = df.loc[df['anomaly'] == flag, ['timestamp', 'value']]
        ax.plot(df['timestamp'], df['value'], color='blue', label = 'Normal')
        ax.scatter(flagged['timestamp'], flagged['value'], color='red', label = 'Anomaly')
        if title is not None:
            plt.title(title)
        plt.legend()
        plt.show()

    # --- Pass 1: fixed contamination value ---
    forest = IsolationForest(n_estimators=10, max_samples='auto', contamination=float(.04),
                             max_features=1.0, bootstrap=False, n_jobs=-1, random_state=42, verbose=0)
    forest.fit(df[['value']])
    df['scores'] = forest.decision_function(df[['value']])
    df['anomaly'] = forest.predict(df[['value']])

    # predict() labels inliers as 1; remap them to 0 and leave -1 untouched.
    inliers = df['anomaly'] == 1
    df.loc[inliers, 'anomaly'] = 0
    print(df['anomaly'].value_counts())

    _plot_flagged(-1)

    df['scores'].hist()
    plt.show()

    # --- Pass 2: IQR-based ---
    print()
    lower_bound, upper_bound = iqr_bounds(df['scores'], k=2)

    outside_fences = (df['scores'] < lower_bound) | (df['scores'] > upper_bound)
    df['anomaly'] = outside_fences.astype(int)

    _plot_flagged(1, title='IQR-based')

    flagged_count = len(df.loc[df['anomaly'] == 1])
    print("Percentage of anomalies in data: {:.2f}".format((flagged_count / len(df)) * 100))
    return df


def validate_model(full_df):
    """Repeat the anomaly analysis on the 2014-02-17 slice of ``full_df``.

    The rows strictly between 2014-02-17 00:00:00 and 23:59:59 are selected
    and plotted. An IsolationForest with a fixed contamination of 0.04 is then
    fitted on ``value`` (labels: 0 = inlier, -1 = anomaly), and finally the
    labels are recomputed from IQR fences (k=2) on the decision scores
    (labels: 1 = anomaly, 0 = normal). Each pass plots the series with the
    anomalies highlighted and prints the percentage of anomalies.

    Args:
        full_df: DataFrame with ``timestamp`` and ``value`` columns.
            ``timestamp`` may hold datetimes or anything ``pd.to_datetime``
            accepts. The frame is not modified.

    Returns:
        None. Results are shown as plots and printed text.
    """
    start_date = datetime.strptime('14-02-17 00:00:00', '%y-%m-%d %H:%M:%S')
    end_date = datetime.strptime('14-02-17 23:59:59', '%y-%m-%d %H:%M:%S')

    # Parse timestamps locally so the comparison below does not depend on some
    # earlier call having converted the column already.
    timestamps = pd.to_datetime(full_df['timestamp'])
    in_window = (timestamps > start_date) & (timestamps < end_date)

    # Work on an independent copy: columns are added below, and assigning onto
    # a slice of the caller's frame triggers pandas' chained-assignment
    # warning and has ambiguous write-through semantics.
    df = full_df.loc[in_window].copy()
    df['timestamp'] = timestamps[in_window]

    df.plot(x='timestamp', y='value', figsize=(12,6))
    plt.xlabel('Date time')
    plt.ylabel('CPU Utilization')
    plt.title('Distribution of Validation Data')
    plt.show()

    # --- Fixed contamination value ---
    clf = IsolationForest(n_estimators=10, max_samples='auto', contamination=float(.04),
                          max_features=1.0, bootstrap=False, n_jobs=-1, random_state=42, verbose=0)
    clf.fit(df[['value']])
    df['scores'] = clf.decision_function(df[['value']])
    df['anomaly'] = clf.predict(df[['value']])
    # predict() labels inliers as 1; remap them to 0 and leave anomalies at -1.
    df.loc[df['anomaly'] == 1, 'anomaly'] = 0

    fig, ax = plt.subplots(figsize=(10,6))
    a = df.loc[df['anomaly'] == -1, ['timestamp', 'value']]  # anomalies
    ax.plot(df['timestamp'], df['value'], color='blue', label = 'Normal')
    ax.scatter(a['timestamp'], a['value'], color='red', label = 'Anomaly')
    plt.legend()
    plt.show()

    # Anomalies carry the label -1 at this point; counting label 1 here would
    # always report 0.00.
    print("Percentage of anomalies in data: {:.2f}".format((len(df.loc[df['anomaly']==-1])/len(df))*100))
    df['scores'].hist()
    plt.show()

    # --- IQR-based ---
    lower_bound, upper_bound = iqr_bounds(df['scores'], k=2)
    outside_fences = (df['scores'] < lower_bound) | (df['scores'] > upper_bound)
    df['anomaly'] = outside_fences.astype(int)

    fig, ax = plt.subplots(figsize=(10,6))
    a = df.loc[df['anomaly'] == 1, ['timestamp', 'value']]  # anomalies
    ax.plot(df['timestamp'], df['value'], color='blue', label = 'Normal')
    ax.scatter(a['timestamp'], a['value'], color='red', label = 'Anomaly')
    plt.title('IQR-based')
    plt.legend()
    plt.show()
    print("Percentage of anomalies in data: {:.2f}".format((len(df.loc[df['anomaly']==1])/len(df))*100))
    
# Script entry: load the raw series, analyse the filtered day, then validate.
# filter_data also converts full_df's timestamp column to datetimes in place,
# so it runs before validate_model, which compares that column to datetimes.
full_df = load_data()
df = view_anomalies(filter_data(full_df))
validate_model(full_df)

import os
import pandas as pd
from datetime import datetime


def load_data():
    """Read the EC2 CPU-utilization CSV that ships next to this module.

    The file is looked up at ``data/ec2_cpu_utilization_5f5533.csv`` relative
    to the directory containing this source file (symlinks resolved), so the
    result does not depend on the current working directory.

    Returns:
        The CSV contents as a pandas DataFrame.
    """
    module_dir = os.path.dirname(os.path.realpath(__file__))
    # os.path.join picks the platform's separator instead of hard-coding '/'.
    csv_path = os.path.join(module_dir, 'data', 'ec2_cpu_utilization_5f5533.csv')
    return pd.read_csv(csv_path)


def filter_data(df):
    """Return the rows of ``df`` recorded on 2014-02-24.

    Rows are kept when their timestamp lies strictly between
    2014-02-24 00:00:00 and 2014-02-24 23:59:59.

    Side effects on the input frame: the original ``timestamp`` values are
    preserved in a new ``timestamp_str`` column and ``timestamp`` is replaced
    by its ``pd.to_datetime`` conversion. This in-place conversion is kept
    deliberately, since callers may read the converted column afterwards.

    Args:
        df: DataFrame with a ``timestamp`` column parseable by
            ``pd.to_datetime``.

    Returns:
        A new, independent DataFrame holding only the selected rows (original
        index labels retained), safe to add columns to without touching ``df``.
    """
    start_date = datetime.strptime('14-02-24 00:00:00', '%y-%m-%d %H:%M:%S')
    end_date = datetime.strptime('14-02-24 23:59:59', '%y-%m-%d %H:%M:%S')
    df['timestamp_str'] = df['timestamp']
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    in_window = (df['timestamp'] > start_date) & (df['timestamp'] < end_date)
    # Copy so that later column assignments on the result neither warn about
    # chained assignment nor depend on pandas' view/copy behaviour.
    return df.loc[in_window].copy()

知乎学术咨询：

https://www.zhihu.com/consult/people/792359672131756032?isMe=1

担任《Mechanical Systems and Signal Processing》等期刊审稿专家，擅长领域：信号滤波/降噪，机器学习/深度学习，时间序列分析/预测，设备故障诊断/缺陷检测/异常检测。

分割线

基于小波分析的Linear电磁谱降噪(Python)

完整代码：mbd.pub/o/bread/ZZ2alZdv

哥廷根数学学派

关注

8
点赞
踩
0

收藏

觉得还不错? 一键收藏
打赏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

专栏目录