简单的基于孤立森林的时间序列数据异常检测(Python)

194 篇文章 1 订阅
155 篇文章 5 订阅
import matplotlib.pyplot as plt
import pandas as pd
from dataset import load_data, filter_data
from datetime import datetime
from sklearn.ensemble import IsolationForest


def iqr_bounds(scores,k=1.5):
    """Return (and print) the IQR outlier fences for ``scores``.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where ``Q1`` and
    ``Q3`` are the 25th and 75th percentiles of ``scores`` and
    ``IQR = Q3 - Q1``.

    Args:
        scores: Object exposing ``quantile(q)`` (the callers in this file
            pass a pandas Series of decision scores).
        k: Multiplier applied to the interquartile range.

    Returns:
        Tuple ``(lower_bound, upper_bound)``.
    """
    first_quartile, third_quartile = (scores.quantile(q) for q in (0.25, 0.75))
    margin = k * (third_quartile - first_quartile)
    fences = (first_quartile - margin, third_quartile + margin)
    print("Lower bound:{} \nUpper bound:{}".format(*fences))
    return fences


def view_anomalies(df, *, contamination=0.04, k=2):
    """Detect anomalies in ``df['value']`` with an Isolation Forest and plot them.

    Two labelings are computed and plotted one after the other:

    1. The model's own labels at a fixed contamination rate. Inliers are
       relabelled from 1 to 0, so anomalies are -1 and normal points are 0.
    2. An IQR rule on the decision scores: a point is an anomaly (1) when its
       score falls outside the fences returned by ``iqr_bounds``, else 0.
       This second labeling is what remains in ``df['anomaly']`` on return.

    ``df`` is modified in place: the columns ``scores`` and ``anomaly`` are
    added (or overwritten). The function also opens three matplotlib windows
    and prints the label counts, the IQR fences and the anomaly percentage.

    Args:
        df: DataFrame with ``timestamp`` and ``value`` columns.
        contamination: Expected share of outliers passed to
            ``IsolationForest``. Defaults to the original hard-coded 0.04.
        k: IQR multiplier passed to ``iqr_bounds``. Defaults to the original
            hard-coded 2.

    Returns:
        The same ``df`` object, with ``scores`` and the IQR-based ``anomaly``
        column (1 = anomaly, 0 = normal).
    """
    features = df[['value']]

    # Fixed contamination value
    clf = IsolationForest(n_estimators=10, max_samples='auto', contamination=contamination,
                          max_features=1.0, bootstrap=False, n_jobs=-1, random_state=42, verbose=0)
    clf.fit(features)
    df['scores'] = clf.decision_function(features)
    df['anomaly'] = clf.predict(features)
    # Only inliers need relabelling (1 -> 0); outliers already carry -1.
    df.loc[df['anomaly'] == 1, 'anomaly'] = 0
    print(df['anomaly'].value_counts())

    # Visualization of the fixed-contamination labels (anomaly == -1).
    fig, ax = plt.subplots(figsize=(10, 6))
    flagged = df.loc[df['anomaly'] == -1, ['timestamp', 'value']]
    ax.plot(df['timestamp'], df['value'], color='blue', label='Normal')
    ax.scatter(flagged['timestamp'], flagged['value'], color='red', label='Anomaly')
    plt.legend()
    plt.show()

    # Distribution of the decision scores that the IQR rule thresholds below.
    df['scores'].hist()
    plt.show()

    # IQR-based labels: replaces the -1/0 labels above with 1/0.
    print()
    lower_bound, upper_bound = iqr_bounds(df['scores'], k=k)
    outside_fences = (df['scores'] < lower_bound) | (df['scores'] > upper_bound)
    df['anomaly'] = outside_fences.astype(int)

    fig, ax = plt.subplots(figsize=(10, 6))
    flagged = df.loc[df['anomaly'] == 1, ['timestamp', 'value']]
    ax.plot(df['timestamp'], df['value'], color='blue', label='Normal')
    ax.scatter(flagged['timestamp'], flagged['value'], color='red', label='Anomaly')
    plt.title('IQR-based')
    plt.legend()
    plt.show()

    print("Percentage of anomalies in data: {:.2f}".format((len(df.loc[df['anomaly']==1])/len(df))*100))
    return df


def validate_model(full_df, *, contamination=0.04, k=2):
    """Repeat the anomaly analysis on the 2014-02-17 slice of ``full_df``.

    Rows whose timestamp lies strictly between 2014-02-17 00:00:00 and
    2014-02-17 23:59:59 are selected, plotted, scored with a freshly fitted
    Isolation Forest and labelled twice: first with the model's own labels
    (anomaly = -1, normal = 0), then with an IQR rule on the decision scores
    (anomaly = 1, normal = 0). Each labeling is plotted and its anomaly
    percentage printed.

    The work is done on a copy, so ``full_df`` is not modified. The
    ``timestamp`` column is converted with ``pd.to_datetime`` on that copy,
    so the function does not depend on an earlier call having converted the
    caller's frame.

    Args:
        full_df: DataFrame with ``timestamp`` and ``value`` columns.
        contamination: Expected share of outliers passed to
            ``IsolationForest``. Defaults to the original hard-coded 0.04.
        k: IQR multiplier passed to ``iqr_bounds``. Defaults to the original
            hard-coded 2.

    Raises:
        ValueError: If no row of ``full_df`` falls inside the validation
            window.
    """
    start_date = datetime.strptime('14-02-17 00:00:00', '%y-%m-%d %H:%M:%S')
    end_date = datetime.strptime('14-02-17 23:59:59', '%y-%m-%d %H:%M:%S')

    timestamps = pd.to_datetime(full_df['timestamp'])
    in_window = (timestamps > start_date) & (timestamps < end_date)
    # Explicit copy: columns are added below, and writing into a .loc slice
    # of the caller's frame triggers pandas' SettingWithCopyWarning.
    df = full_df.assign(timestamp=timestamps).loc[in_window].copy()
    if df.empty:
        raise ValueError('no rows between {} and {}'.format(start_date, end_date))

    # Plot the raw validation data with pandas' matplotlib backend.
    df.plot(x='timestamp', y='value', figsize=(12, 6))
    plt.xlabel('Date time')
    plt.ylabel('CPU Utilization')
    plt.title('Distribution of Validation Data')
    plt.show()

    # Fixed contamination value
    features = df[['value']]
    clf = IsolationForest(n_estimators=10, max_samples='auto', contamination=contamination,
                          max_features=1.0, bootstrap=False, n_jobs=-1, random_state=42, verbose=0)
    clf.fit(features)
    df['scores'] = clf.decision_function(features)
    df['anomaly'] = clf.predict(features)
    # Only inliers need relabelling (1 -> 0); outliers already carry -1.
    df.loc[df['anomaly'] == 1, 'anomaly'] = 0

    fig, ax = plt.subplots(figsize=(10, 6))
    flagged = df.loc[df['anomaly'] == -1, ['timestamp', 'value']]
    ax.plot(df['timestamp'], df['value'], color='blue', label='Normal')
    ax.scatter(flagged['timestamp'], flagged['value'], color='red', label='Anomaly')
    plt.legend()
    plt.show()

    # At this point anomalies are labelled -1 (not 1), so count the -1 rows;
    # comparing against 1 here would always report 0.00.
    print("Percentage of anomalies in data: {:.2f}".format((len(df.loc[df['anomaly']==-1])/len(df))*100))
    df['scores'].hist()
    plt.show()

    # IQR-based labels: replaces the -1/0 labels above with 1/0.
    lower_bound, upper_bound = iqr_bounds(df['scores'], k=k)
    outside_fences = (df['scores'] < lower_bound) | (df['scores'] > upper_bound)
    df['anomaly'] = outside_fences.astype(int)

    fig, ax = plt.subplots(figsize=(10, 6))
    flagged = df.loc[df['anomaly'] == 1, ['timestamp', 'value']]
    ax.plot(df['timestamp'], df['value'], color='blue', label='Normal')
    ax.scatter(flagged['timestamp'], flagged['value'], color='red', label='Anomaly')
    plt.title('IQR-based')
    plt.legend()
    plt.show()
    print("Percentage of anomalies in data: {:.2f}".format((len(df.loc[df['anomaly']==1])/len(df))*100))
    
# Driver: load the full CSV, analyse the filtered day, then validate on the
# full frame (filter_data has converted its timestamp column by then).
full_df = load_data()
df = view_anomalies(filter_data(full_df))
validate_model(full_df)

import os
import pandas as pd
from datetime import datetime


def load_data():
    """Read ``data/ec2_cpu_utilization_5f5533.csv`` next to this file.

    The path is resolved from the real location of this source file, not
    from the current working directory.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    module_path = os.path.realpath(__file__)
    csv_path = os.path.dirname(module_path) + '/data/ec2_cpu_utilization_5f5533.csv'
    return pd.read_csv(csv_path)


def filter_data(df):
    """Return the rows of ``df`` recorded on 2014-02-24 as an independent copy.

    Side effects on the caller's ``df`` (kept on purpose, because
    ``validate_model`` is later called on the same frame and compares its
    ``timestamp`` column with datetimes):

    * ``timestamp_str`` is added, holding the original ``timestamp`` values;
    * ``timestamp`` is converted in place with ``pd.to_datetime``.

    Both window bounds are exclusive: a row stamped exactly 00:00:00 or
    23:59:59 is dropped.

    Args:
        df: DataFrame with a ``timestamp`` column parseable by
            ``pd.to_datetime``.

    Returns:
        A new DataFrame with the rows strictly inside the window. It is an
        explicit copy, so callers can add columns to it without triggering
        pandas' SettingWithCopyWarning and without touching ``df``.
    """
    start_date = datetime.strptime('14-02-24 00:00:00', '%y-%m-%d %H:%M:%S')
    end_date = datetime.strptime('14-02-24 23:59:59', '%y-%m-%d %H:%M:%S')
    df['timestamp_str'] = df['timestamp']
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    in_window = (df['timestamp'] > start_date) & (df['timestamp'] < end_date)
    return df.loc[in_window].copy()



知乎学术咨询:

https://www.zhihu.com/consult/people/792359672131756032?isMe=1

担任《Mechanical Systems and Signal Processing》等期刊审稿专家,擅长领域:信号滤波/降噪,机器学习/深度学习,时间序列预分析/预测,设备故障诊断/缺陷检测/异常检测。

分割线

基于小波分析的Linear电磁谱降噪(Python)

完整代码:mbd.pub/o/bread/ZZ2alZdv

好的,以下是一个基于随机森林的时间序列异常检测Python代码示例,希望能对您有所帮助:

```python
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# 数据预处理
data = pd.read_csv('data.csv', header=0)
data['timestamp'] = pd.to_datetime(data['timestamp'])
data = data.set_index('timestamp')
data = data.resample('H').mean().interpolate()

# 特征工程
data['lag1'] = data['value'].shift(1)
data['lag2'] = data['value'].shift(2)
data['lag3'] = data['value'].shift(3)
data['rolling_mean'] = data['value'].rolling(window=3).mean()
data['rolling_std'] = data['value'].rolling(window=3).std()

# 将数据集分为训练集和测试集
train_size = int(len(data) * 0.7)
train, test = data[0:train_size], data[train_size:len(data)]

# 定义随机森林模型
model = RandomForestRegressor(n_estimators=1000, random_state=42)

# 拟合模型并进行预测
model.fit(train[['lag1', 'lag2', 'lag3', 'rolling_mean', 'rolling_std']], train['value'])
predictions = model.predict(test[['lag1', 'lag2', 'lag3', 'rolling_mean', 'rolling_std']])

# 计算均方误差
mse = mean_squared_error(test['value'], predictions)
print('Mean Squared Error: ', mse)

# 检测异常值
threshold = np.mean(mse) + 3 * np.std(mse)
anomalies = []
for i in range(len(predictions)):
    if predictions[i] > threshold:
        anomalies.append(i)
print('Anomalies: ', anomalies)
```

这个示例中,我们首先进行了数据预处理,然后进行了特征工程。接下来,我们将数据集分为训练集和测试集,定义了一个随机森林模型并进行了拟合和预测。最后,我们计算了均方误差并根据阈值检测了异常值。

请注意,这只是一个简单的示例代码,实际应用可能需要更多的特征工程和模型调整。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

哥廷根数学学派

码字不易,且行且珍惜

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值