Multivariate Time Series Anomaly Detection with DBSCAN (Python)

import pandas as pd
import numpy as np
import mylibrary as lib  # author's custom helper package (upload, scaling, anomaly injection/plotting)
import plotly.express as px
import plotly.graph_objs as go
import plotly.io as pio
import random
import copy
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
df, filename = lib.preprocess.upload.upload_csv()
df['DateTime'] = pd.to_datetime(df['DateTime'])
# df = df.drop('Hid_Pomp_Motor_Rulman', axis = 1)
df = df.dropna()

# keep a single day of data
dt1 = pd.to_datetime('2023-07-14 00:00:04')
dt2 = pd.to_datetime('2023-07-14 23:59:04')
df = df[(df['DateTime'] >= dt1) & (df['DateTime'] <= dt2)]

df1 = df.copy()
df1 = lib.preprocess.scale.apply_min_max_scale(df1)  # scale each sensor to [0, 1]

# downsample to a 2-second grid by averaging
df_resampled = df1.resample('2s', on='DateTime').mean().reset_index()
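mylibrary is not a public package; for readers without it, here is a minimal hypothetical stand-in for the scaling step, assuming apply_min_max_scale simply rescales every non-DateTime column to [0, 1]:

def apply_min_max_scale_standin(df):
    # hypothetical replacement for lib.preprocess.scale.apply_min_max_scale
    out = df.copy()
    for c in out.columns:
        if c != 'DateTime':
            out[c] = (out[c] - out[c].min()) / (out[c].max() - out[c].min())
    return out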
# quantize a value (assumed in [0, 1]) to the lower edge of its 1/k-wide bin
def smoothing(x, k):
    width = 1 / k
    for i in range(k):
        min_val = i * width
        max_val = (i + 1) * width
        if min_val <= x <= max_val:
            return min_val
    return np.nan  # out-of-range values become NaN and are removed by the later dropna()


def apply_smoothing_to_dataframe(df, k):
    result_df = df.copy()
    for column in result_df.columns:
        if column != 'DateTime':
            result_df[f'{column}_binned'] = result_df[column].apply(lambda x: smoothing(x, k))
            del result_df[column]
    return result_df
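A note on the helper above: despite its name, smoothing quantizes rather than smooths. For large frames, a vectorized sketch of the same mapping (floor to the lower bin edge; it agrees with the loop version except exactly on interior bin boundaries):

def bin_to_lower_edge(series, k):
    # floor(x * k) / k, capped so x == 1.0 falls in the last bin
    return np.floor(series * k).clip(upper=k - 1) / k

# e.g. df_resampled['Hid_Pomp_Balans_binned'] = bin_to_lower_edge(df_resampled['Hid_Pomp_Balans'], 200)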
k = 200  # number of quantization bins
df_resampled = apply_smoothing_to_dataframe(df_resampled, k)
sensors = df_resampled.columns[1:]
sensors
Index(['Hid_Pomp_Balans_binned', 'Hid_Pomp_Motor_Rulman_binned',
       'Hid_emn_bas_act_binned'],
      dtype='object')
df1 = df_resampled.copy()
df1 = df1.dropna()

Functions

# anomaly_set: set of all injected anomaly indices (flattened, 1d)
# outliers_set: set of indices predicted as anomalous (1d)
# anomaly_points: list of waves, each wave a list of consecutive indices
# returns the set of waves that have at least one point within `threshold` of a prediction
def intersection_multi(anomaly_set, outliers_set, anomaly_points, threshold):
    union_set = set()
    anomaly_points = {tuple(arr) for arr in anomaly_points}
    for elem1 in anomaly_set:
        for arr in anomaly_points:
            if any(abs(elem1 - elem2) < threshold for elem2 in outliers_set) and elem1 in arr:
                union_set.add(arr)
                break
    return union_set
# anomaly_points: list of injected waves (a [[],[],[]] matrix)
# outliers: list of predicted anomaly indices
def get_metrics_multi(anomaly_points, outliers, threshold):
    flat_anomaly_points = [point for arr in anomaly_points for point in arr]
    anomaly_set = set(flat_anomaly_points)
    outliers_set = set(outliers)

    intersection = intersection_multi(anomaly_set, outliers_set, anomaly_points, threshold)
    tp = sum(len(t) for t in intersection)  # every point of a detected wave counts

    print(len(outliers_set))
    print(len(anomaly_set))
    print(tp)

    # note: tp counts wave points while the denominator counts predictions,
    # so this "precision" can exceed 1; it is clipped to 1 downstream
    if len(outliers_set) == 0:
        precision = 0
    else:
        precision = tp / len(outliers_set)
    recall = tp / len(anomaly_set)

    return precision, recall
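A toy sanity check of the metric with made-up indices (hypothetical numbers): one wave is matched through a nearby prediction, so all three of its points count as true positives.

waves = [[10, 11, 12], [50, 51, 52]]   # two injected waves
predicted = [11, 300]                  # one hit, one false alarm
p, r = get_metrics_multi(waves, predicted, threshold=5)
# tp = 3 (the whole first wave), so recall = 3/6 = 0.5
# and precision = 3/2 = 1.5, illustrating why values > 1 are clipped later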
def get_results(df, noise, anomaly_count, anomaly_normal_ratio, wave_length, nsensors, df_results, eps, min_samples):
    df2 = df.copy()
    df2, anomaly_points2 = lib.preprocess.anomaly.apply_anomaly(df2, noise = noise, anomaly_normal_ratio = anomaly_normal_ratio, noise_direction = 'n', random_state = 300, wave = True, wave_length = wave_length, nsensors = nsensors)
    # lib.preprocess.anomaly.plot_anomaly(df2, anomaly_points2)

    # manually append a fixed window of indices to the ground-truth anomalies
    anomaly_points2['Hid_emn_bas_act_binned'].append([42549, 42548, 42441, 42457, 42547, 42442, 42456, 42527, 42443,
                                    42546, 42455, 42545, 42444, 42544, 42528, 42543, 42542, 42541,
                                    42540, 42539, 42538, 42529, 42537, 42536, 42454, 42535, 42530,
                                    42534, 42445, 42533, 42531, 42532, 42453, 42446, 42452, 42451,
                                    42447, 42450, 42449, 42448])

    # keep only the first sensor's waves as ground truth
    anomaly_points2 = anomaly_points2[next(iter(anomaly_points2))]

    # standardize the sensor columns and let DBSCAN label outliers as -1
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df2[df2.columns[1:]])
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    df2['Anomaly_DBSCAN'] = dbscan.fit_predict(scaled_data)
    anomalies_dbscan = df2[df2['Anomaly_DBSCAN'] == -1]
    anomaly_indexes = anomalies_dbscan.index.tolist()

    # matching tolerance: twice the wave length, but at least 20 samples
    threshold = wave_length * 2 if wave_length * 2 > 20 else 20
    precision, recall = get_metrics_multi(anomaly_points2, anomaly_indexes, threshold)

    row = {'data': 77, 'noise': noise, 'nsensors': nsensors, 'anomaly count': anomaly_count,
           'wave len': wave_length, 'precision': precision, 'recall': recall,
           'anomaly_normal_ratio': anomaly_normal_ratio, 'noise direction': 'n', 'eps': eps, 'min_samples': min_samples}

    df_results = pd.concat([df_results, pd.DataFrame([row])], ignore_index=True)
    return df_results

DBSCAN
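DBSCAN needs no anomaly labels: any point that does not belong to a dense region (at least min_samples points within radius eps) is labelled -1, and those labels are what the pipeline treats as predicted anomalies. A minimal synthetic illustration:

import numpy as np
from sklearn.cluster import DBSCAN

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 0.05, size=(100, 2)),  # one dense cluster
               [[3.0, 3.0]]])                       # one isolated point
labels = DBSCAN(eps=0.3, min_samples=5).fit_predict(X)
print(labels[-1])  # -1: the isolated point is flagged as noise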

df_results = pd.DataFrame(columns=['data', 'noise', 'nsensors', 'anomaly count', 'wave len', 'precision', 'recall', 'anomaly_normal_ratio', 'noise direction', 'eps', 'min_samples'])

# fine-tuning: grid search over eps and min_samples
df_results2 = df_results.copy()
eps_arr = [i * 0.1 for i in range(1, 11)]   # eps from 0.1 to 1.0
min_samples_arr = [i for i in range(2, 8)]  # min_samples from 2 to 7

for eps in eps_arr:
    for min_samples in min_samples_arr:
        df_results2 = get_results(df1, 0.25, 5, 0.00013, 15, 2, df_results2, eps, min_samples)

df_results2.sort_values(['recall', 'precision'], ascending=[False, False])
All 60 rows share data=77, noise=0.25, nsensors=2, anomaly count=5, wave len=15, anomaly_normal_ratio=0.00013, noise direction=n; the varying columns, sorted by recall then precision (both descending):

    precision    recall  eps  min_samples
14   0.189456  1.000000  0.3            4
15   0.152722  1.000000  0.3            5
7    0.150721  1.000000  0.2            3
16   0.132948  1.000000  0.3            6
8    0.119295  1.000000  0.2            4
0    0.116162  1.000000  0.1            2
17   0.112414  1.000000  0.3            7
9    0.095594  1.000000  0.2            5
10   0.081100  1.000000  0.2            6
11   0.073248  1.000000  0.2            7
1    0.067488  1.000000  0.1            3
2    0.051271  1.000000  0.1            4
3    0.041910  1.000000  0.1            5
4    0.036073  1.000000  0.1            6
5    0.032643  1.000000  0.1            7
12   0.362319  0.869565  0.3            2
20   0.319489  0.869565  0.4            4
21   0.252525  0.869565  0.4            5
13   0.221239  0.869565  0.3            3
6    0.220751  0.869565  0.2            2
22   0.214592  0.869565  0.4            6
23   0.184502  0.869565  0.4            7
30   2.741935  0.739130  0.6            2
31   2.073171  0.739130  0.6            3
32   1.700000  0.739130  0.6            4
24   1.440678  0.739130  0.5            2
33   1.180556  0.739130  0.6            5
25   0.894737  0.739130  0.5            3
34   0.867347  0.739130  0.6            6
35   0.739130  0.739130  0.6            7
26   0.702479  0.739130  0.5            4
18   0.570470  0.739130  0.4            2
27   0.531250  0.739130  0.5            5
28   0.431472  0.739130  0.5            6
29   0.349794  0.739130  0.5            7
19   0.344130  0.739130  0.4            3
36   2.916667  0.608696  0.7            2
37   2.333333  0.608696  0.7            3
38   2.000000  0.608696  0.7            4
46   1.794872  0.608696  0.8            6
39   1.666667  0.608696  0.7            5
47   1.521739  0.608696  0.8            7
40   1.250000  0.608696  0.7            6
41   0.897436  0.608696  0.7            7
42   4.230769  0.478261  0.8            2
43   2.894737  0.478261  0.8            3
44   2.750000  0.478261  0.8            4
45   2.391304  0.478261  0.8            5
48   5.000000  0.347826  0.9            2
54   5.000000  0.347826  1.0            2
49   3.333333  0.347826  0.9            3
55   3.333333  0.347826  1.0            3
50   3.076923  0.347826  0.9            4
56   3.076923  0.347826  1.0            4
57   3.076923  0.347826  1.0            5
58   2.857143  0.347826  1.0            6
51   2.666667  0.347826  0.9            5
52   2.666667  0.347826  0.9            6
59   2.666667  0.347826  1.0            7
53   2.105263  0.347826  0.9            7

Because tp counts every point of a detected wave while the denominator counts predicted points, the raw "precision" can exceed 1; it is clipped before ranking:

df_results2_pl = df_results2[['precision', 'recall', 'eps', 'min_samples']].copy()
df_results2_pl.loc[df_results2_pl["precision"] > 1, "precision"] = 1
df_results2_pl.sort_values(['precision', 'recall'], ascending=[False, False])

    precision    recall  eps  min_samples
24   1.000000  0.739130  0.5            2
30   1.000000  0.739130  0.6            2
31   1.000000  0.739130  0.6            3
32   1.000000  0.739130  0.6            4
33   1.000000  0.739130  0.6            5
36   1.000000  0.608696  0.7            2
37   1.000000  0.608696  0.7            3
38   1.000000  0.608696  0.7            4
39   1.000000  0.608696  0.7            5
40   1.000000  0.608696  0.7            6
46   1.000000  0.608696  0.8            6
47   1.000000  0.608696  0.8            7
42   1.000000  0.478261  0.8            2
43   1.000000  0.478261  0.8            3
44   1.000000  0.478261  0.8            4
45   1.000000  0.478261  0.8            5
48   1.000000  0.347826  0.9            2
49   1.000000  0.347826  0.9            3
50   1.000000  0.347826  0.9            4
51   1.000000  0.347826  0.9            5
52   1.000000  0.347826  0.9            6
53   1.000000  0.347826  0.9            7
54   1.000000  0.347826  1.0            2
55   1.000000  0.347826  1.0            3
56   1.000000  0.347826  1.0            4
57   1.000000  0.347826  1.0            5
58   1.000000  0.347826  1.0            6
59   1.000000  0.347826  1.0            7
41   0.897436  0.608696  0.7            7
25   0.894737  0.739130  0.5            3
34   0.867347  0.739130  0.6            6
35   0.739130  0.739130  0.6            7
26   0.702479  0.739130  0.5            4
18   0.570470  0.739130  0.4            2
27   0.531250  0.739130  0.5            5
28   0.431472  0.739130  0.5            6
12   0.362319  0.869565  0.3            2
29   0.349794  0.739130  0.5            7
19   0.344130  0.739130  0.4            3
20   0.319489  0.869565  0.4            4
21   0.252525  0.869565  0.4            5
13   0.221239  0.869565  0.3            3
6    0.220751  0.869565  0.2            2
22   0.214592  0.869565  0.4            6
14   0.189456  1.000000  0.3            4
23   0.184502  0.869565  0.4            7
15   0.152722  1.000000  0.3            5
7    0.150721  1.000000  0.2            3
16   0.132948  1.000000  0.3            6
8    0.119295  1.000000  0.2            4
0    0.116162  1.000000  0.1            2
17   0.112414  1.000000  0.3            7
9    0.095594  1.000000  0.2            5
10   0.081100  1.000000  0.2            6
11   0.073248  1.000000  0.2            7
1    0.067488  1.000000  0.1            3
2    0.051271  1.000000  0.1            4
3    0.041910  1.000000  0.1            5
4    0.036073  1.000000  0.1            6
5    0.032643  1.000000  0.1            7
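Reading the clipped ranking, eps in the 0.5-0.6 range with small min_samples keeps precision at 1.0 while retaining the highest recall (about 0.74), which is why the two configurations below are carried forward. Programmatically:

best = (df_results2_pl[df_results2_pl['precision'] == 1.0]
        .sort_values('recall', ascending=False)
        .head())
# top rows: eps 0.5-0.6, min_samples 2-5, recall ~ 0.739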

# full factor grid, evaluated with the tuned configuration eps=0.5, min_samples=2
wave_lengths = [1, 15, 60]
noises = [0.5, 0.25, 0.1]
anomaly_counts = [1, 5, 10]
anomaly_normal_ratios = {1: 0.000025, 5: 0.00013, 10: 0.00024}
nsensors = [1, 2, 3]

for wave_length in wave_lengths:
    for noise in noises:
        for sensors in nsensors:
            for anomaly_count in anomaly_counts:
                anomaly_normal_ratio = anomaly_normal_ratios[anomaly_count]
                df_results = get_results(df1, noise, anomaly_count, anomaly_normal_ratio, wave_length, sensors, df_results, 0.5, 2)

df_results_pl = df_results.copy()
df_results_pl.loc[df_results_pl["precision"] > 1, "precision"] = 1
df_results_pl.sort_values(['precision', 'recall'], ascending=[False, False])

df_results_pl.to_csv("dbscan_multi_res.csv", index = False)
df_results_pl['precision'].mean()
0.8851496342284881
df_results_pl['recall'].mean()
0.8140926959487169
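To see which experimental factors drive these averages, the results can be grouped per factor (a sketch over the result columns defined earlier):

summary = (df_results_pl
           .groupby(['wave len', 'noise', 'nsensors'])[['precision', 'recall']]
           .mean()
           .round(3))
print(summary)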
# repeat the full grid with the alternative tuned configuration eps=0.6, min_samples=5
df_results3 = pd.DataFrame(columns=['data', 'noise', 'nsensors', 'anomaly count', 'wave len', 'precision', 'recall', 'anomaly_normal_ratio', 'noise direction', 'eps', 'min_samples'])
for wave_length in wave_lengths:
    for noise in noises:
        for sensors in nsensors:
            for anomaly_count in anomaly_counts:
                anomaly_normal_ratio = anomaly_normal_ratios[anomaly_count]
                df_results3 = get_results(df1, noise, anomaly_count, anomaly_normal_ratio, wave_length, sensors, df_results3, 0.6, 5)
df_results_pl3 = df_results3.copy()
df_results_pl3.loc[df_results_pl3["precision"] > 1, "precision"] = 1
df_results_pl3.sort_values(['precision', 'recall'], ascending=[False, False])

df_results_pl3['precision'].mean()
0.8307340734332637
df_results_pl3['recall'].mean()
0.8069424502574994

Test

df2 = df1.copy()
df2, anomaly_points2 = lib.preprocess.anomaly.apply_anomaly(df2, noise = 0.5, anomaly_normal_ratio = 0.00013, noise_direction = 'n', random_state = 300, wave = True, wave_length = 15, nsensors = 3)
lib.preprocess.anomaly.plot_anomaly(df2, anomaly_points2)

# manually append the same fixed window of indices to the ground-truth anomalies, as in get_results
anomaly_points2['Hid_emn_bas_act_binned'].append([42549, 42548, 42441, 42457, 42547, 42442, 42456, 42527, 42443,
                                42546, 42455, 42545, 42444, 42544, 42528, 42543, 42542, 42541,
                                42540, 42539, 42538, 42529, 42537, 42536, 42454, 42535, 42530,
                                42534, 42445, 42533, 42531, 42532, 42453, 42446, 42452, 42451,
                                42447, 42450, 42449, 42448])

[Plot output: injected anomaly waves highlighted per sensor (Hid_emn_bas_act_binned, Hid_Pomp_Motor_Rulman_binned, Hid_Pomp_Balans_binned), 5 anomalies each]

