Multivariate Time Series Anomaly Detection with DBSCAN (Python)

import pandas as pd
import numpy as np
import mylibrary as lib  # author's custom helper package (upload, scaling, anomaly injection/plotting)
import plotly.express as px
import plotly.graph_objs as go
import plotly.io as pio
import random
import copy
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
df, filename = lib.preprocess.upload.upload_csv()
df['DateTime'] = pd.to_datetime(df['DateTime'])
# df = df.drop('Hid_Pomp_Motor_Rulman', axis = 1)
df = df.dropna()

# keep a single day of data
dt1 = pd.to_datetime('2023-07-14 00:00:04')
dt2 = pd.to_datetime('2023-07-14 23:59:04')
df = df[(df['DateTime'] >= dt1) & (df['DateTime'] <= dt2)]

df1 = df.copy()
df1 = lib.preprocess.scale.apply_min_max_scale(df1)  # scale each sensor to [0, 1]

# downsample to a 2-second grid by averaging
df_resampled = df1.resample('2s', on='DateTime').mean().reset_index()
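mylibrary is not a public package; for readers without it, here is a minimal hypothetical stand-in for the scaling step, assuming apply_min_max_scale simply rescales every non-DateTime column to [0, 1]:

def apply_min_max_scale_standin(df):
    # hypothetical replacement for lib.preprocess.scale.apply_min_max_scale
    out = df.copy()
    for c in out.columns:
        if c != 'DateTime':
            out[c] = (out[c] - out[c].min()) / (out[c].max() - out[c].min())
    return out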
# quantize a value (assumed in [0, 1]) to the lower edge of its 1/k-wide bin
def smoothing(x, k):
    width = 1 / k
    for i in range(k):
        min_val = i * width
        max_val = (i + 1) * width
        if min_val <= x <= max_val:
            return min_val
    return np.nan  # out-of-range values become NaN and are removed by the later dropna()


def apply_smoothing_to_dataframe(df, k):
    result_df = df.copy()
    for column in result_df.columns:
        if column != 'DateTime':
            result_df[f'{column}_binned'] = result_df[column].apply(lambda x: smoothing(x, k))
            del result_df[column]
    return result_df
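A note on the helper above: despite its name, smoothing quantizes rather than smooths. For large frames, a vectorized sketch of the same mapping (floor to the lower bin edge; it agrees with the loop version except exactly on interior bin boundaries):

def bin_to_lower_edge(series, k):
    # floor(x * k) / k, capped so x == 1.0 falls in the last bin
    return np.floor(series * k).clip(upper=k - 1) / k

# e.g. df_resampled['Hid_Pomp_Balans_binned'] = bin_to_lower_edge(df_resampled['Hid_Pomp_Balans'], 200)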
k = 200  # number of quantization bins
df_resampled = apply_smoothing_to_dataframe(df_resampled, k)
sensors = df_resampled.columns[1:]
sensors
Index(['Hid_Pomp_Balans_binned', 'Hid_Pomp_Motor_Rulman_binned',
       'Hid_emn_bas_act_binned'],
      dtype='object')
df1 = df_resampled.copy()
df1 = df1.dropna()

Functions

# anomaly_set: set of all injected anomaly indices (flattened, 1d)
# outliers_set: set of indices predicted as anomalous (1d)
# anomaly_points: list of waves, each wave a list of consecutive indices
# returns the set of waves that have at least one point within `threshold` of a prediction
def intersection_multi(anomaly_set, outliers_set, anomaly_points, threshold):
    union_set = set()
    anomaly_points = {tuple(arr) for arr in anomaly_points}
    for elem1 in anomaly_set:
        for arr in anomaly_points:
            if any(abs(elem1 - elem2) < threshold for elem2 in outliers_set) and elem1 in arr:
                union_set.add(arr)
                break
    return union_set
# anomaly_points: list of injected waves (a [[],[],[]] matrix)
# outliers: list of predicted anomaly indices
def get_metrics_multi(anomaly_points, outliers, threshold):
    flat_anomaly_points = [point for arr in anomaly_points for point in arr]
    anomaly_set = set(flat_anomaly_points)
    outliers_set = set(outliers)

    intersection = intersection_multi(anomaly_set, outliers_set, anomaly_points, threshold)
    tp = sum(len(t) for t in intersection)  # every point of a detected wave counts

    print(len(outliers_set))
    print(len(anomaly_set))
    print(tp)

    # note: tp counts wave points while the denominator counts predictions,
    # so this "precision" can exceed 1; it is clipped to 1 downstream
    if len(outliers_set) == 0:
        precision = 0
    else:
        precision = tp / len(outliers_set)
    recall = tp / len(anomaly_set)

    return precision, recall
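A toy sanity check of the metric with made-up indices (hypothetical numbers): one wave is matched through a nearby prediction, so all three of its points count as true positives.

waves = [[10, 11, 12], [50, 51, 52]]   # two injected waves
predicted = [11, 300]                  # one hit, one false alarm
p, r = get_metrics_multi(waves, predicted, threshold=5)
# tp = 3 (the whole first wave), so recall = 3/6 = 0.5
# and precision = 3/2 = 1.5, illustrating why values > 1 are clipped later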
def get_results(df, noise, anomaly_count, anomaly_normal_ratio, wave_length, nsensors, df_results, eps, min_samples):
    df2 = df.copy()
    df2, anomaly_points2 = lib.preprocess.anomaly.apply_anomaly(df2, noise = noise, anomaly_normal_ratio = anomaly_normal_ratio, noise_direction = 'n', random_state = 300, wave = True, wave_length = wave_length, nsensors = nsensors)
    # lib.preprocess.anomaly.plot_anomaly(df2, anomaly_points2)

    # manually append a fixed window of indices to the ground-truth anomalies
    anomaly_points2['Hid_emn_bas_act_binned'].append([42549, 42548, 42441, 42457, 42547, 42442, 42456, 42527, 42443,
                                    42546, 42455, 42545, 42444, 42544, 42528, 42543, 42542, 42541,
                                    42540, 42539, 42538, 42529, 42537, 42536, 42454, 42535, 42530,
                                    42534, 42445, 42533, 42531, 42532, 42453, 42446, 42452, 42451,
                                    42447, 42450, 42449, 42448])

    # keep only the first sensor's waves as ground truth
    anomaly_points2 = anomaly_points2[next(iter(anomaly_points2))]

    # standardize the sensor columns and let DBSCAN label outliers as -1
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df2[df2.columns[1:]])
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    df2['Anomaly_DBSCAN'] = dbscan.fit_predict(scaled_data)
    anomalies_dbscan = df2[df2['Anomaly_DBSCAN'] == -1]
    anomaly_indexes = anomalies_dbscan.index.tolist()

    # matching tolerance: twice the wave length, but at least 20 samples
    threshold = wave_length * 2 if wave_length * 2 > 20 else 20
    precision, recall = get_metrics_multi(anomaly_points2, anomaly_indexes, threshold)

    row = {'data': 77, 'noise': noise, 'nsensors': nsensors, 'anomaly count': anomaly_count,
           'wave len': wave_length, 'precision': precision, 'recall': recall,
           'anomaly_normal_ratio': anomaly_normal_ratio, 'noise direction': 'n', 'eps': eps, 'min_samples': min_samples}

    df_results = pd.concat([df_results, pd.DataFrame([row])], ignore_index=True)
    return df_results

DBSCAN
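DBSCAN needs no anomaly labels: any point that does not belong to a dense region (at least min_samples points within radius eps) is labelled -1, and those labels are what the pipeline treats as predicted anomalies. A minimal synthetic illustration:

import numpy as np
from sklearn.cluster import DBSCAN

rng = np.random.default_rng(0)
X = np.vstack([rng.normal(0, 0.05, size=(100, 2)),  # one dense cluster
               [[3.0, 3.0]]])                       # one isolated point
labels = DBSCAN(eps=0.3, min_samples=5).fit_predict(X)
print(labels[-1])  # -1: the isolated point is flagged as noise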

df_results = pd.DataFrame(columns=['data', 'noise', 'nsensors', 'anomaly count', 'wave len', 'precision', 'recall', 'anomaly_normal_ratio', 'noise direction', 'eps', 'min_samples'])

# fine-tuning: grid search over eps and min_samples
df_results2 = df_results.copy()
eps_arr = [i * 0.1 for i in range(1, 11)]   # eps from 0.1 to 1.0
min_samples_arr = [i for i in range(2, 8)]  # min_samples from 2 to 7

for eps in eps_arr:
    for min_samples in min_samples_arr:
        df_results2 = get_results(df1, 0.25, 5, 0.00013, 15, 2, df_results2, eps, min_samples)

df_results2.sort_values(['recall', 'precision'], ascending=[False, False])
All 60 rows share data=77, noise=0.25, nsensors=2, anomaly count=5, wave len=15, anomaly_normal_ratio=0.00013, noise direction=n; the varying columns, sorted by recall then precision (both descending):

    precision    recall  eps  min_samples
14   0.189456  1.000000  0.3            4
15   0.152722  1.000000  0.3            5
7    0.150721  1.000000  0.2            3
16   0.132948  1.000000  0.3            6
8    0.119295  1.000000  0.2            4
0    0.116162  1.000000  0.1            2
17   0.112414  1.000000  0.3            7
9    0.095594  1.000000  0.2            5
10   0.081100  1.000000  0.2            6
11   0.073248  1.000000  0.2            7
1    0.067488  1.000000  0.1            3
2    0.051271  1.000000  0.1            4
3    0.041910  1.000000  0.1            5
4    0.036073  1.000000  0.1            6
5    0.032643  1.000000  0.1            7
12   0.362319  0.869565  0.3            2
20   0.319489  0.869565  0.4            4
21   0.252525  0.869565  0.4            5
13   0.221239  0.869565  0.3            3
6    0.220751  0.869565  0.2            2
22   0.214592  0.869565  0.4            6
23   0.184502  0.869565  0.4            7
30   2.741935  0.739130  0.6            2
31   2.073171  0.739130  0.6            3
32   1.700000  0.739130  0.6            4
24   1.440678  0.739130  0.5            2
33   1.180556  0.739130  0.6            5
25   0.894737  0.739130  0.5            3
34   0.867347  0.739130  0.6            6
35   0.739130  0.739130  0.6            7
26   0.702479  0.739130  0.5            4
18   0.570470  0.739130  0.4            2
27   0.531250  0.739130  0.5            5
28   0.431472  0.739130  0.5            6
29   0.349794  0.739130  0.5            7
19   0.344130  0.739130  0.4            3
36   2.916667  0.608696  0.7            2
37   2.333333  0.608696  0.7            3
38   2.000000  0.608696  0.7            4
46   1.794872  0.608696  0.8            6
39   1.666667  0.608696  0.7            5
47   1.521739  0.608696  0.8            7
40   1.250000  0.608696  0.7            6
41   0.897436  0.608696  0.7            7
42   4.230769  0.478261  0.8            2
43   2.894737  0.478261  0.8            3
44   2.750000  0.478261  0.8            4
45   2.391304  0.478261  0.8            5
48   5.000000  0.347826  0.9            2
54   5.000000  0.347826  1.0            2
49   3.333333  0.347826  0.9            3
55   3.333333  0.347826  1.0            3
50   3.076923  0.347826  0.9            4
56   3.076923  0.347826  1.0            4
57   3.076923  0.347826  1.0            5
58   2.857143  0.347826  1.0            6
51   2.666667  0.347826  0.9            5
52   2.666667  0.347826  0.9            6
59   2.666667  0.347826  1.0            7
53   2.105263  0.347826  0.9            7

Because tp counts every point of a detected wave while the denominator counts predicted points, the raw "precision" can exceed 1; it is clipped before ranking:

df_results2_pl = df_results2[['precision', 'recall', 'eps', 'min_samples']].copy()
df_results2_pl.loc[df_results2_pl["precision"] > 1, "precision"] = 1
df_results2_pl.sort_values(['precision', 'recall'], ascending=[False, False])

    precision    recall  eps  min_samples
24   1.000000  0.739130  0.5            2
30   1.000000  0.739130  0.6            2
31   1.000000  0.739130  0.6            3
32   1.000000  0.739130  0.6            4
33   1.000000  0.739130  0.6            5
36   1.000000  0.608696  0.7            2
37   1.000000  0.608696  0.7            3
38   1.000000  0.608696  0.7            4
39   1.000000  0.608696  0.7            5
40   1.000000  0.608696  0.7            6
46   1.000000  0.608696  0.8            6
47   1.000000  0.608696  0.8            7
42   1.000000  0.478261  0.8            2
43   1.000000  0.478261  0.8            3
44   1.000000  0.478261  0.8            4
45   1.000000  0.478261  0.8            5
48   1.000000  0.347826  0.9            2
49   1.000000  0.347826  0.9            3
50   1.000000  0.347826  0.9            4
51   1.000000  0.347826  0.9            5
52   1.000000  0.347826  0.9            6
53   1.000000  0.347826  0.9            7
54   1.000000  0.347826  1.0            2
55   1.000000  0.347826  1.0            3
56   1.000000  0.347826  1.0            4
57   1.000000  0.347826  1.0            5
58   1.000000  0.347826  1.0            6
59   1.000000  0.347826  1.0            7
41   0.897436  0.608696  0.7            7
25   0.894737  0.739130  0.5            3
34   0.867347  0.739130  0.6            6
35   0.739130  0.739130  0.6            7
26   0.702479  0.739130  0.5            4
18   0.570470  0.739130  0.4            2
27   0.531250  0.739130  0.5            5
28   0.431472  0.739130  0.5            6
12   0.362319  0.869565  0.3            2
29   0.349794  0.739130  0.5            7
19   0.344130  0.739130  0.4            3
20   0.319489  0.869565  0.4            4
21   0.252525  0.869565  0.4            5
13   0.221239  0.869565  0.3            3
6    0.220751  0.869565  0.2            2
22   0.214592  0.869565  0.4            6
14   0.189456  1.000000  0.3            4
23   0.184502  0.869565  0.4            7
15   0.152722  1.000000  0.3            5
7    0.150721  1.000000  0.2            3
16   0.132948  1.000000  0.3            6
8    0.119295  1.000000  0.2            4
0    0.116162  1.000000  0.1            2
17   0.112414  1.000000  0.3            7
9    0.095594  1.000000  0.2            5
10   0.081100  1.000000  0.2            6
11   0.073248  1.000000  0.2            7
1    0.067488  1.000000  0.1            3
2    0.051271  1.000000  0.1            4
3    0.041910  1.000000  0.1            5
4    0.036073  1.000000  0.1            6
5    0.032643  1.000000  0.1            7
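Reading the clipped ranking, eps in the 0.5-0.6 range with small min_samples keeps precision at 1.0 while retaining the highest recall (about 0.74), which is why the two configurations below are carried forward. Programmatically:

best = (df_results2_pl[df_results2_pl['precision'] == 1.0]
        .sort_values('recall', ascending=False)
        .head())
# top rows: eps 0.5-0.6, min_samples 2-5, recall ~ 0.739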

# full factor grid, evaluated with the tuned configuration eps=0.5, min_samples=2
wave_lengths = [1, 15, 60]
noises = [0.5, 0.25, 0.1]
anomaly_counts = [1, 5, 10]
anomaly_normal_ratios = {1: 0.000025, 5: 0.00013, 10: 0.00024}
nsensors = [1, 2, 3]

for wave_length in wave_lengths:
    for noise in noises:
        for sensors in nsensors:
            for anomaly_count in anomaly_counts:
                anomaly_normal_ratio = anomaly_normal_ratios[anomaly_count]
                df_results = get_results(df1, noise, anomaly_count, anomaly_normal_ratio, wave_length, sensors, df_results, 0.5, 2)

df_results_pl = df_results.copy()
df_results_pl.loc[df_results_pl["precision"] > 1, "precision"] = 1
df_results_pl.sort_values(['precision', 'recall'], ascending=[False, False])

df_results_pl.to_csv("dbscan_multi_res.csv", index = False)
df_results_pl['precision'].mean()
0.8851496342284881
df_results_pl['recall'].mean()
0.8140926959487169
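To see which experimental factors drive these averages, the results can be grouped per factor (a sketch over the result columns defined earlier):

summary = (df_results_pl
           .groupby(['wave len', 'noise', 'nsensors'])[['precision', 'recall']]
           .mean()
           .round(3))
print(summary)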
# repeat the full grid with the alternative tuned configuration eps=0.6, min_samples=5
df_results3 = pd.DataFrame(columns=['data', 'noise', 'nsensors', 'anomaly count', 'wave len', 'precision', 'recall', 'anomaly_normal_ratio', 'noise direction', 'eps', 'min_samples'])
for wave_length in wave_lengths:
    for noise in noises:
        for sensors in nsensors:
            for anomaly_count in anomaly_counts:
                anomaly_normal_ratio = anomaly_normal_ratios[anomaly_count]
                df_results3 = get_results(df1, noise, anomaly_count, anomaly_normal_ratio, wave_length, sensors, df_results3, 0.6, 5)
df_results_pl3 = df_results3.copy()
df_results_pl3.loc[df_results_pl3["precision"] > 1, "precision"] = 1
df_results_pl3.sort_values(['precision', 'recall'], ascending=[False, False])

df_results_pl3['precision'].mean()
0.8307340734332637
df_results_pl3['recall'].mean()
0.8069424502574994

Test

df2 = df1.copy()
df2, anomaly_points2 = lib.preprocess.anomaly.apply_anomaly(df2, noise = 0.5, anomaly_normal_ratio = 0.00013, noise_direction = 'n', random_state = 300, wave = True, wave_length = 15, nsensors = 3)
lib.preprocess.anomaly.plot_anomaly(df2, anomaly_points2)

# manually append the same fixed window of indices to the ground-truth anomalies, as in get_results
anomaly_points2['Hid_emn_bas_act_binned'].append([42549, 42548, 42441, 42457, 42547, 42442, 42456, 42527, 42443,
                                42546, 42455, 42545, 42444, 42544, 42528, 42543, 42542, 42541,
                                42540, 42539, 42538, 42529, 42537, 42536, 42454, 42535, 42530,
                                42534, 42445, 42533, 42531, 42532, 42453, 42446, 42452, 42451,
                                42447, 42450, 42449, 42448])

[Plot output: injected anomaly waves highlighted per sensor (Hid_emn_bas_act_binned, Hid_Pomp_Motor_Rulman_binned, Hid_Pomp_Balans_binned), 5 anomalies each]

