import pandas as pd
import numpy as np
import mylibrary as lib
import plotly.express as px
import plotly.graph_objs as go
import plotly.io as pio
import random
import copy
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
df, filename = lib.preprocess.upload.upload_csv()
df['DateTime'] = pd.to_datetime(df['DateTime'])
# df = df.drop('Hid_Pomp_Motor_Rulman', axis = 1)
df = df.dropna()
dt1 = pd.to_datetime('2023-07-14 00:00:04')
dt2 = pd.to_datetime('2023-07-14 23:59:04')
df = df[(df['DateTime'] >= dt1) & (df['DateTime'] <= dt2)]
df1 = df.copy()
df1 = lib.preprocess.scale.apply_min_max_scale(df1)
df_resampled = df1.resample('2s', on='DateTime').mean().reset_index()
def smoothing(x, k):
    """Bin a min-max-scaled value x into k equal-width bins.

    Returns the lower edge of the bin containing x, or NaN for
    out-of-range values so they can be removed with dropna() later.
    """
    width = 1 / k
    for i in range(k):
        min_val = i * width
        max_val = (i + 1) * width
        if min_val <= x <= max_val:
            return min_val
    return np.nan  # x outside [0, 1]

def apply_smoothing_to_dataframe(df, k):
    result_df = df.copy()
    for column in result_df.columns:
        if column != 'DateTime':
            result_df[f'{column}_binned'] = result_df[column].apply(lambda x: smoothing(x, k))
            del result_df[column]
    return result_df
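For long series the per-row apply above can be slow. Here is a minimal vectorized sketch of the same bin-floor idea (the name smoothing_vectorized is hypothetical, and values sitting exactly on a bin edge land in the upper bin here rather than the lower bin as in the loop version):

def smoothing_vectorized(values, k):
    # Map each value in [0, 1] to the lower edge of its bin, vectorized.
    binned = np.floor(values * k) / k
    # x == 1.0 would floor to 1.0; fold it into the last bin [(k-1)/k, 1].
    binned = binned.clip(upper=(k - 1) / k)
    # Out-of-range inputs become NaN so the later dropna() discards them.
    return binned.where((values >= 0) & (values <= 1))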
k = 200
df_resampled = apply_smoothing_to_dataframe(df_resampled, k)
sensors = df_resampled.columns[1:]
sensors
Index(['Hid_Pomp_Balans_binned', 'Hid_Pomp_Motor_Rulman_binned',
'Hid_emn_bas_act_binned'],
dtype='object')
df1 = df_resampled.copy()
df1 = df1.dropna()
Functions
# anomaly_set: set of injected anomaly points (1-D indices)
# outliers_set: set of predicted anomaly points (1-D indices)
# anomaly_points: list of waves (each wave is a list of indices)
def intersection_multi(anomaly_set, outliers_set, anomaly_points, threshold):
    union_set = set()
    anomaly_points = {tuple(arr) for arr in anomaly_points}
    for elem1 in anomaly_set:
        for arr in anomaly_points:
            # A wave counts as detected if one of its points lies within
            # `threshold` of any predicted point.
            if any(abs(elem1 - elem2) < threshold for elem2 in outliers_set) and elem1 in arr:
                union_set.add(arr)
                break
    return union_set
# anomaly_points: list of waves (list of lists)
# outliers: flat list of predicted anomaly indices
def get_metrics_multi(anomaly_points, outliers, threshold):
    flat_anomaly_points = [point for arr in anomaly_points for point in arr]
    anomaly_set = set(flat_anomaly_points)
    outliers_set = set(outliers)
    intersection = intersection_multi(anomaly_set, outliers_set, anomaly_points, threshold)
    # tp counts every point of each detected wave, so it can exceed the
    # number of predicted points; precision is therefore clipped to 1 later.
    tp = sum(len(t) for t in intersection)
    print(len(outliers_set))
    print(len(anomaly_set))
    print(tp)
    if len(outliers_set) == 0:
        precision = 0
    else:
        precision = tp / len(outliers_set)
    recall = tp / len(anomaly_set)
    return precision, recall
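A quick sanity check of the wave-level scoring on made-up indices: one injected wave of five points, of which only two are predicted. The whole wave counts as detected, so tp = 5 against two predictions and precision comes out as 2.5, which is exactly why precision is clipped to 1 further down (the function also prints the set sizes and tp):

# Hypothetical toy data: one wave of 5 true anomaly points, 2 predicted hits.
waves = [[100, 101, 102, 103, 104]]
predicted = [101, 103]
precision, recall = get_metrics_multi(waves, predicted, threshold=20)
print(precision, recall)  # 2.5 1.0 (before clipping)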
def get_results(df, noise, anomaly_count, anomaly_normal_ratio, wave_length,
                nsensors, df_results, eps, min_samples):
    df2 = df.copy()
    df2, anomaly_points2 = lib.preprocess.anomaly.apply_anomaly(
        df2, noise=noise, anomaly_normal_ratio=anomaly_normal_ratio,
        noise_direction='n', random_state=300, wave=True,
        wave_length=wave_length, nsensors=nsensors)
    # lib.preprocess.anomaly.plot_anomaly(df2, anomaly_points2)
    anomaly_points2['Hid_emn_bas_act_binned'].append(
        [42549, 42548, 42441, 42457, 42547, 42442, 42456, 42527, 42443,
         42546, 42455, 42545, 42444, 42544, 42528, 42543, 42542, 42541,
         42540, 42539, 42538, 42529, 42537, 42536, 42454, 42535, 42530,
         42534, 42445, 42533, 42531, 42532, 42453, 42446, 42452, 42451,
         42447, 42450, 42449, 42448])
    # Keep only the first sensor's waves for scoring
    anomaly_points2 = anomaly_points2[next(iter(anomaly_points2))]
    scaler = StandardScaler()
    scaled_data = scaler.fit_transform(df2[df2.columns[1:]])
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    df2['Anomaly_DBSCAN'] = dbscan.fit_predict(scaled_data)
    # DBSCAN labels noise points with -1
    anomalies_dbscan = df2[df2['Anomaly_DBSCAN'] == -1]
    anomaly_indexes = anomalies_dbscan.index.tolist()
    threshold = max(wave_length * 2, 20)
    precision, recall = get_metrics_multi(anomaly_points2, anomaly_indexes, threshold)
    row = {'data': 77, 'noise': noise, 'nsensors': nsensors, 'anomaly count': anomaly_count,
           'wave len': wave_length, 'precision': precision, 'recall': recall,
           'anomaly_normal_ratio': anomaly_normal_ratio, 'noise direction': 'n',
           'eps': eps, 'min_samples': min_samples}
    df_results = pd.concat([df_results, pd.DataFrame([row])], ignore_index=True)
    return df_results
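get_results treats every point that DBSCAN labels -1 as a predicted outlier. A minimal standalone illustration of that labeling convention, on made-up 1-D data:

import numpy as np
from sklearn.cluster import DBSCAN

pts = np.array([[0.0], [0.1], [0.2], [5.0]])  # one tight cluster plus one stray point
labels = DBSCAN(eps=0.5, min_samples=2).fit_predict(pts)
print(labels)  # [ 0  0  0 -1] -- the stray point is noise, labeled -1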
DBSCAN
df_results = pd.DataFrame(columns=['data', 'noise', 'nsensors', 'anomaly count', 'wave len', 'precision', 'recall', 'anomaly_normal_ratio', 'noise direction', 'eps', 'min_samples'])
# fine-tuning: grid search over eps and min_samples
df_results2 = df_results.copy()
eps_arr = [i * 0.1 for i in range(1, 11)]
min_samples_arr = [i for i in range(2, 8)]
for eps in eps_arr:
    for min_samples in min_samples_arr:
        df_results2 = get_results(df1, 0.25, 5, 0.00013, 15, 2, df_results2, eps, min_samples)
df_results2.sort_values(['recall', 'precision'], ascending=[False, False])
(index) | data | noise | nsensors | anomaly count | wave len | precision | recall | anomaly_normal_ratio | noise direction | eps | min_samples |
14 | 77 | 0.25 | 2 | 5 | 15 | 0.189456 | 1.000000 | 0.00013 | n | 0.3 | 4 |
15 | 77 | 0.25 | 2 | 5 | 15 | 0.152722 | 1.000000 | 0.00013 | n | 0.3 | 5 |
7 | 77 | 0.25 | 2 | 5 | 15 | 0.150721 | 1.000000 | 0.00013 | n | 0.2 | 3 |
16 | 77 | 0.25 | 2 | 5 | 15 | 0.132948 | 1.000000 | 0.00013 | n | 0.3 | 6 |
8 | 77 | 0.25 | 2 | 5 | 15 | 0.119295 | 1.000000 | 0.00013 | n | 0.2 | 4 |
0 | 77 | 0.25 | 2 | 5 | 15 | 0.116162 | 1.000000 | 0.00013 | n | 0.1 | 2 |
17 | 77 | 0.25 | 2 | 5 | 15 | 0.112414 | 1.000000 | 0.00013 | n | 0.3 | 7 |
9 | 77 | 0.25 | 2 | 5 | 15 | 0.095594 | 1.000000 | 0.00013 | n | 0.2 | 5 |
10 | 77 | 0.25 | 2 | 5 | 15 | 0.081100 | 1.000000 | 0.00013 | n | 0.2 | 6 |
11 | 77 | 0.25 | 2 | 5 | 15 | 0.073248 | 1.000000 | 0.00013 | n | 0.2 | 7 |
1 | 77 | 0.25 | 2 | 5 | 15 | 0.067488 | 1.000000 | 0.00013 | n | 0.1 | 3 |
2 | 77 | 0.25 | 2 | 5 | 15 | 0.051271 | 1.000000 | 0.00013 | n | 0.1 | 4 |
3 | 77 | 0.25 | 2 | 5 | 15 | 0.041910 | 1.000000 | 0.00013 | n | 0.1 | 5 |
4 | 77 | 0.25 | 2 | 5 | 15 | 0.036073 | 1.000000 | 0.00013 | n | 0.1 | 6 |
5 | 77 | 0.25 | 2 | 5 | 15 | 0.032643 | 1.000000 | 0.00013 | n | 0.1 | 7 |
12 | 77 | 0.25 | 2 | 5 | 15 | 0.362319 | 0.869565 | 0.00013 | n | 0.3 | 2 |
20 | 77 | 0.25 | 2 | 5 | 15 | 0.319489 | 0.869565 | 0.00013 | n | 0.4 | 4 |
21 | 77 | 0.25 | 2 | 5 | 15 | 0.252525 | 0.869565 | 0.00013 | n | 0.4 | 5 |
13 | 77 | 0.25 | 2 | 5 | 15 | 0.221239 | 0.869565 | 0.00013 | n | 0.3 | 3 |
6 | 77 | 0.25 | 2 | 5 | 15 | 0.220751 | 0.869565 | 0.00013 | n | 0.2 | 2 |
22 | 77 | 0.25 | 2 | 5 | 15 | 0.214592 | 0.869565 | 0.00013 | n | 0.4 | 6 |
23 | 77 | 0.25 | 2 | 5 | 15 | 0.184502 | 0.869565 | 0.00013 | n | 0.4 | 7 |
30 | 77 | 0.25 | 2 | 5 | 15 | 2.741935 | 0.739130 | 0.00013 | n | 0.6 | 2 |
31 | 77 | 0.25 | 2 | 5 | 15 | 2.073171 | 0.739130 | 0.00013 | n | 0.6 | 3 |
32 | 77 | 0.25 | 2 | 5 | 15 | 1.700000 | 0.739130 | 0.00013 | n | 0.6 | 4 |
24 | 77 | 0.25 | 2 | 5 | 15 | 1.440678 | 0.739130 | 0.00013 | n | 0.5 | 2 |
33 | 77 | 0.25 | 2 | 5 | 15 | 1.180556 | 0.739130 | 0.00013 | n | 0.6 | 5 |
25 | 77 | 0.25 | 2 | 5 | 15 | 0.894737 | 0.739130 | 0.00013 | n | 0.5 | 3 |
34 | 77 | 0.25 | 2 | 5 | 15 | 0.867347 | 0.739130 | 0.00013 | n | 0.6 | 6 |
35 | 77 | 0.25 | 2 | 5 | 15 | 0.739130 | 0.739130 | 0.00013 | n | 0.6 | 7 |
26 | 77 | 0.25 | 2 | 5 | 15 | 0.702479 | 0.739130 | 0.00013 | n | 0.5 | 4 |
18 | 77 | 0.25 | 2 | 5 | 15 | 0.570470 | 0.739130 | 0.00013 | n | 0.4 | 2 |
27 | 77 | 0.25 | 2 | 5 | 15 | 0.531250 | 0.739130 | 0.00013 | n | 0.5 | 5 |
28 | 77 | 0.25 | 2 | 5 | 15 | 0.431472 | 0.739130 | 0.00013 | n | 0.5 | 6 |
29 | 77 | 0.25 | 2 | 5 | 15 | 0.349794 | 0.739130 | 0.00013 | n | 0.5 | 7 |
19 | 77 | 0.25 | 2 | 5 | 15 | 0.344130 | 0.739130 | 0.00013 | n | 0.4 | 3 |
36 | 77 | 0.25 | 2 | 5 | 15 | 2.916667 | 0.608696 | 0.00013 | n | 0.7 | 2 |
37 | 77 | 0.25 | 2 | 5 | 15 | 2.333333 | 0.608696 | 0.00013 | n | 0.7 | 3 |
38 | 77 | 0.25 | 2 | 5 | 15 | 2.000000 | 0.608696 | 0.00013 | n | 0.7 | 4 |
46 | 77 | 0.25 | 2 | 5 | 15 | 1.794872 | 0.608696 | 0.00013 | n | 0.8 | 6 |
39 | 77 | 0.25 | 2 | 5 | 15 | 1.666667 | 0.608696 | 0.00013 | n | 0.7 | 5 |
47 | 77 | 0.25 | 2 | 5 | 15 | 1.521739 | 0.608696 | 0.00013 | n | 0.8 | 7 |
40 | 77 | 0.25 | 2 | 5 | 15 | 1.250000 | 0.608696 | 0.00013 | n | 0.7 | 6 |
41 | 77 | 0.25 | 2 | 5 | 15 | 0.897436 | 0.608696 | 0.00013 | n | 0.7 | 7 |
42 | 77 | 0.25 | 2 | 5 | 15 | 4.230769 | 0.478261 | 0.00013 | n | 0.8 | 2 |
43 | 77 | 0.25 | 2 | 5 | 15 | 2.894737 | 0.478261 | 0.00013 | n | 0.8 | 3 |
44 | 77 | 0.25 | 2 | 5 | 15 | 2.750000 | 0.478261 | 0.00013 | n | 0.8 | 4 |
45 | 77 | 0.25 | 2 | 5 | 15 | 2.391304 | 0.478261 | 0.00013 | n | 0.8 | 5 |
48 | 77 | 0.25 | 2 | 5 | 15 | 5.000000 | 0.347826 | 0.00013 | n | 0.9 | 2 |
54 | 77 | 0.25 | 2 | 5 | 15 | 5.000000 | 0.347826 | 0.00013 | n | 1.0 | 2 |
49 | 77 | 0.25 | 2 | 5 | 15 | 3.333333 | 0.347826 | 0.00013 | n | 0.9 | 3 |
55 | 77 | 0.25 | 2 | 5 | 15 | 3.333333 | 0.347826 | 0.00013 | n | 1.0 | 3 |
50 | 77 | 0.25 | 2 | 5 | 15 | 3.076923 | 0.347826 | 0.00013 | n | 0.9 | 4 |
56 | 77 | 0.25 | 2 | 5 | 15 | 3.076923 | 0.347826 | 0.00013 | n | 1.0 | 4 |
57 | 77 | 0.25 | 2 | 5 | 15 | 3.076923 | 0.347826 | 0.00013 | n | 1.0 | 5 |
58 | 77 | 0.25 | 2 | 5 | 15 | 2.857143 | 0.347826 | 0.00013 | n | 1.0 | 6 |
51 | 77 | 0.25 | 2 | 5 | 15 | 2.666667 | 0.347826 | 0.00013 | n | 0.9 | 5 |
52 | 77 | 0.25 | 2 | 5 | 15 | 2.666667 | 0.347826 | 0.00013 | n | 0.9 | 6 |
59 | 77 | 0.25 | 2 | 5 | 15 | 2.666667 | 0.347826 | 0.00013 | n | 1.0 | 7 |
53 | 77 | 0.25 | 2 | 5 | 15 | 2.105263 | 0.347826 | 0.00013 | n | 0.9 | 7 |
df_results2_pl = df_results2[['precision', 'recall', 'eps', 'min_samples']].copy()
# Clip precision to 1 (tp can overcount, see get_metrics_multi)
df_results2_pl.loc[df_results2_pl["precision"] > 1, "precision"] = 1
df_results2_pl.sort_values(['precision', 'recall'], ascending=[False, False])
(index) | precision | recall | eps | min_samples |
24 | 1.000000 | 0.739130 | 0.5 | 2 |
30 | 1.000000 | 0.739130 | 0.6 | 2 |
31 | 1.000000 | 0.739130 | 0.6 | 3 |
32 | 1.000000 | 0.739130 | 0.6 | 4 |
33 | 1.000000 | 0.739130 | 0.6 | 5 |
36 | 1.000000 | 0.608696 | 0.7 | 2 |
37 | 1.000000 | 0.608696 | 0.7 | 3 |
38 | 1.000000 | 0.608696 | 0.7 | 4 |
39 | 1.000000 | 0.608696 | 0.7 | 5 |
40 | 1.000000 | 0.608696 | 0.7 | 6 |
46 | 1.000000 | 0.608696 | 0.8 | 6 |
47 | 1.000000 | 0.608696 | 0.8 | 7 |
42 | 1.000000 | 0.478261 | 0.8 | 2 |
43 | 1.000000 | 0.478261 | 0.8 | 3 |
44 | 1.000000 | 0.478261 | 0.8 | 4 |
45 | 1.000000 | 0.478261 | 0.8 | 5 |
48 | 1.000000 | 0.347826 | 0.9 | 2 |
49 | 1.000000 | 0.347826 | 0.9 | 3 |
50 | 1.000000 | 0.347826 | 0.9 | 4 |
51 | 1.000000 | 0.347826 | 0.9 | 5 |
52 | 1.000000 | 0.347826 | 0.9 | 6 |
53 | 1.000000 | 0.347826 | 0.9 | 7 |
54 | 1.000000 | 0.347826 | 1.0 | 2 |
55 | 1.000000 | 0.347826 | 1.0 | 3 |
56 | 1.000000 | 0.347826 | 1.0 | 4 |
57 | 1.000000 | 0.347826 | 1.0 | 5 |
58 | 1.000000 | 0.347826 | 1.0 | 6 |
59 | 1.000000 | 0.347826 | 1.0 | 7 |
41 | 0.897436 | 0.608696 | 0.7 | 7 |
25 | 0.894737 | 0.739130 | 0.5 | 3 |
34 | 0.867347 | 0.739130 | 0.6 | 6 |
35 | 0.739130 | 0.739130 | 0.6 | 7 |
26 | 0.702479 | 0.739130 | 0.5 | 4 |
18 | 0.570470 | 0.739130 | 0.4 | 2 |
27 | 0.531250 | 0.739130 | 0.5 | 5 |
28 | 0.431472 | 0.739130 | 0.5 | 6 |
12 | 0.362319 | 0.869565 | 0.3 | 2 |
29 | 0.349794 | 0.739130 | 0.5 | 7 |
19 | 0.344130 | 0.739130 | 0.4 | 3 |
20 | 0.319489 | 0.869565 | 0.4 | 4 |
21 | 0.252525 | 0.869565 | 0.4 | 5 |
13 | 0.221239 | 0.869565 | 0.3 | 3 |
6 | 0.220751 | 0.869565 | 0.2 | 2 |
22 | 0.214592 | 0.869565 | 0.4 | 6 |
14 | 0.189456 | 1.000000 | 0.3 | 4 |
23 | 0.184502 | 0.869565 | 0.4 | 7 |
15 | 0.152722 | 1.000000 | 0.3 | 5 |
7 | 0.150721 | 1.000000 | 0.2 | 3 |
16 | 0.132948 | 1.000000 | 0.3 | 6 |
8 | 0.119295 | 1.000000 | 0.2 | 4 |
0 | 0.116162 | 1.000000 | 0.1 | 2 |
17 | 0.112414 | 1.000000 | 0.3 | 7 |
9 | 0.095594 | 1.000000 | 0.2 | 5 |
10 | 0.081100 | 1.000000 | 0.2 | 6 |
11 | 0.073248 | 1.000000 | 0.2 | 7 |
1 | 0.067488 | 1.000000 | 0.1 | 3 |
2 | 0.051271 | 1.000000 | 0.1 | 4 |
3 | 0.041910 | 1.000000 | 0.1 | 5 |
4 | 0.036073 | 1.000000 | 0.1 | 6 |
5 | 0.032643 | 1.000000 | 0.1 | 7 |
wave_lengths = [1, 15, 60]
noises = [0.5, 0.25, 0.1]
anomaly_counts = [1, 5, 10]
anomaly_normal_ratios = {1: 0.000025, 5: 0.00013, 10: 0.00024}
nsensors = [1, 2, 3]
for wave_length in wave_lengths:
    for noise in noises:
        for sensors in nsensors:
            for anomaly_count in anomaly_counts:
                anomaly_normal_ratio = anomaly_normal_ratios[anomaly_count]
                df_results = get_results(df1, noise, anomaly_count, anomaly_normal_ratio, wave_length, sensors, df_results, 0.5, 2)
df_results_pl = df_results.copy()
df_results_pl.loc[df_results_pl["precision"] > 1, "precision"] = 1
df_results_pl.sort_values(['precision', 'recall'], ascending=[False, False])
df_results_pl.to_csv("dbscan_multi_res.csv", index = False)
df_results_pl['precision'].mean()
0.8851496342284881
df_results_pl['recall'].mean()
0.8140926959487169
df_results3 = pd.DataFrame(columns=['data', 'noise', 'nsensors', 'anomaly count', 'wave len', 'precision', 'recall', 'anomaly_normal_ratio', 'noise direction', 'eps', 'min_samples'])
for wave_length in wave_lengths:
    for noise in noises:
        for sensors in nsensors:
            for anomaly_count in anomaly_counts:
                anomaly_normal_ratio = anomaly_normal_ratios[anomaly_count]
                df_results3 = get_results(df1, noise, anomaly_count, anomaly_normal_ratio, wave_length, sensors, df_results3, 0.6, 5)
df_results_pl3 = df_results3.copy()
df_results_pl3.loc[df_results_pl3["precision"] > 1, "precision"] = 1
df_results_pl3.sort_values(['precision', 'recall'], ascending=[False, False])
df_results_pl3['precision'].mean()
0.8307340734332637
df_results_pl3['recall'].mean()
0.8069424502574994
Test
df2 = df1.copy()
df2, anomaly_points2 = lib.preprocess.anomaly.apply_anomaly(
    df2, noise=0.5, anomaly_normal_ratio=0.00013, noise_direction='n',
    random_state=300, wave=True, wave_length=15, nsensors=3)
lib.preprocess.anomaly.plot_anomaly(df2, anomaly_points2)
anomaly_points2['Hid_emn_bas_act_binned'].append([42549, 42548, 42441, 42457, 42547, 42442, 42456, 42527, 42443,
42546, 42455, 42545, 42444, 42544, 42528, 42543, 42542, 42541,
42540, 42539, 42538, 42529, 42537, 42536, 42454, 42535, 42530,
42534, 42445, 42533, 42531, 42532, 42453, 42446, 42452, 42451,
42447, 42450, 42449, 42448])
[plot_anomaly output: one panel per sensor (Hid_emn_bas_act_binned, Hid_Pomp_Motor_Rulman_binned, Hid_Pomp_Balans_binned), each titled "5 ANOMALIES"]