Necessary Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
import seaborn as sns
Reading and Understanding the Data
# Read the CSV file into a DataFrame
sensor_df = pd.read_csv("sensor.csv")
sensor_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220320 entries, 0 to 220319
Data columns (total 55 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Unnamed: 0 220320 non-null int64
1 timestamp 220320 non-null object
2 sensor_00 210112 non-null float64
3 sensor_01 219951 non-null float64
4 sensor_02 220301 non-null float64
5 sensor_03 220301 non-null float64
6 sensor_04 220301 non-null float64
7 sensor_05 220301 non-null float64
8 sensor_06 215522 non-null float64
9 sensor_07 214869 non-null float64
10 sensor_08 215213 non-null float64
11 sensor_09 215725 non-null float64
12 sensor_10 220301 non-null float64
13 sensor_11 220301 non-null float64
14 sensor_12 220301 non-null float64
15 sensor_13 220301 non-null float64
16 sensor_14 220299 non-null float64
17 sensor_15 0 non-null float64
18 sensor_16 220289 non-null float64
19 sensor_17 220274 non-null float64
20 sensor_18 220274 non-null float64
21 sensor_19 220304 non-null float64
22 sensor_20 220304 non-null float64
23 sensor_21 220304 non-null float64
24 sensor_22 220279 non-null float64
25 sensor_23 220304 non-null float64
26 sensor_24 220304 non-null float64
27 sensor_25 220284 non-null float64
28 sensor_26 220300 non-null float64
29 sensor_27 220304 non-null float64
30 sensor_28 220304 non-null float64
31 sensor_29 220248 non-null float64
32 sensor_30 220059 non-null float64
33 sensor_31 220304 non-null float64
34 sensor_32 220252 non-null float64
35 sensor_33 220304 non-null float64
36 sensor_34 220304 non-null float64
37 sensor_35 220304 non-null float64
38 sensor_36 220304 non-null float64
39 sensor_37 220304 non-null float64
40 sensor_38 220293 non-null float64
41 sensor_39 220293 non-null float64
42 sensor_40 220293 non-null float64
43 sensor_41 220293 non-null float64
44 sensor_42 220293 non-null float64
45 sensor_43 220293 non-null float64
46 sensor_44 220293 non-null float64
47 sensor_45 220293 non-null float64
48 sensor_46 220293 non-null float64
49 sensor_47 220293 non-null float64
50 sensor_48 220293 non-null float64
51 sensor_49 220293 non-null float64
52 sensor_50 143303 non-null float64
53 sensor_51 204937 non-null float64
54 machine_status 220320 non-null object
dtypes: float64(52), int64(1), object(2)
memory usage: 92.5+ MB
sensor_df.head()
sensor_df.nunique()
Unnamed: 0 220320
timestamp 220320
sensor_00 1253
sensor_01 831
sensor_02 831
sensor_03 588
sensor_04 7844
sensor_05 190751
sensor_06 812
sensor_07 531
sensor_08 626
sensor_09 565
sensor_10 198804
sensor_11 196368
sensor_12 187145
sensor_13 191983
sensor_14 94564
sensor_15 0
sensor_16 110522
sensor_17 148000
sensor_18 152603
sensor_19 100422
sensor_20 92129
sensor_21 131083
sensor_22 126401
sensor_23 119286
sensor_24 133778
sensor_25 165999
sensor_26 179718
sensor_27 203198
sensor_28 189278
sensor_29 201908
sensor_30 2070
sensor_31 2685
sensor_32 205414
sensor_33 200855
sensor_34 203895
sensor_35 201781
sensor_36 201437
sensor_37 188898
sensor_38 602
sensor_39 909
sensor_40 878
sensor_41 594
sensor_42 699
sensor_43 713
sensor_44 628
sensor_45 675
sensor_46 845
sensor_47 621
sensor_48 1440
sensor_49 827
sensor_50 1135
sensor_51 1108
machine_status 3
dtype: int64
sensor_df['machine_status'].value_counts(dropna=False).head()
NORMAL 205836
RECOVERING 14477
BROKEN 7
Name: machine_status, dtype: int64
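Only seven BROKEN readings exist, so it is worth seeing when they occurred. A quick sketch (assuming the raw 'timestamp' column is still present at this point):
# Timestamps of the seven BROKEN events
print(sensor_df.loc[sensor_df['machine_status'] == 'BROKEN', 'timestamp'].to_list())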
Data Cleaning and Preprocessing
- Remove redundant columns
- Remove duplicates
- Handle missing values
- Convert columns to the correct data types
# Drop duplicates
sensor_df = sensor_df.drop_duplicates()
sensor_df.shape
(220320, 55)
# The entire "sensor_15" column is NaN, so we drop it; "Unnamed: 0" is just a redundant row index
del sensor_df['sensor_15']
del sensor_df['Unnamed: 0']
# Function that calculates the percentage of missing values per column
def calc_percent_NAs(df):
    nans = pd.DataFrame(df.isnull().sum().sort_values(ascending=False) / len(df), columns=['percent'])
    idx = nans['percent'] > 0
    nans['percent'] = nans['percent'].apply(lambda x: '{:.2%}'.format(x))
    return nans[idx]
calc_percent_NAs(sensor_df).head(15)
           percent
sensor_50   34.96%
sensor_51    6.98%
sensor_00    4.63%
sensor_07    2.47%
sensor_08    2.32%
sensor_06    2.18%
sensor_09    2.09%
sensor_01    0.17%
sensor_30    0.12%
sensor_29    0.03%
sensor_32    0.03%
sensor_17    0.02%
sensor_18    0.02%
sensor_22    0.02%
sensor_25    0.02%
# For the sensors with more than 2% missing values, we impute each column with its own mean; the remaining rows with NaNs we'll drop
for col in ['sensor_00', 'sensor_06', 'sensor_07', 'sensor_08', 'sensor_09', 'sensor_50', 'sensor_51']:
    sensor_df[col] = sensor_df[col].fillna(sensor_df[col].mean())
calc_percent_NAs(sensor_df).head(15)
           percent
sensor_01    0.17%
sensor_30    0.12%
sensor_29    0.03%
sensor_32    0.03%
sensor_18    0.02%
sensor_17    0.02%
sensor_22    0.02%
sensor_25    0.02%
sensor_16    0.01%
sensor_45    0.01%
sensor_47    0.01%
sensor_38    0.01%
sensor_48    0.01%
sensor_49    0.01%
sensor_39    0.01%
# Drop the remaining rows with missing values and switch to a datetime index
Cleaned_df = sensor_df.dropna().copy()
Cleaned_df['date'] = pd.to_datetime(Cleaned_df['timestamp'])
del Cleaned_df['timestamp']
Cleaned_df = Cleaned_df.set_index('date')
Cleaned_df.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 219521 entries, 2018-04-01 00:00:00 to 2018-08-31 23:59:00
Data columns (total 52 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 sensor_00 219521 non-null float64
1 sensor_01 219521 non-null float64
2 sensor_02 219521 non-null float64
3 sensor_03 219521 non-null float64
4 sensor_04 219521 non-null float64
5 sensor_05 219521 non-null float64
6 sensor_06 219521 non-null float64
7 sensor_07 219521 non-null float64
8 sensor_08 219521 non-null float64
9 sensor_09 219521 non-null float64
10 sensor_10 219521 non-null float64
11 sensor_11 219521 non-null float64
12 sensor_12 219521 non-null float64
13 sensor_13 219521 non-null float64
14 sensor_14 219521 non-null float64
15 sensor_16 219521 non-null float64
16 sensor_17 219521 non-null float64
17 sensor_18 219521 non-null float64
18 sensor_19 219521 non-null float64
19 sensor_20 219521 non-null float64
20 sensor_21 219521 non-null float64
21 sensor_22 219521 non-null float64
22 sensor_23 219521 non-null float64
23 sensor_24 219521 non-null float64
24 sensor_25 219521 non-null float64
25 sensor_26 219521 non-null float64
26 sensor_27 219521 non-null float64
27 sensor_28 219521 non-null float64
28 sensor_29 219521 non-null float64
29 sensor_30 219521 non-null float64
30 sensor_31 219521 non-null float64
31 sensor_32 219521 non-null float64
32 sensor_33 219521 non-null float64
33 sensor_34 219521 non-null float64
34 sensor_35 219521 non-null float64
35 sensor_36 219521 non-null float64
36 sensor_37 219521 non-null float64
37 sensor_38 219521 non-null float64
38 sensor_39 219521 non-null float64
39 sensor_40 219521 non-null float64
40 sensor_41 219521 non-null float64
41 sensor_42 219521 non-null float64
42 sensor_43 219521 non-null float64
43 sensor_44 219521 non-null float64
44 sensor_45 219521 non-null float64
45 sensor_46 219521 non-null float64
46 sensor_47 219521 non-null float64
47 sensor_48 219521 non-null float64
48 sensor_49 219521 non-null float64
49 sensor_50 219521 non-null float64
50 sensor_51 219521 non-null float64
51 machine_status 219521 non-null object
dtypes: float64(51), object(1)
memory usage: 88.8+ MB
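Since the later sections treat the data as a time series, it is worth checking how regular the 1-minute sampling cadence remains after dropping rows. A minimal sketch:
# Distribution of gaps between consecutive timestamps; anything other than
# 1 minute is a gap introduced by the dropped rows
print(Cleaned_df.index.to_series().diff().value_counts().head())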
Exploratory Data Analysis
Overview of the relationship between the sensors
# Correlation between sensors
# We use only the lower triangle of the correlation matrix to avoid redundancy
corr = Cleaned_df.corr(numeric_only=True)
df_corr_lt = corr.where(np.tril(np.ones(corr.shape), k=-1).astype(bool))
df_corr_lt = df_corr_lt.round(2)
plt.figure(figsize=(30, 30))
sns.heatmap(df_corr_lt, cmap="Spectral", vmin=-1.0, vmax=1.0, annot=True)
plt.title("Visualizing the correlation between all the sensors")
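To make the 51-by-51 heatmap easier to act on, a short sketch that lists the most strongly correlated sensor pairs from the lower-triangle matrix just computed (the 0.9 cutoff is an arbitrary choice):
# Sensor pairs with |correlation| above 0.9, strongest first
high_corr = df_corr_lt.stack()
high_corr = high_corr[high_corr.abs() > 0.9].sort_values(key=abs, ascending=False)
print(high_corr.head(10))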
Cleaned_df.shape
(219521, 52)
Now let's check the distribution of the values to see whether they are all on the same scale; if not, we will standardize them before applying PCA.
# Histograms of all numeric columns
Cleaned_df.hist(bins=25, layout=(8, 8), figsize=(30, 30))
plt.show()
Variable Reduction and Scaling
# Drop the target column; PCA is fit on the standardized sensor readings
x = Cleaned_df.drop(['machine_status'], axis=1)
scaler = StandardScaler()
pca = PCA()
pipeline = make_pipeline(scaler, pca)
pipeline.fit(x)
Pipeline(steps=[('standardscaler', StandardScaler()), ('pca', PCA())])
features = range(pca.n_components_)
_ = plt.figure(figsize=(22, 5))
_ = plt.bar(features, pca.explained_variance_)
_ = plt.xlabel('PCA feature')
_ = plt.ylabel('Variance')
_ = plt.xticks(features)
_ = plt.title("Importance of the Principal Components")
plt.show()
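The bar chart above shows raw variances; to back the 75% claim below, a quick check of the cumulative explained-variance ratio from the fitted pipeline (a sketch):
# Cumulative share of variance explained by the first components
cum_ratio = np.cumsum(pipeline.named_steps['pca'].explained_variance_ratio_)
print(f"First 2 components: {cum_ratio[1]:.1%} of total variance")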
# 2 components represent ~75% of the variance
# Calculate PCA with 2 components (note: unlike the scree plot above, this PCA is fit on the unstandardized features)
pca = PCA(n_components=2)
MainComponents = pca.fit_transform(x)
MainDF = pd.DataFrame(data=MainComponents, columns=['pc1', 'pc2'])
Cleaned_df['pc1'] = pd.Series(MainDF['pc1'].values, index=Cleaned_df.index)
Cleaned_df['pc2'] = pd.Series(MainDF['pc2'].values, index=Cleaned_df.index)
Cleaned_df.info()
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 219521 entries, 2018-04-01 00:00:00 to 2018-08-31 23:59:00
Data columns (total 54 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 sensor_00 219521 non-null float64
1 sensor_01 219521 non-null float64
2 sensor_02 219521 non-null float64
3 sensor_03 219521 non-null float64
4 sensor_04 219521 non-null float64
5 sensor_05 219521 non-null float64
6 sensor_06 219521 non-null float64
7 sensor_07 219521 non-null float64
8 sensor_08 219521 non-null float64
9 sensor_09 219521 non-null float64
10 sensor_10 219521 non-null float64
11 sensor_11 219521 non-null float64
12 sensor_12 219521 non-null float64
13 sensor_13 219521 non-null float64
14 sensor_14 219521 non-null float64
15 sensor_16 219521 non-null float64
16 sensor_17 219521 non-null float64
17 sensor_18 219521 non-null float64
18 sensor_19 219521 non-null float64
19 sensor_20 219521 non-null float64
20 sensor_21 219521 non-null float64
21 sensor_22 219521 non-null float64
22 sensor_23 219521 non-null float64
23 sensor_24 219521 non-null float64
24 sensor_25 219521 non-null float64
25 sensor_26 219521 non-null float64
26 sensor_27 219521 non-null float64
27 sensor_28 219521 non-null float64
28 sensor_29 219521 non-null float64
29 sensor_30 219521 non-null float64
30 sensor_31 219521 non-null float64
31 sensor_32 219521 non-null float64
32 sensor_33 219521 non-null float64
33 sensor_34 219521 non-null float64
34 sensor_35 219521 non-null float64
35 sensor_36 219521 non-null float64
36 sensor_37 219521 non-null float64
37 sensor_38 219521 non-null float64
38 sensor_39 219521 non-null float64
39 sensor_40 219521 non-null float64
40 sensor_41 219521 non-null float64
41 sensor_42 219521 non-null float64
42 sensor_43 219521 non-null float64
43 sensor_44 219521 non-null float64
44 sensor_45 219521 non-null float64
45 sensor_46 219521 non-null float64
46 sensor_47 219521 non-null float64
47 sensor_48 219521 non-null float64
48 sensor_49 219521 non-null float64
49 sensor_50 219521 non-null float64
50 sensor_51 219521 non-null float64
51 machine_status 219521 non-null object
52 pc1 219521 non-null float64
53 pc2 219521 non-null float64
dtypes: float64(53), object(1)
memory usage: 92.1+ MB
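With both components attached to the DataFrame, a quick visual check (a sketch; the color choices are arbitrary) of whether the machine states separate in PCA space:
# Scatter of pc1 vs. pc2, colored by machine status
plt.figure(figsize=(10, 6))
for status, color in [('NORMAL', 'tab:blue'), ('RECOVERING', 'tab:orange'), ('BROKEN', 'tab:red')]:
    subset = Cleaned_df[Cleaned_df['machine_status'] == status]
    plt.scatter(subset['pc1'], subset['pc2'], s=2, c=color, label=status)
plt.xlabel('pc1')
plt.ylabel('pc2')
plt.legend()
plt.show()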
Check Stationarity and Autocorrelation
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
# Function to perform the Augmented Dickey-Fuller test
def test_stationarity(timeseries):
    print('Results of Dickey-Fuller Test:')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4], index=['Test Statistic', 'p-value', '#Lags Used', 'Number of Observations Used'])
    for key, value in dftest[4].items():
        dfoutput['Critical Value (%s)' % key] = value
    print(dfoutput)
# Test stationarity for pc1 and pc2
test_stationarity(Cleaned_df['pc1'])
test_stationarity(Cleaned_df['pc2'])
Results of Dickey-Fuller Test:
Test Statistic -4.799371
p-value 0.000055
#Lags Used 82.000000
Number of Observations Used 219438.000000
Critical Value (1%) -3.430380
Critical Value (5%) -2.861553
Critical Value (10%) -2.566777
dtype: float64
For pc1, the p-value (0.000055) is far below 0.05 and the test statistic lies below even the 1% critical value, so we reject the unit-root null hypothesis: the component can be treated as stationary.
Results of Dickey-Fuller Test:
# Plot the autocorrelation for pc1 and pc2
# Number of PCA variables
num_pca_vars = 2
# Setting up the figure for plotting
fig, axes = plt.subplots(nrows=1, ncols=num_pca_vars, figsize=(12, 6))  # Adjust the size as needed
# Iterate through each PCA variable
for i in range(num_pca_vars):
    pca_var = f'pc{i+1}'
    pca_data = MainDF[pca_var].pct_change()
    # Plotting the autocorrelation of the differenced component
    sm.graphics.tsa.plot_acf(pca_data.dropna(), lags=40, alpha=0.05, ax=axes[i])
    axes[i].set_title(f'Autocorrelation for {pca_var}')
plt.tight_layout()
plt.show()
Cleaned_df['machine_status'].value_counts(dropna=False).head()
NORMAL 205067
RECOVERING 14447
BROKEN 7
Name: machine_status, dtype: int64
Cleaned_df['new_status'] = Cleaned_df['machine_status'].apply(lambda x: 'Normal' if x == 'NORMAL' else 'Anomaly')
Cleaned_df['new_status'].value_counts(dropna=False).head()
Normal 205067
Anomaly 14454
Name: new_status, dtype: int64
Statistical Modeling
Base Model: Detect Outliers Using the Interquartile Range (IQR)
# outlier_lower = Q1 - (1.5*IQR)
# outlier_upper = Q3 + (1.5*IQR)
# Calculate outlier bounds for pc1
q1_pc1, q3_pc1 = Cleaned_df['pc1'].quantile([0.25, 0.75])
iqr_pc1 = q3_pc1 - q1_pc1
lower_pc1 = q1_pc1 - (1.5*iqr_pc1)
upper_pc1 = q3_pc1 + (1.5*iqr_pc1)
# Calculate outlier bounds for pc2
q1_pc2, q3_pc2 = Cleaned_df['pc2'].quantile([0.25, 0.75])
iqr_pc2 = q3_pc2 - q1_pc2
lower_pc2 = q1_pc2 - (1.5*iqr_pc2)
upper_pc2 = q3_pc2 + (1.5*iqr_pc2)
print("(", lower_pc1,"," , upper_pc1,")")
print("(", lower_pc2,"," , upper_pc2,")")
( -736.3329463390996 , 222.73590195336766 )
( -738.2364757059686 , 683.9742481802032 )
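A quick plot (a sketch) of pc1 over time against its bounds makes it easier to see where the flagged outliers cluster:
# pc1 over time with the IQR outlier bounds
plt.figure(figsize=(18, 4))
plt.plot(Cleaned_df.index, Cleaned_df['pc1'], linewidth=0.5)
plt.axhline(lower_pc1, color='r', linestyle='--')
plt.axhline(upper_pc1, color='r', linestyle='--')
plt.title('pc1 with IQR outlier bounds')
plt.show()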
# Creating the anomaly columns for each pca variable
Cleaned_df['anomaly_pc1_IQR'] = ((Cleaned_df['pc1']>upper_pc1) | (Cleaned_df['pc1']<lower_pc1)).astype('int')
Cleaned_df['anomaly_pc2_IQR'] = ((Cleaned_df['pc2']>upper_pc2) | (Cleaned_df['pc2']<lower_pc2)).astype('int')
# Replacing the integer values with strings
Cleaned_df['anomaly_pc1_IQR'] = Cleaned_df['anomaly_pc1_IQR'].map({0: 'Normal', 1: 'Anomaly'})
Cleaned_df['anomaly_pc2_IQR'] = Cleaned_df['anomaly_pc2_IQR'].map({0: 'Normal', 1: 'Anomaly'})
# Now printing the value counts
anomaly_pc1_counts = Cleaned_df['anomaly_pc1_IQR'].value_counts()
anomaly_pc2_counts = Cleaned_df['anomaly_pc2_IQR'].value_counts()
print(anomaly_pc1_counts)
print(anomaly_pc2_counts)
Normal 189644
Anomaly 29877
Name: anomaly_pc1_IQR, dtype: int64
Normal 217947
Anomaly 1574
Name: anomaly_pc2_IQR, dtype: int64
# Creating a combined column: a row is an anomaly if either component flags it
Cleaned_df['combined_anomalies_IQR'] = np.where(
    (Cleaned_df['anomaly_pc1_IQR'] == 'Anomaly') | (Cleaned_df['anomaly_pc2_IQR'] == 'Anomaly'),
    'Anomaly', 'Normal')
# Now printing the value counts for the combined status
combined_anomaly_counts = Cleaned_df['combined_anomalies_IQR'].value_counts()
print(combined_anomaly_counts)
Normal 188836
Anomaly 30685
Name: combined_anomalies_IQR, dtype: int64
from sklearn.metrics import precision_score, recall_score, f1_score
# Getting the true and predicted labels
true_labels = Cleaned_df['new_status']
predicted_labels = Cleaned_df['combined_anomalies_IQR']
# Generating the confusion matrix
conf_matrix_IQR = confusion_matrix(true_labels, predicted_labels, labels=['Normal', 'Anomaly'])
TN = conf_matrix_IQR[0][0]
TP = conf_matrix_IQR[1][1]
FN = conf_matrix_IQR[1][0]
FP = conf_matrix_IQR[0][1]
# Calculating per-class accuracy (the recall within each true class)
accuracy_normal = TN / (TN + FP) * 100
accuracy_anomaly = TP / (TP + FN) * 100
# Calculating precision and recall
precision = precision_score(true_labels, predicted_labels, pos_label='Anomaly')
recall = recall_score(true_labels, predicted_labels, pos_label='Anomaly')
# Calculating F1 score
f1 = f1_score(true_labels, predicted_labels, pos_label='Anomaly')
print("Confusion Matrix:\n", conf_matrix_IQR)
print(f"F1 Score: {f1:.2f}")
print(f"Accuracy for class 'Normal': {accuracy_normal:.2f}%")
print(f"Accuracy for class 'Anomaly': {accuracy_anomaly:.2f}%")
Confusion Matrix:
 [[175837  29230]
 [ 12999   1455]]
F1 Score: 0.06
Accuracy for class 'Normal': 85.75%
Accuracy for class 'Anomaly': 10.07%
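classification_report, imported at the top, condenses precision, recall, and F1 per class into a single call; a quick sketch for this baseline:
# Per-class precision/recall/F1 for the IQR baseline
print(classification_report(true_labels, predicted_labels, labels=['Normal', 'Anomaly']))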
Model 2: Logistic Regression
from sklearn.preprocessing import LabelEncoder
# Select only the PCA features (pc1 and pc2) and the target variable
X = Cleaned_df[['pc1', 'pc2']]  # Features
y = Cleaned_df['new_status']    # Target variable
# Encode the target variable (alphabetical order: 'Anomaly' -> 0, 'Normal' -> 1)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
# We evaluate with 7-fold cross-validation on the full data set below, so no separate hold-out split is needed
from sklearn.model_selection import cross_val_predict
# Create a pipeline that first scales the data and then applies logistic regression
pipeline = make_pipeline(StandardScaler(), LogisticRegression(max_iter=5000, class_weight='balanced'))
# Perform 7-fold cross-validation and get predictions
y_pred_L = cross_val_predict(pipeline, X, y_encoded, cv=7)
# Calculate and print the confusion matrix
cm_LR = confusion_matrix(y_encoded, y_pred_L)
print("Confusion Matrix:\n", cm_LR)
# Displaying the results
class_accuracy_LR = 100 * cm_LR.diagonal() / cm_LR.sum(axis=1)
class_labels = label_encoder.classes_
for i, accuracy in enumerate(class_accuracy_LR):
print(f"Accuracy for class '{class_labels[i]}' ({i}): {accuracy:.2f}%")
Confusion Matrix:
[[ 11755 2699]
[ 52099 152968]]
Accuracy for class 'Anomaly' (0): 81.33%
Accuracy for class 'Normal' (1): 74.59%
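Given the class imbalance, the precision-recall view (using the helpers already imported at the top) says more than raw accuracy; a sketch, treating the encoded class 0 ('Anomaly') as the positive label:
# Cross-validated anomaly probabilities and the precision-recall curve
proba = cross_val_predict(pipeline, X, y_encoded, cv=7, method='predict_proba')[:, 0]
prec, rec, _ = precision_recall_curve(y_encoded == 0, proba)
ap = average_precision_score(y_encoded == 0, proba)
plt.plot(rec, prec, label=f'AP = {ap:.2f}')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend()
plt.show()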
Model 3: Random Forest
from sklearn.ensemble import RandomForestClassifier
# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# Initialize and train the Random Forest model
# Adjust n_estimators, max_depth etc. as needed
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
y_pred_RF = cross_val_predict(rf_model, X_scaled, y_encoded, cv=7)
# Calculate and print the confusion matrix
cm_RF = confusion_matrix(y_encoded, y_pred_RF)
print("Confusion Matrix:\n", cm_RF)
# Displaying the results
class_accuracy_RF = 100 * cm_RF.diagonal() / cm_RF.sum(axis=1)
class_labels = label_encoder.classes_
for i, accuracy in enumerate(class_accuracy_RF):
print(f"Accuracy for class '{class_labels[i]}' ({i}): {accuracy:.2f}%")
Confusion Matrix:
[[ 5802 8652]
[ 16422 188645]]
Accuracy for class 'Anomaly' (0): 40.14%
Accuracy for class 'Normal' (1): 91.99%
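The anomaly recall is much weaker than the logistic regression's. One variation worth trying is the same class_weight='balanced' option used above, which reweights the minority class (a sketch; results will differ from those shown):
# Random Forest with balanced class weights to counter the imbalance
rf_balanced = RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42)
y_pred_RF_bal = cross_val_predict(rf_balanced, X_scaled, y_encoded, cv=7)
print(confusion_matrix(y_encoded, y_pred_RF_bal))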
Model 4: Multivariate Gaussian Distribution
from scipy.stats import multivariate_normal
# Fit a multivariate Gaussian to the data
def estimate_gaussian(dataset):
    mu = np.mean(dataset, axis=0)
    sigma = np.cov(dataset.T)
    return mu, sigma

# Evaluate the density of each point under the fitted Gaussian
def multivariate_gaussian(dataset, mu, sigma):
    p = multivariate_normal(mean=mu, cov=sigma)
    return p.pdf(dataset)

# Scan 1000 candidate thresholds between the smallest and largest density and
# keep the epsilon that maximizes the F1 score
def select_threshold(probs, true_labels):
    best_epsilon = 0
    best_f1 = 0
    stepsize = (max(probs) - min(probs)) / 1000
    epsilons = np.arange(min(probs), max(probs), stepsize)
    for epsilon in np.nditer(epsilons):
        predictions = (probs < epsilon)
        f = f1_score(true_labels, predictions, average='binary')
        if f > best_f1:
            best_f1 = f
            best_epsilon = epsilon
    return best_f1, best_epsilon
# To run this model we include a few more PCA variables to get better results.
# PCA(0.9) keeps as many components as needed to explain 90% of the variance (five, on this data)
pca2 = PCA(0.9)
principalComponents2 = pca2.fit_transform(x)
principalDf2 = pd.DataFrame(data=principalComponents2, columns=['pc1', 'pc2', 'pc3', 'pc4', 'pc5'])
principalDf2['machine_status'] = pd.Series(Cleaned_df['machine_status'].values, index=principalDf2.index)
principalDf2['anomaly'] = np.where(principalDf2['machine_status'] == 'NORMAL', False, True)
X_train_MGD, X_test_MGD, y_train_MGD, y_test_MGD = train_test_split(
    principalDf2.iloc[:, 0:5], principalDf2['anomaly'], test_size=0.20, random_state=42)
mu, sigma = estimate_gaussian(X_train_MGD)
p_tr = multivariate_gaussian(X_train_MGD,mu,sigma)
p_ts = multivariate_gaussian(X_test_MGD,mu,sigma)
fscore, ep = select_threshold(p_ts,y_test_MGD)
print(fscore, ep)
0.6024551463644947 1.7111337393427953e-17
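To see how separable the two classes are around the selected epsilon, a sketch plotting the log-densities of the test points by true label (the tiny constant only guards against log(0)):
# Log-density distributions for normal vs. anomalous test points
plt.hist(np.log(p_ts[~y_test_MGD.values] + 1e-300), bins=100, alpha=0.5, label='Normal')
plt.hist(np.log(p_ts[y_test_MGD.values] + 1e-300), bins=100, alpha=0.5, label='Anomaly')
plt.axvline(np.log(ep), color='r', linestyle='--', label='epsilon')
plt.legend()
plt.show()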
p_all = multivariate_gaussian(principalDf2.iloc[:,0:5],mu,sigma)
principalDf2['anomaly2'] = (p_all < ep).astype(int)
principalDf2['anomaly2'].value_counts()
0 202033
1 17488
Name: anomaly2, dtype: int64
# Generate the confusion matrix
cm_MGD = confusion_matrix(principalDf2['anomaly'], principalDf2['anomaly2'])
# True Negative (TN), False Positive (FP), False Negative (FN), True Positive (TP)
TN, FP, FN, TP = cm_MGD.ravel()
# Calculate accuracy for each class
accuracy_normal = TN / (TN + FP) # Accuracy for class 'Normal' (0)
accuracy_anomaly = TP / (TP + FN) # Accuracy for class 'Anomaly' (1)
# Print the results
print("Confusion Matrix:\n", cm_MGD)
print(f"Accuracy for class 'Normal' (0): {accuracy_normal * 100:.2f}%")
print(f"Accuracy for class 'Anomaly' (1): {accuracy_anomaly * 100:.2f}%")
Confusion Matrix:
[[197174 7893]
[ 4859 9595]]
Accuracy for class 'Normal' (0): 96.15%
Accuracy for class 'Anomaly' (1): 66.38%