数据下载:
链接:https://pan.baidu.com/s/1ePMpmTBeoECBFFgPZQrPAg
提取码:qkg2
两维数据异常检测(使用高斯原始模型)
代码
import numpy as np
from scipy.io import loadmat
import matplotlib.pyplot as plt
mat = loadmat('data/ex8data1.mat')
print(mat.keys())
# dict_keys(['__header__', '__version__', '__globals__', 'X', 'Xval', 'yval'])
X = mat['X']
X_val, y_val = mat['X_val'], mat['y_val']
# print(X.shape, X_val.shape, y_val.shape)
# # ((307, 2), (307, 2), (307, 1))
m, n = X.shape
print(m, n)
plt.figure(figsize=(8, 5))
plt.scatter(X[:, 0], X[:, 1], edgecolors='b')
plt.show()
# 参数估计函数(就是求均值和方差)
def estimate_gaussian(X):
mu = np.mean(X, axis=0) # axis=0表示列,每列的均值
sigma2 = np.var(X, axis=0) # 求每列的方差
return mu, sigma2
def gaussian(x, mu, sigma2): # 高斯原始模型
p = np.mat(np.zeros((m, 1)))
p = 1/np.sqrt(2*np.pi*sigma2)*np.exp(-1/2*(x-mu)**2/sigma2)
return p
def select_threshold(y_val, p_val):
best_f1 = 0
best_epsilon = 0
for epsilon in np.linspace(np.min(p_val), np.max(p_val), 1001):
y_predict = np.zeros(y_val.shape)
y_predict[p_val[:, 0] < epsilon] = 1 # 把小于阈值的设为1(异常)
tp = np.sum(y_predict[y_val == 1]) # 真正例
precision = tp / np.sum(y_predict == 1)
recall = tp / np.sum(y_val == 1)
F1 = (2 * precision * recall) / (precision + recall)
if F1 > best_f1:
best_f1 = F1
best_epsilon = epsilon
return best_epsilon, best_f1
mu, sigma2 = estimate_gaussian(X_val)
p1 = gaussian(X_val[:, 0], mu[0], sigma2[0])
p2 = gaussian(X_val[:, 1], mu[1], sigma2[1])
p_val = np.mat(np.zeros((m, 1)))
for i in range(m):
p_val[i] = p1[i] * p2[i]
print(p_val)
best_epsilon, best_f1 = select_threshold(y_val, p_val)
print(best_epsilon, best_f1)
mu, sigma2 = estimate_gaussian(X)
p3 = gaussian(X[:, 0], mu[0], sigma2[0])
p4 = gaussian(X[:, 1], mu[1], sigma2[1])
p= p3 * p4
p = np.mat(np.zeros((m, 1)))
for i in range(m):
p[i] = p3[i] * p4[i]
xx = np.array([X[i] for i in range(m) if p[i] < best_epsilon])
print(xx)
print(len(xx))
plt.figure(figsize=(8, 5))
plt.scatter(X[:, 0], X[:, 1], edgecolors='b')
plt.scatter(xx[:, 0], xx[:, 1], edgecolors='r')
plt.show()
结果展示
原始数据集展示:
数据异常检测展示: