Hung-Yi Lee Homework 10: Anomaly Detection
1. Introduction to Anomaly Detection
Anomaly detection: if an input x resembles the training data, the detector labels it "normal"; otherwise it labels it "anomalous".
Anomaly detection methods:
- Use a classifier's confidence score. [Although it looks simple, this actually works quite well; see the sketch right after this list.]
- Use an AutoEncoder. Train an autoencoder on normal data; when an input image is anomalous, the decoder's reconstruction differs more from the original image.
- Use K-means. Normal data lie closer to the center of their assigned cluster than anomalous data do.
- Use PCA. Compute the principal components of the training data, project the test data onto these components, reconstruct from the projections, and compare the reconstruction to the original with a squared error; normal data give smaller errors than anomalous data.
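Only the last three methods are implemented in section 2, so here is a minimal sketch of the confidence-score idea, assuming some already-trained classifier clf (a placeholder name, not part of the homework code):

import torch
import torch.nn.functional as F

def confidence_score(clf, x):
    # Use the max softmax probability as a "normality" score:
    # low confidence suggests x is unlike the training data.
    with torch.no_grad():
        logits = clf(x)                    # [batch, num_classes]
        probs = F.softmax(logits, dim=-1)  # class probabilities
        return probs.max(dim=-1).values    # high value = likely normal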
When judging how good an anomaly detection system is, accuracy is rarely used [anomalous and normal data are extremely imbalanced, so even a detector that barely works can still achieve very high accuracy]. AUC is generally used to measure an anomaly detection system instead, as in the sketch below.
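A minimal sketch of the AUC computation with scikit-learn, assuming a labeled validation split is available (labels and scores below are placeholder arrays, not produced by the homework code):

from sklearn.metrics import roc_auc_score

# labels: 1 = anomalous, 0 = normal
# scores: higher = more anomalous (e.g. a reconstruction error)
auc = roc_auc_score(labels, scores)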
2. Implementing Anomaly Detection
2.1 AutoEncoder approach
ae.py
import numpy as np
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
class fcn_autoencoder(nn.Module):
    def __init__(self):
        super(fcn_autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(32 * 32 * 3, 128),
            nn.ReLU(True),
            nn.Linear(128, 64),
            nn.ReLU(True),
            nn.Linear(64, 12),
            nn.ReLU(True),
            nn.Linear(12, 3))
        self.decoder = nn.Sequential(
            nn.Linear(3, 12),
            nn.ReLU(True),
            nn.Linear(12, 64),
            nn.ReLU(True),
            nn.Linear(64, 128),
            nn.ReLU(True),
            nn.Linear(128, 32 * 32 * 3),
            nn.Tanh())

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x
class conv_autoencoder(nn.Module):
    def __init__(self):
        super(conv_autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 12, 4, stride=2, padding=1),   # [batch, 12, 16, 16]
            nn.ReLU(),
            nn.Conv2d(12, 24, 4, stride=2, padding=1),  # [batch, 24, 8, 8]
            nn.ReLU(),
            nn.Conv2d(24, 48, 4, stride=2, padding=1),  # [batch, 48, 4, 4]
            nn.ReLU(),
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(48, 24, 4, stride=2, padding=1),  # [batch, 24, 8, 8]
            nn.ReLU(),
            nn.ConvTranspose2d(24, 12, 4, stride=2, padding=1),  # [batch, 12, 16, 16]
            nn.ReLU(),
            nn.ConvTranspose2d(12, 3, 4, stride=2, padding=1),   # [batch, 3, 32, 32]
            nn.Tanh(),
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x
class VAE(nn.Module):
    def __init__(self):
        super(VAE, self).__init__()
        self.fc1 = nn.Linear(32 * 32 * 3, 400)
        self.fc21 = nn.Linear(400, 20)  # mean of the latent Gaussian
        self.fc22 = nn.Linear(400, 20)  # log-variance of the latent Gaussian
        self.fc3 = nn.Linear(20, 400)
        self.fc4 = nn.Linear(400, 32 * 32 * 3)

    def encode(self, x):
        h1 = F.relu(self.fc1(x))
        return self.fc21(h1), self.fc22(h1)

    def reparametrize(self, mu, logvar):
        # Reparameterization trick: z = mu + std * eps, eps ~ N(0, I)
        std = logvar.mul(0.5).exp_()
        eps = torch.randn_like(std)
        return eps.mul(std).add_(mu)

    def decode(self, z):
        h3 = F.relu(self.fc3(z))
        return torch.sigmoid(self.fc4(h3))

    def forward(self, x):
        mu, logvar = self.encode(x)
        z = self.reparametrize(mu, logvar)
        return self.decode(z), mu, logvar


def loss_vae(recon_x, x, mu, logvar, criterion):
    mse = criterion(recon_x, x)  # reconstruction loss
    # KL divergence between N(mu, var) and N(0, I):
    # KLD = -0.5 * sum(1 + logvar - mu^2 - exp(logvar))
    KLD_element = mu.pow(2).add_(logvar.exp()).mul_(-1).add_(1).add_(logvar)
    KLD = torch.sum(KLD_element).mul_(-0.5)
    return mse + KLD
if __name__ == "__main__":
    train = np.load('train.npy', allow_pickle=True)
    test = np.load('test.npy', allow_pickle=True)

    num_epochs = 1000
    batch_size = 128
    learning_rate = 1e-3

    # {'fcn', 'cnn', 'vae'}
    model_type = 'cnn'

    x = train
    if model_type == 'fcn' or model_type == 'vae':
        # the fully-connected models expect flattened images
        x = x.reshape(len(x), -1)

    data = torch.tensor(x, dtype=torch.float)
    train_dataset = TensorDataset(data)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

    model_classes = {'fcn': fcn_autoencoder(), 'cnn': conv_autoencoder(), 'vae': VAE()}
    model = model_classes[model_type].cuda()
    criterion = nn.MSELoss()
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

    best_loss = np.inf
    model.train()
    for epoch in range(num_epochs):
        for data in train_dataloader:
            if model_type == 'cnn':
                # [batch, H, W, C] -> [batch, C, W, H] for Conv2d
                img = data[0].transpose(3, 1).cuda()
            else:
                img = data[0].cuda()
            # ===================forward=====================
            output = model(img)
            if model_type == 'vae':
                loss = loss_vae(output[0], img, output[1], output[2], criterion)
            else:
                loss = criterion(output, img)
            # ===================backward====================
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # ===================save====================
            # checkpoint whenever the (per-batch) loss improves
            if loss.item() < best_loss:
                best_loss = loss.item()
                torch.save(model, 'best_model_{}.pt'.format(model_type))
        # ===================log========================
        print('epoch [{}/{}], loss:{:.4f}'
              .format(epoch + 1, num_epochs, loss.item()))
ae_predict.py
Compute the squared error between each reconstructed image and the original image, and write the results to prediction.csv.
from ae import *

if __name__ == "__main__":
    train = np.load('train.npy', allow_pickle=True)
    test = np.load('test.npy', allow_pickle=True)

    batch_size = 128
    # {'fcn', 'cnn', 'vae'}
    model_type = 'cnn'

    if model_type == 'fcn' or model_type == 'vae':
        y = test.reshape(len(test), -1)
    else:
        y = test

    data = torch.tensor(y, dtype=torch.float)
    test_dataset = TensorDataset(data)
    test_sampler = SequentialSampler(test_dataset)
    test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size)

    model = torch.load('best_model_{}.pt'.format(model_type), map_location='cuda')
    model.eval()

    reconstructed = list()
    with torch.no_grad():
        for i, data in enumerate(test_dataloader):
            if model_type == 'cnn':
                img = data[0].transpose(3, 1).cuda()
            else:
                img = data[0].cuda()
            output = model(img)
            if model_type == 'cnn':
                # back to [batch, H, W, C] to match the original layout
                output = output.transpose(3, 1)
            elif model_type == 'vae':
                output = output[0]  # forward() returns (recon, mu, logvar)
            reconstructed.append(output.cpu().numpy())

    reconstructed = np.concatenate(reconstructed, axis=0)
    # per-image Euclidean distance between reconstruction and original
    anomality = np.sqrt(np.sum(np.square(reconstructed - y).reshape(len(y), -1), axis=1))
    y_pred = anomality

    with open('prediction.csv', 'w') as f:
        f.write('id,anomaly\n')
        for i in range(len(y_pred)):
            f.write('{},{}\n'.format(i + 1, y_pred[i]))
2.2 K-means approach
Assume the number of clusters is known to be 5 [n_clusters=5]. First run K-means to find 5 cluster centers in the training data, then assign each data point to a cluster, and finally compute each point's distance to its cluster center.
The y_pred obtained in the code is each point's distance to the center of the cluster it was assigned to. By fixing a distance threshold, anomalous data can be separated from normal data (see the sketch after the code).
import numpy as np
from sklearn.cluster import MiniBatchKMeans

if __name__ == "__main__":
    train = np.load('train.npy', allow_pickle=True)
    test = np.load('test.npy', allow_pickle=True)
    x = train.reshape(len(train), -1)
    y = test.reshape(len(test), -1)

    # fit 5 cluster centers on the training data
    kmeans_x = MiniBatchKMeans(n_clusters=5, batch_size=100).fit(x)
    # assign each test point to its nearest center
    y_cluster = kmeans_x.predict(y)
    # squared distance to the assigned center = anomaly score
    y_dist = np.sum(np.square(kmeans_x.cluster_centers_[y_cluster] - y), axis=1)
    y_pred = y_dist
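To turn these distances into hard labels, one plausible heuristic (an assumption, not part of the original homework) is to flag the largest few percent of scores; continuing inside the script above:

    # flag the top 5% largest distances as anomalies
    # (the 95th-percentile threshold is an assumption)
    threshold = np.percentile(y_pred, 95)
    is_anomaly = (y_pred > threshold).astype(int)

The raw scores themselves can also be written to prediction.csv exactly as in ae_predict.py.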
2.3 PCA approach
First compute the principal components of the training data, project the test data onto these components, reconstruct from the projections, and compute the reconstruction error between the reconstructed data and the original.
The y_pred obtained in the code is this reconstruction error for each test point (the code uses the Euclidean norm of the difference rather than a literal MSE). By fixing a threshold, anomalous data can be separated from normal data.
import numpy as np
from sklearn.decomposition import PCA

if __name__ == "__main__":
    train = np.load('train.npy', allow_pickle=True)
    test = np.load('test.npy', allow_pickle=True)
    x = train.reshape(len(train), -1)
    y = test.reshape(len(test), -1)

    # fit the principal components on the training data
    pca = PCA(n_components=2).fit(x)
    # project the test data and reconstruct from the projection
    y_projected = pca.transform(y)
    y_reconstructed = pca.inverse_transform(y_projected)
    # Euclidean norm of the reconstruction error = anomaly score
    dist = np.sqrt(np.sum(np.square(y_reconstructed - y).reshape(len(y), -1), axis=1))
    y_pred = dist
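Note that n_components=2 is a very aggressive compression for 32*32*3 images. A common alternative (an assumption here, not something the homework specifies) is to let PCA keep however many components explain a target fraction of the variance; continuing inside the script above:

    # keep the components that explain 90% of the variance
    # (the 0.9 target is an assumption)
    pca = PCA(n_components=0.9).fit(x)
    print(pca.n_components_)  # number of components actually kept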