用python解释异常检测模型

import numpy as np

In [2]:

def F1(predictions, y):

    TP = np.sum((predictions == 1) & (y == 1))
    FP = np.sum((predictions == 1) & (y == 0))
    FN = np.sum((predictions == 0) & (y == 1))
    if TP + FP == 0:
        precision = 0
    else:
        precision = float(TP) / (TP + FP)
    if TP + FN == 0:
        recall = 0
    else:
        recall = float(TP) / (TP + FN)
    if precision + recall == 0:
        return 0
    else:
        return (2.0 * precision * recall) / (precision + recall)

In [3]:

def gaussianModel(X):

    # 参数估计
    m, n = X.shape
    mu = np.mean(X, axis=0)
    sigma2 = np.var(X, axis=0)
    
    def p(x): # x是单个样本,n*1维
        total = 1
        for j in range(x.shape[0]):
            total *= np.exp(-np.power((x[j, 0] - mu[0, j]), 2) / (2 * sigma2[0, j])
                            ) / (np.sqrt(2 * np.pi * sigma2[0, j]))
        return total
    return p

In [20]:

x.shape

Out[20]:

(1, 11)

In [19]:

def multivariateGaussianModel(X):

    # 参数估计
    m, n = X.shape
    mu = np.mean(X.T, axis=1)
    Sigma = np.mat(np.cov(X.T))
    detSigma = np.linalg.det(Sigma)

    def p(x):
        x = x - mu
        return np.exp(-x.T * np.linalg.pinv(Sigma) * x / 2).A[0] * \
            ((2*np.pi)**(-n/2.0) * (detSigma**(-0.5) ))
    return p

In [5]:

def train(X, model=gaussianModel):
    return model(X) # 返回的是概率模型p

In [6]:

def selectEpsilon(XVal, yVal, p):
    
    pVal = np.mat([p(x.T) for x in XVal]).reshape(-1, 1) # 交叉验证集中所有样本的概率
    step = (np.max(pVal) - np.min(pVal)) / 1000.0
    
    bestEpsilon = 0
    bestF1 = 0
    
    for epsilon in np.arange(np.min(pVal), np.max(pVal), step):
        predictions = pVal < epsilon
        f1 = F1(predictions, yVal)
        if f1 > bestF1:
            bestF1 = f1
            bestEpsilon = epsilon
    return bestEpsilon, bestF1

In [7]:

%matplotlib inline

In [8]:

from scipy.io import loadmat
import matplotlib.pyplot as plt

In [9]:

plt.rcParams['font.sans-serif']=['SimHei']
plt.rcParams['axes.unicode_minus']=False

In [10]:

# 低维数据测试
data = loadmat('data/ex8data1.mat')
X = np.mat(data['X'])
XVal = np.mat(data['Xval'])
yVal = np.mat(data['yval'])

p = train(X)
#p = train(X, model=multivariateGaussianModel)
pTest = np.mat([p(x.T) for x in X]).reshape(-1, 1)

# 绘制数据点
plt.xlabel(u'延迟 (ms)')
plt.ylabel(u'吞吐 (mb/s)')
plt.plot(X[:, 0], X[:, 1], 'bx')
epsilon, f1 = selectEpsilon(XVal, yVal, p)

print u'基于交叉验证集最佳ε: %e\n'%epsilon
print u'基于交叉验证集最佳F1:  %f\n'%f1
print u'找到 %d 个异常点' % np.sum(pTest < epsilon)

# 获得训练集的异常点
outliers = np.where(pTest < epsilon, True, False).ravel()
plt.plot(X[outliers, 0], X[outliers, 1], 'ro', lw=2, markersize=10, fillstyle='none', markeredgewidth=1)
n = np.linspace(0, 35, 100)
X1 = np.meshgrid(n,n)
XFit = np.mat(np.column_stack((X1[0].T.flatten(), X1[1].T.flatten())))
pFit = np.mat([p(x.T) for x in XFit]).reshape(-1, 1)
pFit = pFit.reshape(X1[0].shape)

if not np.isinf(np.sum(pFit)):
    plt.contour(X1[0], X1[1], pFit, 10.0**np.arange(-20, 0, 3).T)
plt.show()
基于交叉验证集最佳ε: 8.990853e-05

基于交叉验证集最佳F1:  0.875000

找到 6 个异常点
/Users/sunkepeng/anaconda2/lib/python2.7/site-packages/matplotlib/font_manager.py:1331: UserWarning: findfont: Font family [u'sans-serif'] not found. Falling back to DejaVu Sans
  (prop.get_family(), self.defaultFamily[fontext]))

In [11]:

# 高维数据
data = loadmat('data/ex8data2.mat')
X = np.mat(data['X'])
XVal = np.mat(data['Xval'])
yVal = np.mat(data['yval'])

p = train(X)
#p = train(X, model=multivariateGaussianModel)
pTest = np.mat([p(x.T) for x in X]).reshape(-1, 1)

epsilon, f1 = selectEpsilon(XVal, yVal, p)

print 'Best epsilon found using cross-validation: %e\n'%epsilon
print 'Best F1 on Cross Validation Set:  %f\n'%f1
print '# Outliers found: %d' % np.sum(pTest < epsilon)
Best epsilon found using cross-validation: 1.377229e-18

Best F1 on Cross Validation Set:  0.615385

# Outliers found: 117
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值