import numpy as np
In [2]:
def F1(predictions, y): TP = np.sum((predictions == 1) & (y == 1)) FP = np.sum((predictions == 1) & (y == 0)) FN = np.sum((predictions == 0) & (y == 1)) if TP + FP == 0: precision = 0 else: precision = float(TP) / (TP + FP) if TP + FN == 0: recall = 0 else: recall = float(TP) / (TP + FN) if precision + recall == 0: return 0 else: return (2.0 * precision * recall) / (precision + recall)
In [3]:
def gaussianModel(X): # 参数估计 m, n = X.shape mu = np.mean(X, axis=0) sigma2 = np.var(X, axis=0) def p(x): # x是单个样本,n*1维 total = 1 for j in range(x.shape[0]): total *= np.exp(-np.power((x[j, 0] - mu[0, j]), 2) / (2 * sigma2[0, j]) ) / (np.sqrt(2 * np.pi * sigma2[0, j])) return total return p
In [20]:
x.shape
Out[20]:
(1, 11)
In [19]:
def multivariateGaussianModel(X): # 参数估计 m, n = X.shape mu = np.mean(X.T, axis=1) Sigma = np.mat(np.cov(X.T)) detSigma = np.linalg.det(Sigma) def p(x): x = x - mu return np.exp(-x.T * np.linalg.pinv(Sigma) * x / 2).A[0] * \ ((2*np.pi)**(-n/2.0) * (detSigma**(-0.5) )) return p
In [5]:
def train(X, model=gaussianModel): return model(X) # 返回的是概率模型p
In [6]:
def selectEpsilon(XVal, yVal, p): pVal = np.mat([p(x.T) for x in XVal]).reshape(-1, 1) # 交叉验证集中所有样本的概率 step = (np.max(pVal) - np.min(pVal)) / 1000.0 bestEpsilon = 0 bestF1 = 0 for epsilon in np.arange(np.min(pVal), np.max(pVal), step): predictions = pVal < epsilon f1 = F1(predictions, yVal) if f1 > bestF1: bestF1 = f1 bestEpsilon = epsilon return bestEpsilon, bestF1
In [7]:
%matplotlib inline
In [8]:
from scipy.io import loadmat import matplotlib.pyplot as plt
In [9]:
plt.rcParams['font.sans-serif']=['SimHei'] plt.rcParams['axes.unicode_minus']=False
In [10]:
# 低维数据测试 data = loadmat('data/ex8data1.mat') X = np.mat(data['X']) XVal = np.mat(data['Xval']) yVal = np.mat(data['yval']) p = train(X) #p = train(X, model=multivariateGaussianModel) pTest = np.mat([p(x.T) for x in X]).reshape(-1, 1) # 绘制数据点 plt.xlabel(u'延迟 (ms)') plt.ylabel(u'吞吐 (mb/s)') plt.plot(X[:, 0], X[:, 1], 'bx') epsilon, f1 = selectEpsilon(XVal, yVal, p) print u'基于交叉验证集最佳ε: %e\n'%epsilon print u'基于交叉验证集最佳F1: %f\n'%f1 print u'找到 %d 个异常点' % np.sum(pTest < epsilon) # 获得训练集的异常点 outliers = np.where(pTest < epsilon, True, False).ravel() plt.plot(X[outliers, 0], X[outliers, 1], 'ro', lw=2, markersize=10, fillstyle='none', markeredgewidth=1) n = np.linspace(0, 35, 100) X1 = np.meshgrid(n,n) XFit = np.mat(np.column_stack((X1[0].T.flatten(), X1[1].T.flatten()))) pFit = np.mat([p(x.T) for x in XFit]).reshape(-1, 1) pFit = pFit.reshape(X1[0].shape) if not np.isinf(np.sum(pFit)): plt.contour(X1[0], X1[1], pFit, 10.0**np.arange(-20, 0, 3).T) plt.show()
基于交叉验证集最佳ε: 8.990853e-05 基于交叉验证集最佳F1: 0.875000 找到 6 个异常点
/Users/sunkepeng/anaconda2/lib/python2.7/site-packages/matplotlib/font_manager.py:1331: UserWarning: findfont: Font family [u'sans-serif'] not found. Falling back to DejaVu Sans (prop.get_family(), self.defaultFamily[fontext]))
In [11]:
# 高维数据 data = loadmat('data/ex8data2.mat') X = np.mat(data['X']) XVal = np.mat(data['Xval']) yVal = np.mat(data['yval']) p = train(X) #p = train(X, model=multivariateGaussianModel) pTest = np.mat([p(x.T) for x in X]).reshape(-1, 1) epsilon, f1 = selectEpsilon(XVal, yVal, p) print 'Best epsilon found using cross-validation: %e\n'%epsilon print 'Best F1 on Cross Validation Set: %f\n'%f1 print '# Outliers found: %d' % np.sum(pTest < epsilon)
Best epsilon found using cross-validation: 1.377229e-18 Best F1 on Cross Validation Set: 0.615385 # Outliers found: 117