# -*- coding: utf-8 -*-
"""
Created on Sat Jul 4 01:40:18 2020
@author: cheetah023
"""
import numpy as np
import scipy.io as sci
import matplotlib.pyplot as plt
#函数定义
def estimateGaussian(X):
m,n = X.shape
mu = np.zeros([n,1])
sigma2 = np.zeros([n])
mu = np.sum(X,axis=0).T / m
for i in range(0,n):
sigma2[i] = np.sum((X[:,i] - mu[i]) ** 2) / m
return mu, sigma2
def multivariateGaussian(X, mu, sigma2):
k = len(mu)
m = X.shape[0]
if np.ndim(sigma2) == 1:
sigma2 = np.diag(sigma2)
right = np.zeros([m,1])
left = np.power(2*np.pi,-(k/2))
left = left * np.power(np.linalg.det(sigma2),-(1/2))
for i in range(0,m):
right[i] = np.exp(-(1/2) * np.dot(np.dot((X[i] - mu).T,np.linalg.inv(sigma2)),X[i]-mu))
p = left * right
return p
def visualizeFit(X, mu, sigma2):
x1 = np.linspace(0,30,50)
x2 = np.linspace(0,30,50)
xx,yy = np.meshgrid(x1,x2)
X_temp = np.column_stack([xx.flatten(),yy.flatten()])
print('X_temp:',X_temp.shape)
p = multivariateGaussian(X_temp, mu, sigma2)
p = np.reshape(p,xx.shape)
levels = [10**h for h in range(-20,0,3)]
plt.contour(xx,yy,p,levels)
def selectThreshold(yval, pval):
epsilons = np.linspace(min(pval),max(pval),1000)
bestEpsilon = 0
bestF1 = 0
for epsilon in epsilons:
tp=fp=fn=0
for i in range(0,len(pval)):
if pval[i] < epsilon and yval[i] == 1:
tp += 1
elif pval[i] < epsilon and yval[i] == 0:
fp += 1
elif pval[i] >= epsilon and yval[i] == 1:
fn += 1
if tp + fp:
prec = tp / (tp + fp)
rec = tp / (tp + fn)
F1 = 2 * (prec * rec) / (prec + rec)
else:
F1 = 0
if F1 > bestF1:
bestEpsilon = epsilon
bestF1 = F1
return bestEpsilon,bestF1
#Part 1: Load Example Dataset
data = sci.loadmat('ex8data1.mat')
#print('data.keys',data.keys())
X = data['X']
Xval = data['Xval']
yval = data['yval']
#print('X:',X.shape)
#print('Xval:',Xval.shape)
#print('yval:',yval.shape)
plt.figure(0)
plt.scatter(X[:,0],X[:,1],marker='x',c='b')
plt.xlabel('Latency (ms)')
plt.ylabel('Throughput (mb/s)')
#Part 2: Estimate the dataset statistics
[mu,sigma2] = estimateGaussian(X)
p = multivariateGaussian(X, mu, sigma2)
visualizeFit(X, mu, sigma2)
#Part 3: Find Outliers
pval = multivariateGaussian(Xval, mu, sigma2)
[epsilon, F1] = selectThreshold(yval, pval)
print('Best epsilon found using cross-validation:', epsilon)
print('Best F1 on Cross Validation Set:', F1)
print('(you should see a value epsilon of about 8.99e-05)')
print('(you should see a Best F1 value of 0.875000)\n')
outliers = np.where(p < epsilon)
plt.scatter(X[outliers[0],0],X[outliers[0],1],marker='o',edgecolor='r')
#Part 4: Multidimensional Outliers
data = sci.loadmat('ex8data2.mat')
#print('data.keys',data.keys())
X = data['X']
Xval = data['Xval']
yval = data['yval']
#print('X:',X.shape)
#print('Xval:',Xval.shape)
#print('yval:',yval.shape)
[mu, sigma2] = estimateGaussian(X)
p = multivariateGaussian(X, mu, sigma2)
pval = multivariateGaussian(Xval, mu, sigma2)
[epsilon, F1] = selectThreshold(yval, pval)
print('for ex8data2.mat:')
print('Best epsilon found using cross-validation:', epsilon)
print('Best F1 on Cross Validation Set:', F1)
print('(you should see a value epsilon of about 1.38e-18)')
print('(you should see a Best F1 value of 0.615385)')
print('# Outliers found:', sum(p < epsilon))
运行结果:
X_temp: (2500, 2)
Best epsilon found using cross-validation: [8.99985263e-05]
Best F1 on Cross Validation Set: 0.8750000000000001
(you should see a value epsilon of about 8.99e-05)
(you should see a Best F1 value of 0.875000)
for ex8data2.mat:
Best epsilon found using cross-validation: [1.3786075e-18]
Best F1 on Cross Validation Set: 0.6153846153846154
(you should see a value epsilon of about 1.38e-18)
(you should see a Best F1 value of 0.615385)
# Outliers found: [117]
总结:
1、在实现selectThreshold的时候本来想用矩阵的逻辑运算来实现,没成功。还是使用的两个for循环,不过看着还行逻辑比较清楚
2、在计算多变量高斯分布multivariateGaussian()时,使用的是这个公式