2020吴恩达 machine learning 编程作业 python实现 ex8

# -*- coding: utf-8 -*-
"""
Created on Sat Jul  4 01:40:18 2020

@author: cheetah023
"""

import numpy as np
import scipy.io as sci
import matplotlib.pyplot as plt

#函数定义
def estimateGaussian(X):
    m,n = X.shape
    mu = np.zeros([n,1])
    sigma2 = np.zeros([n])
    mu = np.sum(X,axis=0).T / m
    for i in range(0,n):
        sigma2[i] = np.sum((X[:,i] - mu[i]) ** 2) / m
    return mu, sigma2
def multivariateGaussian(X, mu, sigma2):
    k = len(mu)
    m = X.shape[0]
    
    if np.ndim(sigma2) == 1:
        sigma2 = np.diag(sigma2)
    right = np.zeros([m,1])
    left = np.power(2*np.pi,-(k/2)) 
    left = left * np.power(np.linalg.det(sigma2),-(1/2))
    for i in range(0,m):
        right[i] = np.exp(-(1/2) * np.dot(np.dot((X[i] - mu).T,np.linalg.inv(sigma2)),X[i]-mu))
    p = left * right
    return p
def visualizeFit(X,  mu, sigma2):
    x1 = np.linspace(0,30,50)
    x2 = np.linspace(0,30,50)
    xx,yy = np.meshgrid(x1,x2)
    X_temp = np.column_stack([xx.flatten(),yy.flatten()])
    print('X_temp:',X_temp.shape)
    p = multivariateGaussian(X_temp, mu, sigma2)
    p = np.reshape(p,xx.shape)
    levels = [10**h for h in range(-20,0,3)]
    plt.contour(xx,yy,p,levels)
def selectThreshold(yval, pval):
    epsilons = np.linspace(min(pval),max(pval),1000)
    bestEpsilon = 0
    bestF1 = 0
    for epsilon in epsilons:
        tp=fp=fn=0
        for i in range(0,len(pval)):
            if pval[i] < epsilon and yval[i] == 1:
                tp += 1
            elif pval[i] < epsilon and yval[i] == 0:
                fp += 1
            elif pval[i] >= epsilon and yval[i] == 1:
                fn += 1
        if tp + fp:
            prec = tp / (tp + fp)
            rec = tp / (tp + fn)
            F1 = 2 * (prec * rec) / (prec + rec)
        else:
            F1 = 0
        if F1 > bestF1:
            bestEpsilon = epsilon
            bestF1 = F1
    return bestEpsilon,bestF1
#Part 1: Load Example Dataset
data = sci.loadmat('ex8data1.mat')
#print('data.keys',data.keys())
X = data['X']
Xval = data['Xval']
yval = data['yval']
#print('X:',X.shape)
#print('Xval:',Xval.shape)
#print('yval:',yval.shape)
plt.figure(0)
plt.scatter(X[:,0],X[:,1],marker='x',c='b')
plt.xlabel('Latency (ms)')
plt.ylabel('Throughput (mb/s)')

#Part 2: Estimate the dataset statistics
[mu,sigma2] = estimateGaussian(X)
p = multivariateGaussian(X, mu, sigma2)
visualizeFit(X,  mu, sigma2)

#Part 3: Find Outliers
pval = multivariateGaussian(Xval, mu, sigma2)
[epsilon, F1] = selectThreshold(yval, pval)
print('Best epsilon found using cross-validation:', epsilon)
print('Best F1 on Cross Validation Set:', F1)
print('(you should see a value epsilon of about 8.99e-05)')
print('(you should see a Best F1 value of  0.875000)\n')

outliers = np.where(p < epsilon)
plt.scatter(X[outliers[0],0],X[outliers[0],1],marker='o',edgecolor='r')

#Part 4: Multidimensional Outliers
data = sci.loadmat('ex8data2.mat')
#print('data.keys',data.keys())
X = data['X']
Xval = data['Xval']
yval = data['yval']
#print('X:',X.shape)
#print('Xval:',Xval.shape)
#print('yval:',yval.shape)
[mu, sigma2] = estimateGaussian(X)
p = multivariateGaussian(X, mu, sigma2)
pval = multivariateGaussian(Xval, mu, sigma2)
[epsilon, F1] = selectThreshold(yval, pval)
print('for ex8data2.mat:')
print('Best epsilon found using cross-validation:', epsilon)
print('Best F1 on Cross Validation Set:', F1)
print('(you should see a value epsilon of about 1.38e-18)')
print('(you should see a Best F1 value of 0.615385)')
print('# Outliers found:', sum(p < epsilon))



运行结果:

X_temp: (2500, 2)
Best epsilon found using cross-validation: [8.99985263e-05]
Best F1 on Cross Validation Set: 0.8750000000000001
(you should see a value epsilon of about 8.99e-05)
(you should see a Best F1 value of  0.875000)

for ex8data2.mat:
Best epsilon found using cross-validation: [1.3786075e-18]
Best F1 on Cross Validation Set: 0.6153846153846154
(you should see a value epsilon of about 1.38e-18)
(you should see a Best F1 value of 0.615385)
# Outliers found: [117]

总结:

1、在实现selectThreshold的时候本来想用矩阵的逻辑运算来实现,没成功。还是使用的两个for循环,不过看着还行逻辑比较清楚

2、在计算多变量高斯分布multivariateGaussian()时,使用的是这个公式

 

 

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值