吴恩达机器学习python作业之异常检测

参考链接:吴恩达|机器学习作业8.0.异常检测_学吧学吧终成学霸的博客-CSDN博客

任务一:对小数据集进行异常检测并可视化

任务二:对大数据集进行异常检测

任务一:对小数据集进行异常检测并可视化

在这里插入图片描述

numpy矩阵求最值、均值、方差、标准差、中值、求和,众数_王同学加油的博客-CSDN博客_矩阵的均值和方差

np.random.normal()详解_YL_python_C++_java的博客-CSDN博客_np.random.normal

python作图之plt.contour详解_你大佬来啦的博客-CSDN博客_plt.contour

方法一:使用原始模型

第一步:找出均值方差并进行可视化

在这里插入图片描述

import numpy as np
from matplotlib import pyplot as plt
from scipy import io

#1.读取数据集
dt = io.loadmat("E:\机器学习\吴恩达\data_sets\ex8data1.mat")
x_train = dt["X"] #(307, 2)
x_val = dt["Xval"] #(307, 2)
y_val = dt["yval"] #(307, 1)

#2.可视化数据集
"""plt.scatter(x_train[:,0],x_train[:,1],marker = "x")
plt.show()"""

#3.定义高斯分布函数与异常检测函数

#求矩阵X的平均值mu与方差sigma2
def estimateGaussian(X):
    #求均值mu
    mu = X.mean(axis = 0)
    #求方差sigma2
    sigma2 = X.var(axis = 0)
    return mu,sigma2

mu_result,sigma2_result = estimateGaussian(x_train)

#origin model
def calculateGaussian(X,mu,sigma2):
    #X的大小是(10000, 2)
    p = np.ones(X.shape[0])
    for i in range(mu.shape[0]):
        p_temp = 1 / (np.sqrt(2 * np.pi * sigma2[i])) * np.exp(-np.power((X[:,i] - mu[i]), 2)/(2*sigma2[i]))
        #p_temp是(10000, 2)
        p = p * p_temp
    return p



#4.可视化数据集与等高线
def dispalyData(X,mu,sigma2):
    a = np.linspace(0, 30, 100) #(100,)
    b = np.linspace(0, 30, 100)
    aa, bb = np.meshgrid(a, b) #(100, 100)
    p = calculateGaussian(np.vstack((aa.ravel(), bb.ravel())).T,mu,sigma2)
    p = p.reshape(aa.shape)
    levels = [10**h for h in range(-20,0,3)]
    #levels是一个包含高度值的一维数组,这样python便会画出传入的高度值对应的等高线。
    plt.scatter(X[:,0],X[:,1], marker="x")
    plt.contour(a,b,p,levels)



dispalyData(x_train,mu_result,sigma2_result)
plt.show()
第二步:根据验证集选择合适的阈值并找出异常点

在这里插入图片描述

import numpy as np
from matplotlib import pyplot as plt
from scipy import io

#1.读取数据集
dt = io.loadmat("E:\机器学习\吴恩达\data_sets\ex8data1.mat")
x_train = dt["X"] #(307, 2)
x_val = dt["Xval"] #(307, 2)
y_val = dt["yval"] #(307, 1)

#2.可视化数据集
"""plt.scatter(x_train[:,0],x_train[:,1],marker = "x")
plt.show()"""

#3.定义高斯分布函数与异常检测函数

#求矩阵X的平均值mu与方差sigma2
def estimateGaussian(X):
    #求均值mu
    mu = X.mean(axis = 0)
    #求方差sigma2
    sigma2 = X.var(axis = 0)
    return mu,sigma2

mu_result,sigma2_result = estimateGaussian(x_train)

#origin model
def calculateGaussian(X,mu,sigma2):
    #X的大小是(10000, 2)
    p = np.ones(X.shape[0])
    for i in range(mu.shape[0]):
        p_temp = 1 / (np.sqrt(2 * np.pi * sigma2[i])) * np.exp(-np.power((X[:,i] - mu[i]), 2)/(2*sigma2[i]))
        #p_temp是(10000, 2)
        p = p * p_temp
    return p



#可视化数据集与等高线
def dispalyData(X,mu,sigma2):
    a = np.linspace(0, 30, 100) #(100,)
    b = np.linspace(0, 30, 100)
    aa, bb = np.meshgrid(a, b) #(100, 100)
    p = calculateGaussian(np.vstack((aa.ravel(), bb.ravel())).T,mu,sigma2)
    p = p.reshape(aa.shape)
    levels = [10**h for h in range(-20,0,3)]
    #levels是一个包含高度值的一维数组,这样python便会画出传入的高度值对应的等高线。
    plt.scatter(X[:,0],X[:,1], marker="x",c="b")
    plt.contour(a,b,p,levels,linewidths=1)



"""dispalyData(x_train,mu_result,sigma2_result)
plt.show()"""

#--------------------------------------------------------------------------------------

#将表示概率的数值转换成正样本或者负样本
def convertP(p,epslon):
    m = len(p)
    p_result = np.zeros(m)
    for i in range(m):
        if p[i] <= epslon:
            p_result[i] = 1
        else:
            p_result[i] = 0
    return p_result

#计算F1
def calculateF1(pval,yval):
    m = len(pval)
    tp = 0
    for i in range(m):
        if yval[i] == 1 and pval[i] == 1:
            tp += 1

    prec = tp/np.sum(pval)
    rec = tp/np.sum(yval)
    F1 = 2*prec*rec/(prec+rec)
    return F1



#选出合适的阈值并返回对应的F1
def selectThreshold(yval,pval):
    """
    在众多eplison中选出F1最小的阈值
    :param yval: 验证集中的实际输出值
    :param pval: 使用异常检测函数得到的预测概率值
    :return: 返回合适的阈值与对应的F1
    """
    m = len(yval)
    F1_result = 0
    for epsilon in np.linspace(min(pval), max(pval), 1000):
    #for epsilon in thresholds:
        p_temp = convertP(pval,epsilon)
        F1_temp = calculateF1(p_temp,yval)
        if F1_temp > F1_result:
            F1_result = F1_temp
            eplison_result = epsilon
    return eplison_result,F1_result

#选择合适的阈值
p_temp = calculateGaussian(x_val,mu_result,sigma2_result)
eplison_result,F1_result = selectThreshold(y_val,p_temp) #8.999852631901393e-05 0.8750000000000001

#找出异常点
p_train = calculateGaussian(x_train,mu_result,sigma2_result)
outliers = np.array([x_train[i] for i in range(len(x_train)) if p_train[i] < eplison_result]) #概率小于阈值记为异常点

plt.scatter(x_train[:,0],x_train[:,1],c="b",marker = "x",s=10)
plt.scatter(outliers[:,0],outliers[:,1],facecolors='none', edgecolors='r', linewidths=2,marker="o")
dispalyData(x_train,mu_result,sigma2_result)
plt.show()

方法二:使用多元变量高斯函数

在这里插入图片描述

在这里插入图片描述

import numpy as np
from matplotlib import pyplot as plt
from scipy import io

#1.读取数据集
dt = io.loadmat("E:\机器学习\吴恩达\data_sets\ex8data1.mat")
x_train = dt["X"] #(307, 2)
x_val = dt["Xval"] #(307, 2)
y_val = dt["yval"] #(307, 1)

#2.可视化数据集
"""plt.scatter(x_train[:,0],x_train[:,1],marker = "x")
plt.show()"""

#3.定义高斯分布函数与异常检测函数

#求矩阵X的平均值mu与协方差矩阵sigma
def estimateGaussian(X):
    m = X.shape[0]
    #求均值mu
    mu = X.mean(axis = 0)
    #求sigma
    sigma = X.var(axis=0) 
    return mu,sigma


mu_result,sigma_result = estimateGaussian(x_train)

#多元变量高斯分布函数
def multiGaussian(X,mu,sigma):
    #X的大小是(307, 2)
    m = X.shape[0]
    n = X.shape[1]
    p = np.zeros(m)
    if np.ndim(sigma) == 1:
        sigma = np.diag(sigma)  # 对角阵
    sigma_temp = np.linalg.det(sigma)
    for i in range(m):
        matrix_temp = (X[i,:]-mu).T @ np.linalg.inv(sigma) @ (X[i,:]-mu) #(2,2)
        p[i] = 1/(np.power((2 * np.pi),(n/2)) * np.power(sigma_temp,0.5)) * np.exp(-0.5*matrix_temp)
    return p



#可视化数据集与等高线
def dispalyData(X,mu,sigma):
    a = np.linspace(0, 30, 100) #(100,)
    b = np.linspace(0, 30, 100)
    aa, bb = np.meshgrid(a, b) #(100, 100)
    p = multiGaussian(np.vstack((aa.ravel(), bb.ravel())).T,mu,sigma)
    p = p.reshape(aa.shape)
    levels = [10**h for h in range(-20,0,3)]
    #levels是一个包含高度值的一维数组,这样python便会画出传入的高度值对应的等高线。
    plt.scatter(X[:,0],X[:,1], marker="x",c="b")
    plt.contour(a,b,p,levels,linewidths=1)




#--------------------------------------------------------------------------------------

#将表示概率的数值转换成正样本或者负样本
def convertP(p,epslon):
    m = len(p)
    p_result = np.zeros(m)
    for i in range(m):
        if p[i] <= epslon:
            p_result[i] = 1
        else:
            p_result[i] = 0
    return p_result

#计算F1
def calculateF1(pval,yval):
    m = len(pval)
    tp = 0
    for i in range(m):
        if yval[i] == 1 and pval[i] == 1:
            tp += 1

    prec = tp/np.sum(pval)
    rec = tp/np.sum(yval)
    F1 = 2*prec*rec/(prec+rec)
    return F1



#选出合适的阈值并返回对应的F1
def selectThreshold(yval,pval):
    """
    在众多eplison中选出F1最小的阈值
    :param yval: 验证集中的实际输出值
    :param pval: 使用异常检测函数得到的预测概率值
    :return: 返回合适的阈值与对应的F1
    """
    m = len(yval)
    F1_result = 0
    for epsilon in np.linspace(min(pval), max(pval), 1001):
    #for epsilon in thresholds:
        p_temp = convertP(pval,epsilon)
        F1_temp = calculateF1(p_temp,yval)
        if F1_temp > F1_result:
            F1_result = F1_temp
            eplison_result = epsilon
    return eplison_result,F1_result


#选择合适的阈值
p_temp = multiGaussian(x_val,mu_result,sigma_result)
eplison_result,F1_result = selectThreshold(y_val,p_temp) #8.999852631901393e-05 0.8750000000000001
print(eplison_result,F1_result)

#找出异常点
p_train = multiGaussian(x_train,mu_result,sigma_result)
outliers = np.array([x_train[i] for i in range(len(x_train)) if p_train[i] < eplison_result]) #概率小于阈值记为异常点

plt.scatter(x_train[:,0],x_train[:,1],c="b",marker = "x",s=10)
plt.scatter(outliers[:,0],outliers[:,1],facecolors='none', edgecolors='r', linewidths=2,marker="o")
dispalyData(x_train,mu_result,sigma_result)
plt.show()

任务二:对大数据集进行异常检测

方法一:使用原始模型

在这里插入图片描述

import numpy as np
from scipy import io

#1.读取数据
dt = io.loadmat("E:\机器学习\吴恩达\data_sets\ex8data2.mat")
x_train = dt["X"] #(1000, 11)
x_val = dt["Xval"] #(100, 11)
y_val = dt["yval"] #(100, 1)

#2.定义异常检测函数

#求矩阵X的平均值mu与方差sigma2
def estimateGaussian(X):
    #求均值mu
    mu = X.mean(axis = 0)
    #求方差sigma2
    sigma2 = X.var(axis = 0)
    return mu,sigma2

#origin model
def calculateGaussian(X,mu,sigma2):
    #X的大小是(10000, 2)
    p = np.ones(X.shape[0])
    for i in range(mu.shape[0]):
        p_temp = 1 / (np.sqrt(2 * np.pi * sigma2[i])) * np.exp(-np.power((X[:,i] - mu[i]), 2)/(2*sigma2[i]))
        #p_temp是(10000, 2)
        p = p * p_temp
    return p

#将表示概率的数值转换成正样本或者负样本
def convertP(p,epslon):
    m = len(p)
    p_result = np.zeros(m)
    for i in range(m):
        if p[i] <= epslon:
            p_result[i] = 1
        else:
            p_result[i] = 0
    return p_result

#计算F1
def calculateF1(pval,yval):
    m = len(pval)
    tp = 0
    for i in range(m):
        if yval[i] == 1 and pval[i] == 1:
            tp += 1

    prec = tp/np.sum(pval)
    rec = tp/np.sum(yval)
    F1 = 2*prec*rec/(prec+rec)
    return F1


#选出合适的阈值并返回对应的F1
def selectThreshold(yval,pval):
    """
    在众多eplison中选出F1最小的阈值
    :param yval: 验证集中的实际输出值
    :param pval: 使用异常检测函数得到的预测概率值
    :return: 返回合适的阈值与对应的F1
    """
    m = len(yval)
    F1_result = 0
    #选择为1000个数时候,1.3786074982000235e-18 0.6153846153846154
    #选择为10000个数时候,1.3773671632195622e-19 0.7368421052631577
    for epsilon in np.linspace(min(pval), max(pval), 1000):
    #for epsilon in thresholds:
        p_temp = convertP(pval,epsilon)
        F1_temp = calculateF1(p_temp,yval)
        if F1_temp > F1_result:
            F1_result = F1_temp
            eplison_result = epsilon
    return eplison_result,F1_result

#3.调用函数得到合适的阈值与对应的F1
#选择合适的阈值
mu_result,sigma2_result = estimateGaussian(x_train)
p_temp = calculateGaussian(x_val,mu_result,sigma2_result)
eplison_result,F1_result = selectThreshold(y_val,p_temp) #8.999852631901393e-05 0.8750000000000001
print(eplison_result,F1_result)

#找出异常点
p_train = calculateGaussian(x_train,mu_result,sigma2_result)
outliers = np.array([x_train[i] for i in range(len(x_train)) if p_train[i] < eplison_result]) #概率小于阈值记为异常点
print(outliers.shape[0])

方法二:使用多元变量高斯函数

在这里插入图片描述

import numpy as np
from scipy import io

#1.读取数据
dt = io.loadmat("E:\机器学习\吴恩达\data_sets\ex8data2.mat")
x_train = dt["X"] #(1000, 11)
x_val = dt["Xval"] #(100, 11)
y_val = dt["yval"] #(100, 1)

#2.定义异常检测函数

#求矩阵X的平均值mu与协方差矩阵sigma
def estimateGaussian(X):
    m = X.shape[0]
    #求均值mu
    mu = X.mean(axis = 0)
    #求sigma
    sigma = X.var(axis=0)
    return mu,sigma


mu_result,sigma_result = estimateGaussian(x_train)

#多元变量高斯分布函数
def multiGaussian(X,mu,sigma):
    #X的大小是(307, 2)
    m = X.shape[0]
    n = X.shape[1]
    p = np.zeros(m)
    if np.ndim(sigma) == 1:
        sigma = np.diag(sigma)  # 对角阵
    sigma_temp = np.linalg.det(sigma)
    for i in range(m):
        matrix_temp = (X[i,:]-mu).T @ np.linalg.inv(sigma) @ (X[i,:]-mu) #(2,2)
        p[i] = 1/(np.power((2 * np.pi),(n/2)) * np.power(sigma_temp,0.5)) * np.exp(-0.5*matrix_temp)
    return p

#将表示概率的数值转换成正样本或者负样本
def convertP(p,epslon):
    m = len(p)
    p_result = np.zeros(m)
    for i in range(m):
        if p[i] <= epslon:
            p_result[i] = 1
        else:
            p_result[i] = 0
    return p_result

#计算F1
def calculateF1(pval,yval):
    m = len(pval)
    tp = 0
    for i in range(m):
        if yval[i] == 1 and pval[i] == 1:
            tp += 1

    prec = tp/np.sum(pval)
    rec = tp/np.sum(yval)
    F1 = 2*prec*rec/(prec+rec)
    return F1


#选出合适的阈值并返回对应的F1
def selectThreshold(yval,pval):
    """
    在众多eplison中选出F1最小的阈值
    :param yval: 验证集中的实际输出值
    :param pval: 使用异常检测函数得到的预测概率值
    :return: 返回合适的阈值与对应的F1
    """
    m = len(yval)
    F1_result = 0
    #选择为1000个数时候,1.3772288907613575e-18 0.6153846153846154 异常点个数为117
    #选择为10000个数时候,1.377367163219563e-19 0.7368421052631577 异常点个数为117
    for epsilon in np.linspace(min(pval), max(pval), 10000):
    #for epsilon in thresholds:
        p_temp = convertP(pval,epsilon)
        F1_temp = calculateF1(p_temp,yval)
        if F1_temp > F1_result:
            F1_result = F1_temp
            eplison_result = epsilon
    return eplison_result,F1_result

#3.调用函数得到合适的阈值与对应的F1
#选择合适的阈值
mu_result,sigma_result = estimateGaussian(x_train)
p_temp = multiGaussian(x_val,mu_result,sigma_result)
eplison_result,F1_result = selectThreshold(y_val,p_temp) #8.999852631901393e-05 0.8750000000000001
print(eplison_result,F1_result)

#找出异常点
p_train = multiGaussian(x_train,mu_result,sigma_result)
outliers = np.array([x_train[i] for i in range(len(x_train)) if p_train[i] < eplison_result]) #概率小于阈值记为异常点
print(outliers.shape[0])
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值