参考链接:吴恩达|机器学习作业8.0.异常检测_学吧学吧终成学霸的博客-CSDN博客
任务一:对小数据集进行异常检测并可视化
任务二:对大数据集进行异常检测
任务一:对小数据集进行异常检测并可视化
numpy矩阵求最值、均值、方差、标准差、中值、求和,众数_王同学加油的博客-CSDN博客_矩阵的均值和方差
np.random.normal()详解_YL_python_C++_java的博客-CSDN博客_np.random.normal
python作图之plt.contour详解_你大佬来啦的博客-CSDN博客_plt.contour
方法一:使用原始模型
第一步:找出均值方差并进行可视化
import numpy as np
from matplotlib import pyplot as plt
from scipy import io
#1.读取数据集
dt = io.loadmat("E:\机器学习\吴恩达\data_sets\ex8data1.mat")
x_train = dt["X"] #(307, 2)
x_val = dt["Xval"] #(307, 2)
y_val = dt["yval"] #(307, 1)
#2.可视化数据集
"""plt.scatter(x_train[:,0],x_train[:,1],marker = "x")
plt.show()"""
#3.定义高斯分布函数与异常检测函数
#求矩阵X的平均值mu与方差sigma2
def estimateGaussian(X):
#求均值mu
mu = X.mean(axis = 0)
#求方差sigma2
sigma2 = X.var(axis = 0)
return mu,sigma2
mu_result,sigma2_result = estimateGaussian(x_train)
#origin model
def calculateGaussian(X,mu,sigma2):
#X的大小是(10000, 2)
p = np.ones(X.shape[0])
for i in range(mu.shape[0]):
p_temp = 1 / (np.sqrt(2 * np.pi * sigma2[i])) * np.exp(-np.power((X[:,i] - mu[i]), 2)/(2*sigma2[i]))
#p_temp是(10000, 2)
p = p * p_temp
return p
#4.可视化数据集与等高线
def dispalyData(X,mu,sigma2):
a = np.linspace(0, 30, 100) #(100,)
b = np.linspace(0, 30, 100)
aa, bb = np.meshgrid(a, b) #(100, 100)
p = calculateGaussian(np.vstack((aa.ravel(), bb.ravel())).T,mu,sigma2)
p = p.reshape(aa.shape)
levels = [10**h for h in range(-20,0,3)]
#levels是一个包含高度值的一维数组,这样python便会画出传入的高度值对应的等高线。
plt.scatter(X[:,0],X[:,1], marker="x")
plt.contour(a,b,p,levels)
dispalyData(x_train,mu_result,sigma2_result)
plt.show()
第二步:根据验证集选择合适的阈值并找出异常点
import numpy as np
from matplotlib import pyplot as plt
from scipy import io
#1.读取数据集
dt = io.loadmat("E:\机器学习\吴恩达\data_sets\ex8data1.mat")
x_train = dt["X"] #(307, 2)
x_val = dt["Xval"] #(307, 2)
y_val = dt["yval"] #(307, 1)
#2.可视化数据集
"""plt.scatter(x_train[:,0],x_train[:,1],marker = "x")
plt.show()"""
#3.定义高斯分布函数与异常检测函数
#求矩阵X的平均值mu与方差sigma2
def estimateGaussian(X):
#求均值mu
mu = X.mean(axis = 0)
#求方差sigma2
sigma2 = X.var(axis = 0)
return mu,sigma2
mu_result,sigma2_result = estimateGaussian(x_train)
#origin model
def calculateGaussian(X,mu,sigma2):
#X的大小是(10000, 2)
p = np.ones(X.shape[0])
for i in range(mu.shape[0]):
p_temp = 1 / (np.sqrt(2 * np.pi * sigma2[i])) * np.exp(-np.power((X[:,i] - mu[i]), 2)/(2*sigma2[i]))
#p_temp是(10000, 2)
p = p * p_temp
return p
#可视化数据集与等高线
def dispalyData(X,mu,sigma2):
a = np.linspace(0, 30, 100) #(100,)
b = np.linspace(0, 30, 100)
aa, bb = np.meshgrid(a, b) #(100, 100)
p = calculateGaussian(np.vstack((aa.ravel(), bb.ravel())).T,mu,sigma2)
p = p.reshape(aa.shape)
levels = [10**h for h in range(-20,0,3)]
#levels是一个包含高度值的一维数组,这样python便会画出传入的高度值对应的等高线。
plt.scatter(X[:,0],X[:,1], marker="x",c="b")
plt.contour(a,b,p,levels,linewidths=1)
"""dispalyData(x_train,mu_result,sigma2_result)
plt.show()"""
#--------------------------------------------------------------------------------------
#将表示概率的数值转换成正样本或者负样本
def convertP(p,epslon):
m = len(p)
p_result = np.zeros(m)
for i in range(m):
if p[i] <= epslon:
p_result[i] = 1
else:
p_result[i] = 0
return p_result
#计算F1
def calculateF1(pval,yval):
m = len(pval)
tp = 0
for i in range(m):
if yval[i] == 1 and pval[i] == 1:
tp += 1
prec = tp/np.sum(pval)
rec = tp/np.sum(yval)
F1 = 2*prec*rec/(prec+rec)
return F1
#选出合适的阈值并返回对应的F1
def selectThreshold(yval,pval):
"""
在众多eplison中选出F1最小的阈值
:param yval: 验证集中的实际输出值
:param pval: 使用异常检测函数得到的预测概率值
:return: 返回合适的阈值与对应的F1
"""
m = len(yval)
F1_result = 0
for epsilon in np.linspace(min(pval), max(pval), 1000):
#for epsilon in thresholds:
p_temp = convertP(pval,epsilon)
F1_temp = calculateF1(p_temp,yval)
if F1_temp > F1_result:
F1_result = F1_temp
eplison_result = epsilon
return eplison_result,F1_result
#选择合适的阈值
p_temp = calculateGaussian(x_val,mu_result,sigma2_result)
eplison_result,F1_result = selectThreshold(y_val,p_temp) #8.999852631901393e-05 0.8750000000000001
#找出异常点
p_train = calculateGaussian(x_train,mu_result,sigma2_result)
outliers = np.array([x_train[i] for i in range(len(x_train)) if p_train[i] < eplison_result]) #概率小于阈值记为异常点
plt.scatter(x_train[:,0],x_train[:,1],c="b",marker = "x",s=10)
plt.scatter(outliers[:,0],outliers[:,1],facecolors='none', edgecolors='r', linewidths=2,marker="o")
dispalyData(x_train,mu_result,sigma2_result)
plt.show()
方法二:使用多元变量高斯函数
import numpy as np
from matplotlib import pyplot as plt
from scipy import io
#1.读取数据集
dt = io.loadmat("E:\机器学习\吴恩达\data_sets\ex8data1.mat")
x_train = dt["X"] #(307, 2)
x_val = dt["Xval"] #(307, 2)
y_val = dt["yval"] #(307, 1)
#2.可视化数据集
"""plt.scatter(x_train[:,0],x_train[:,1],marker = "x")
plt.show()"""
#3.定义高斯分布函数与异常检测函数
#求矩阵X的平均值mu与协方差矩阵sigma
def estimateGaussian(X):
m = X.shape[0]
#求均值mu
mu = X.mean(axis = 0)
#求sigma
sigma = X.var(axis=0)
return mu,sigma
mu_result,sigma_result = estimateGaussian(x_train)
#多元变量高斯分布函数
def multiGaussian(X,mu,sigma):
#X的大小是(307, 2)
m = X.shape[0]
n = X.shape[1]
p = np.zeros(m)
if np.ndim(sigma) == 1:
sigma = np.diag(sigma) # 对角阵
sigma_temp = np.linalg.det(sigma)
for i in range(m):
matrix_temp = (X[i,:]-mu).T @ np.linalg.inv(sigma) @ (X[i,:]-mu) #(2,2)
p[i] = 1/(np.power((2 * np.pi),(n/2)) * np.power(sigma_temp,0.5)) * np.exp(-0.5*matrix_temp)
return p
#可视化数据集与等高线
def dispalyData(X,mu,sigma):
a = np.linspace(0, 30, 100) #(100,)
b = np.linspace(0, 30, 100)
aa, bb = np.meshgrid(a, b) #(100, 100)
p = multiGaussian(np.vstack((aa.ravel(), bb.ravel())).T,mu,sigma)
p = p.reshape(aa.shape)
levels = [10**h for h in range(-20,0,3)]
#levels是一个包含高度值的一维数组,这样python便会画出传入的高度值对应的等高线。
plt.scatter(X[:,0],X[:,1], marker="x",c="b")
plt.contour(a,b,p,levels,linewidths=1)
#--------------------------------------------------------------------------------------
#将表示概率的数值转换成正样本或者负样本
def convertP(p,epslon):
m = len(p)
p_result = np.zeros(m)
for i in range(m):
if p[i] <= epslon:
p_result[i] = 1
else:
p_result[i] = 0
return p_result
#计算F1
def calculateF1(pval,yval):
m = len(pval)
tp = 0
for i in range(m):
if yval[i] == 1 and pval[i] == 1:
tp += 1
prec = tp/np.sum(pval)
rec = tp/np.sum(yval)
F1 = 2*prec*rec/(prec+rec)
return F1
#选出合适的阈值并返回对应的F1
def selectThreshold(yval,pval):
"""
在众多eplison中选出F1最小的阈值
:param yval: 验证集中的实际输出值
:param pval: 使用异常检测函数得到的预测概率值
:return: 返回合适的阈值与对应的F1
"""
m = len(yval)
F1_result = 0
for epsilon in np.linspace(min(pval), max(pval), 1001):
#for epsilon in thresholds:
p_temp = convertP(pval,epsilon)
F1_temp = calculateF1(p_temp,yval)
if F1_temp > F1_result:
F1_result = F1_temp
eplison_result = epsilon
return eplison_result,F1_result
#选择合适的阈值
p_temp = multiGaussian(x_val,mu_result,sigma_result)
eplison_result,F1_result = selectThreshold(y_val,p_temp) #8.999852631901393e-05 0.8750000000000001
print(eplison_result,F1_result)
#找出异常点
p_train = multiGaussian(x_train,mu_result,sigma_result)
outliers = np.array([x_train[i] for i in range(len(x_train)) if p_train[i] < eplison_result]) #概率小于阈值记为异常点
plt.scatter(x_train[:,0],x_train[:,1],c="b",marker = "x",s=10)
plt.scatter(outliers[:,0],outliers[:,1],facecolors='none', edgecolors='r', linewidths=2,marker="o")
dispalyData(x_train,mu_result,sigma_result)
plt.show()
任务二:对大数据集进行异常检测
方法一:使用原始模型
import numpy as np
from scipy import io
#1.读取数据
dt = io.loadmat("E:\机器学习\吴恩达\data_sets\ex8data2.mat")
x_train = dt["X"] #(1000, 11)
x_val = dt["Xval"] #(100, 11)
y_val = dt["yval"] #(100, 1)
#2.定义异常检测函数
#求矩阵X的平均值mu与方差sigma2
def estimateGaussian(X):
#求均值mu
mu = X.mean(axis = 0)
#求方差sigma2
sigma2 = X.var(axis = 0)
return mu,sigma2
#origin model
def calculateGaussian(X,mu,sigma2):
#X的大小是(10000, 2)
p = np.ones(X.shape[0])
for i in range(mu.shape[0]):
p_temp = 1 / (np.sqrt(2 * np.pi * sigma2[i])) * np.exp(-np.power((X[:,i] - mu[i]), 2)/(2*sigma2[i]))
#p_temp是(10000, 2)
p = p * p_temp
return p
#将表示概率的数值转换成正样本或者负样本
def convertP(p,epslon):
m = len(p)
p_result = np.zeros(m)
for i in range(m):
if p[i] <= epslon:
p_result[i] = 1
else:
p_result[i] = 0
return p_result
#计算F1
def calculateF1(pval,yval):
m = len(pval)
tp = 0
for i in range(m):
if yval[i] == 1 and pval[i] == 1:
tp += 1
prec = tp/np.sum(pval)
rec = tp/np.sum(yval)
F1 = 2*prec*rec/(prec+rec)
return F1
#选出合适的阈值并返回对应的F1
def selectThreshold(yval,pval):
"""
在众多eplison中选出F1最小的阈值
:param yval: 验证集中的实际输出值
:param pval: 使用异常检测函数得到的预测概率值
:return: 返回合适的阈值与对应的F1
"""
m = len(yval)
F1_result = 0
#选择为1000个数时候,1.3786074982000235e-18 0.6153846153846154
#选择为10000个数时候,1.3773671632195622e-19 0.7368421052631577
for epsilon in np.linspace(min(pval), max(pval), 1000):
#for epsilon in thresholds:
p_temp = convertP(pval,epsilon)
F1_temp = calculateF1(p_temp,yval)
if F1_temp > F1_result:
F1_result = F1_temp
eplison_result = epsilon
return eplison_result,F1_result
#3.调用函数得到合适的阈值与对应的F1
#选择合适的阈值
mu_result,sigma2_result = estimateGaussian(x_train)
p_temp = calculateGaussian(x_val,mu_result,sigma2_result)
eplison_result,F1_result = selectThreshold(y_val,p_temp) #8.999852631901393e-05 0.8750000000000001
print(eplison_result,F1_result)
#找出异常点
p_train = calculateGaussian(x_train,mu_result,sigma2_result)
outliers = np.array([x_train[i] for i in range(len(x_train)) if p_train[i] < eplison_result]) #概率小于阈值记为异常点
print(outliers.shape[0])
方法二:使用多元变量高斯函数
import numpy as np
from scipy import io
#1.读取数据
dt = io.loadmat("E:\机器学习\吴恩达\data_sets\ex8data2.mat")
x_train = dt["X"] #(1000, 11)
x_val = dt["Xval"] #(100, 11)
y_val = dt["yval"] #(100, 1)
#2.定义异常检测函数
#求矩阵X的平均值mu与协方差矩阵sigma
def estimateGaussian(X):
m = X.shape[0]
#求均值mu
mu = X.mean(axis = 0)
#求sigma
sigma = X.var(axis=0)
return mu,sigma
mu_result,sigma_result = estimateGaussian(x_train)
#多元变量高斯分布函数
def multiGaussian(X,mu,sigma):
#X的大小是(307, 2)
m = X.shape[0]
n = X.shape[1]
p = np.zeros(m)
if np.ndim(sigma) == 1:
sigma = np.diag(sigma) # 对角阵
sigma_temp = np.linalg.det(sigma)
for i in range(m):
matrix_temp = (X[i,:]-mu).T @ np.linalg.inv(sigma) @ (X[i,:]-mu) #(2,2)
p[i] = 1/(np.power((2 * np.pi),(n/2)) * np.power(sigma_temp,0.5)) * np.exp(-0.5*matrix_temp)
return p
#将表示概率的数值转换成正样本或者负样本
def convertP(p,epslon):
m = len(p)
p_result = np.zeros(m)
for i in range(m):
if p[i] <= epslon:
p_result[i] = 1
else:
p_result[i] = 0
return p_result
#计算F1
def calculateF1(pval,yval):
m = len(pval)
tp = 0
for i in range(m):
if yval[i] == 1 and pval[i] == 1:
tp += 1
prec = tp/np.sum(pval)
rec = tp/np.sum(yval)
F1 = 2*prec*rec/(prec+rec)
return F1
#选出合适的阈值并返回对应的F1
def selectThreshold(yval,pval):
"""
在众多eplison中选出F1最小的阈值
:param yval: 验证集中的实际输出值
:param pval: 使用异常检测函数得到的预测概率值
:return: 返回合适的阈值与对应的F1
"""
m = len(yval)
F1_result = 0
#选择为1000个数时候,1.3772288907613575e-18 0.6153846153846154 异常点个数为117
#选择为10000个数时候,1.377367163219563e-19 0.7368421052631577 异常点个数为117
for epsilon in np.linspace(min(pval), max(pval), 10000):
#for epsilon in thresholds:
p_temp = convertP(pval,epsilon)
F1_temp = calculateF1(p_temp,yval)
if F1_temp > F1_result:
F1_result = F1_temp
eplison_result = epsilon
return eplison_result,F1_result
#3.调用函数得到合适的阈值与对应的F1
#选择合适的阈值
mu_result,sigma_result = estimateGaussian(x_train)
p_temp = multiGaussian(x_val,mu_result,sigma_result)
eplison_result,F1_result = selectThreshold(y_val,p_temp) #8.999852631901393e-05 0.8750000000000001
print(eplison_result,F1_result)
#找出异常点
p_train = multiGaussian(x_train,mu_result,sigma_result)
outliers = np.array([x_train[i] for i in range(len(x_train)) if p_train[i] < eplison_result]) #概率小于阈值记为异常点
print(outliers.shape[0])