这个项目包含了异常检测和推荐系统,使用高斯模型实现异常检测算法,并将其应用于检测网络上的故障服务器。 我们还将看到如何使用协作过滤构建推荐系统,并将其应用于电影推荐数据集。
1 异常检测
1.1 读取数据和数据可视化
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from scipy.io import loadmat
# Load the first anomaly-detection dataset; X holds the unlabeled training examples.
data=loadmat(r'C:\Users\xxx\Desktop\机器学习\machine-learning-ex8\machine-learning-ex8\ex8\ex8data1.mat')
X=data['X']
X.shape
(307, 2)
fig,ax=plt.subplots(figsize=(12,8))
ax.scatter(X[:,0],X[:,1])
plt.show()
1.2 定义高斯分布
计算高斯分布参数
def estimate_gaussian(X):
    """Return the per-feature mean and variance of X.

    axis=0 means the statistics are taken column-wise (one value per feature).
    Note: the returned "sigma" is the variance (sigma squared), not the
    standard deviation.
    """
    return X.mean(axis=0), X.var(axis=0)
mu,sigma=estimate_gaussian(X)
mu,sigma
(array([14.11222578, 14.99771051]), array([1.83263141, 1.70974533]))
以等高线展示数据分布
xplot=np.linspace(0,25,100)
yplot=np.linspace(0,25,100)
Xplot,Yplot=np.meshgrid(xplot,yplot) # pair the two 1-D axes into a 2-D grid of (x, y) coordinates
# Product of two independent 1-D Gaussians (normalizing constant omitted,
# which only rescales the contour levels below).
Z=np.exp((-0.5)*((Xplot-mu[0])**2/sigma[0]+(Yplot-mu[1])**2/sigma[1]))
fig,ax=plt.subplots(figsize=(12,8))
contour=plt.contour(Xplot,Yplot,Z,[10**-11,10**-7,10**-5,10**-3,0.1],colors='k')
ax.scatter(X[:,0],X[:,1])
plt.show()
#contour
# 这个函数主要对网格中每个点的值等于一系列值的时候做出一条条轮廓线,类似于等高线 。
# Z : array-like(N,M)
# 绘制轮廓的高度值。
# levels: int或类似数组,可选
# 确定轮廓线/区域的数量和位置。
# 如果是整数 N,使用 N 个数据间隔;即绘制 N+1 条等高线,水平高度自动选择。
# 如果是数组,则在指定的级别绘制轮廓线。值必须按递增顺序排列。
1.3 选择阈值 ε
Xval=data['Xval']
yval=data['yval']
Xval.shape,yval.shape
((307, 2), (307, 1))
from scipy import stats
# NOTE(review): scipy.stats.norm(loc, scale) expects scale = standard deviation,
# but estimate_gaussian returned `sigma` as a VARIANCE (X.var above).
# np.sqrt(sigma[0]) is likely what was intended here — confirm against the exercise.
dist=stats.norm(mu[0],sigma[0])
dist.pdf(15)
0.1935875044615038
将数组传递给概率密度函数,并获得数据集中每个点的概率密度
#scipy.stats.norm函数 可以实现正态分布(也就是高斯分布)
# pdf : 概率密度函数 就是高斯分布
dist.pdf(X[:,0])[0:50]
array([0.183842 , 0.20221694, 0.21746136, 0.19778763, 0.20858956,
0.21652359, 0.16991291, 0.15123542, 0.1163989 , 0.1594734 ,
0.21716057, 0.21760472, 0.20141857, 0.20157497, 0.21711385,
0.21758775, 0.21695576, 0.2138258 , 0.21057069, 0.1173018 ,
0.20765108, 0.21717452, 0.19510663, 0.21702152, 0.17429399,
0.15413455, 0.21000109, 0.20223586, 0.21031898, 0.21313426,
0.16158946, 0.2170794 , 0.17825767, 0.17414633, 0.1264951 ,
0.19723662, 0.14538809, 0.21766361, 0.21191386, 0.21729442,
0.21238912, 0.18799417, 0.21259798, 0.21752767, 0.20616968,
0.21520366, 0.1280081 , 0.21768113, 0.21539967, 0.16913173])
# Per-feature Gaussian density of every training example (one independent
# 1-D normal per column).
# NOTE(review): as above, `scale` should be a standard deviation but sigma is a variance.
p = np.zeros((X.shape[0], X.shape[1]))
p[:,0] = stats.norm(mu[0], sigma[0]).pdf(X[:,0])
p[:,1] = stats.norm(mu[1], sigma[1]).pdf(X[:,1])
p.shape
(307, 2)
# Same densities on the cross-validation set — these are what the threshold
# search below is run against.
pval = np.zeros((Xval.shape[0], Xval.shape[1]))
pval[:,0] = stats.norm(mu[0], sigma[0]).pdf(Xval[:,0])
pval[:,1] = stats.norm(mu[1], sigma[1]).pdf(Xval[:,1])
pval.shape
# # 第一种调用方式
# gauss = norm(loc=1, scale=2) # loc: mean 均值, scale: standard deviation 标准差
# r_1 = gauss.pdf(X)
# # 第二种调用方式
# r_2 = norm.pdf(X, loc=0, scale=2)
(307, 2)
计算F1分数
def select_threshold(pval, yval):
    """Pick the threshold epsilon that maximizes F1 on the validation set.

    pval : per-feature densities of the validation examples.
    yval : binary labels (1 = anomaly).
    Returns (best_epsilon, best_f1); an example is flagged when its
    density is BELOW epsilon.

    Fix over the original: when no example is predicted positive
    (tp == fp == 0, which always happens at the first step since
    pval < pval.min() is all False), precision was 0.0/0.0 → NaN plus
    runtime warnings. Such thresholds are now skipped; the selected
    (epsilon, f1) is unchanged because NaN never beat best_f1 anyway.
    """
    best_parameter = 0
    best_f1 = 0
    # Sweep 1000 evenly spaced candidate thresholds over the density range.
    step = (pval.max() - pval.min()) / 1000
    for parameter in np.arange(pval.min(), pval.max(), step):
        preds = pval < parameter
        tp = np.sum(np.logical_and(preds == 1, yval == 1)).astype(float)
        fp = np.sum(np.logical_and(preds == 1, yval == 0)).astype(float)
        fn = np.sum(np.logical_and(preds == 0, yval == 1)).astype(float)
        if tp == 0:
            # precision and/or recall undefined (or F1 trivially 0) — skip.
            continue
        precision = tp / (tp + fp)
        recall = tp / (tp + fn)
        f1 = (2 * precision * recall) / (precision + recall)
        if f1 > best_f1:
            best_f1 = f1
            best_parameter = parameter
    return best_parameter, best_f1
parameter,f1=select_threshold(pval,yval)
parameter,f1
(0.009566706005956842, 0.7142857142857143)
1.4 将阈值应用于数据集,并可视化结果
outliers = np.where(p < parameter)
outliers
(array([300, 301, 301, 303, 303, 304, 306, 306], dtype=int64),
array([1, 0, 1, 0, 1, 0, 0, 1], dtype=int64))
fig,ax=plt.subplots(figsize=(12,8))
ax.scatter(X[:,0],X[:,1])
ax.scatter(X[outliers[0],0],X[outliers[0],1],s=100,color='r',marker='x')
plt.show()
红点是被标记为异常值的点,这些结果看起来很合理。右上角还有一个较为孤立(但未被标记)的点,它也可能是异常值,不过其密度与正常数据相当接近。
2 协同过滤
推荐引擎使用基于项目和用户的相似性度量来检查用户的历史偏好,以便为用户可能感兴趣的新“事物”提供建议。在本练习中,我们将实现一种称为协作过滤的特定推荐系统算法,并将其应用于 电影评分的数据集
2.1 读取数据以及可视化
data=loadmat(r'C:\Users\xxx\Desktop\机器学习\machine-learning-ex8\machine-learning-ex8\ex8\ex8_movies.mat')
data
{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Thu Dec 1 17:19:26 2011',
'__version__': '1.0',
'__globals__': [],
'Y': array([[5, 4, 0, ..., 5, 0, 0],
[3, 0, 0, ..., 0, 0, 5],
[4, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8),
'R': array([[1, 1, 0, ..., 1, 0, 0],
[1, 0, 0, ..., 0, 0, 1],
[1, 0, 0, ..., 0, 0, 0],
...,
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]], dtype=uint8)}
Y 是一个(电影数 × 用户数)的数组,元素为 1 到 5 的评分(0 表示未评分);R 是同样形状的“指示符”数组,二进制值表示用户是否给该电影评过分。两者维度应当相同。
Y=data['Y']
R=data['R']
Y.shape,R.shape
((1682, 943), (1682, 943))
Y[1,np.where(R[1,:]==1)[0]].mean() #第一部电影的平均分
3.2061068702290076
fig,ax=plt.subplots(figsize=(12,8))
ax.imshow(Y)
ax.set_xlabel('Users')
ax.set_ylabel('Movies')
fig.tight_layout()
plt.show()
2.2 定义模型所需函数
代价函数
序列化,逆序列化,代价函数
def serialize(X, theta):
    """Flatten the two parameter matrices into one 1-D vector, X first.

    X     : (movie, feature) matrix of movie features, e.g. (1682, 10).
    theta : (user, feature) matrix of user preferences, e.g. (943, 10).
    """
    return np.concatenate((X.flatten(), theta.flatten()))
def deserialize(param, n_movies, n_users, n_features):
    """Split a flat parameter vector back into the (X, theta) matrices.

    Inverse of serialize: the first n_movies*n_features entries are X,
    the rest are theta.
    """
    split = n_movies * n_features
    X = param[:split].reshape(n_movies, n_features)
    theta = param[split:].reshape(n_users, n_features)
    return X, theta
def cost(param, Y, R, n_features):
    """Unregularized collaborative-filtering cost.

    param : serialized (X, theta) vector.
    Y     : (n_movies, n_users) ratings matrix.
    R     : indicator matrix, 1 where a rating exists.
    Only rated entries (R == 1) contribute to the squared error.
    """
    n_movies, n_users = Y.shape
    X, theta = deserialize(param, n_movies, n_users, n_features)
    error = (X @ theta.T - Y) * R
    return (error ** 2).sum() / 2
2.3 训练模型
读取训练所需数据
params_data=loadmat(r'C:\Users\xxx\Desktop\机器学习\machine-learning-ex8\machine-learning-ex8\ex8\ex8_movieParams.mat')
X=params_data['X']
theta=params_data['Theta']
X.shape,theta.shape
((1682, 10), (943, 10))
参数初始化
users=4
movies=5
features=3
X_sub=X[:movies,:features]
theta_sub=theta[:users,:features]
Y_sub=Y[:movies,:users]
R_sub=R[:movies,:users]
param_sub=serialize(X_sub,theta_sub)
cost(param_sub,Y_sub,R_sub,features)
22.224603725685675
param=serialize(X,theta)
cost(serialize(X,theta),Y,R,10)
27918.64012454421
梯度下降函数
def gradient(param, Y, R, n_features):
    """Gradient of the unregularized cost w.r.t. X and theta.

    Returns a serialized vector with the same layout as `param`:
    X gradient first, then theta gradient.
    """
    n_movies, n_users = Y.shape
    X, theta = deserialize(param, n_movies, n_users, n_features)
    # Prediction error, masked to rated entries only.
    error = (X @ theta.T - Y) * R
    return serialize(error @ theta, error.T @ X)
n_movies,n_users=Y.shape
X_grad,theta_grad=deserialize(gradient(param,Y,R,10),n_movies,n_users,10)
X_grad, theta_grad
(array([[-6.26184144, 2.45936046, -6.87560329, ..., -4.81611896,
3.84341521, -1.88786696],
[-3.80931446, 1.80494255, -2.63877955, ..., -3.55580057,
2.1709485 , 2.65129032],
[-3.13090116, 2.54853961, 0.23884578, ..., -4.18778519,
3.10538294, 5.47323609],
...,
[-1.04774171, 0.99220776, -0.48920899, ..., -0.75342146,
0.32607323, -0.89053637],
[-0.7842118 , 0.76136861, -1.25614442, ..., -1.05047808,
1.63905435, -0.14891962],
[-0.38792295, 1.06425941, -0.34347065, ..., -2.04912884,
1.37598855, 0.19551671]]),
array([[-1.54728877, 9.0812347 , -0.6421836 , ..., -3.92035321,
5.66418748, 1.16465605],
[-2.58829914, 2.52342335, -1.52402705, ..., -5.46793491,
5.82479897, 1.8849854 ],
[ 2.14588899, 2.00889578, -4.32190712, ..., -6.83365682,
1.78952063, 0.82886788],
...,
[-4.59816821, 3.63958389, -2.52909095, ..., -3.50886008,
2.99859566, 0.64932177],
[-4.39652679, 0.55036362, -1.98451805, ..., -6.74723702,
3.8538775 , 3.94901737],
[-3.75193726, 1.44393885, -5.6925333 , ..., -6.56073746,
5.20459188, 2.65003952]]))
有正则化的代价函数
def regularized_cost(param, Y, R, n_features, l=1):
    """Collaborative-filtering cost plus an L2 penalty of (l/2) * sum(param**2)."""
    penalty = (l / 2) * (param ** 2).sum()
    return cost(param, Y, R, n_features) + penalty
def regularized_gradient(param, Y, R, n_features, l=1):
    """Gradient of the regularized cost, as a serialized vector.

    The L2 penalty (l/2) * param**2 differentiates to l * param, which is
    simply added to the unregularized gradient.

    Fix over the original: a byte-for-byte commented-out duplicate of this
    function sat directly above the live definition; dead code removed.
    """
    grad = gradient(param, Y, R, n_features)
    return grad + l * param
regularized_cost(param_sub,Y_sub,R_sub,features,l=1.5)
31.34405624427422
regularized_cost(param,Y,R,10,l=1)
32520.682450229557
n_movies,n_users=Y.shape
X_grad,Y_grad=deserialize(regularized_gradient(param,Y,R,10),
n_movies,n_users,10)
X_grad,Y_grad
(array([[-5.21315594, 2.0591285 , -5.68148384, ..., -3.95439796,
3.14612528, -1.59912133],
[-3.02846323, 1.41931663, -2.11758176, ..., -2.85177984,
1.68511329, 2.08666626],
[-2.4893923 , 2.00068576, 0.1550494 , ..., -3.34923876,
2.41055086, 4.33843977],
...,
[-0.82821934, 0.7917289 , -0.39662935, ..., -0.60746963,
0.28294163, -0.71223186],
[-0.62377152, 0.60121466, -1.02043496, ..., -0.84313998,
1.30670669, -0.10603832],
[-0.31115177, 0.86705203, -0.2616062 , ..., -1.64900127,
1.08850949, 0.16318173]]),
array([[-1.26184516, 7.39696961, -0.37924484, ..., -3.15312086,
4.55958584, 0.91278897],
[-2.08328593, 2.06877489, -1.20656461, ..., -4.37487155,
4.62450461, 1.49336864],
[ 1.71397243, 1.53009129, -3.47519602, ..., -5.47031706,
1.46428521, 0.63418576],
...,
[-3.53947561, 2.83086629, -1.95973324, ..., -2.70464586,
2.25512788, 0.52946291],
[-3.50593747, 0.42141628, -1.62891338, ..., -5.37296895,
3.1012226 , 3.13766426],
[-2.92779591, 1.05501291, -4.62312829, ..., -5.27650042,
4.22109195, 2.11819114]]))
3 预测自己的电影评分
3.1 读取数据
movie_list=[]
# Fix over the original: the file handle was opened with open() and never
# closed; a `with` block guarantees it is closed even on error.
# gb18030 covers the file's legacy encoding; undecodable bytes are ignored.
with open(r'C:\Users\xxx\Desktop\机器学习\machine-learning-ex8\machine-learning-ex8\ex8\movie_ids.txt',encoding='gb18030',errors='ignore') as f:
    for line in f:
        # Each line looks like "123 Movie Title (1995)": strip the trailing
        # newline, split on spaces, drop the leading index, re-join the title.
        tokens=line.strip().split(' ')
        movie_list.append(' '.join(tokens[1:]))
movie_list=np.array(movie_list)
movie_list[0]
'Toy Story (1995)'
3.2 初始化自己对电影的评分以及数据预处理
# My own ratings column: index = movie row (matching movie_list), value = stars 1-5;
# zero means "not rated".
ratings=np.zeros((1682,1))
ratings[0] = 4
ratings[6] = 3
ratings[11] = 5
ratings[53] = 4
ratings[63] = 5
ratings[65] = 3
ratings[68] = 5
ratings[97] = 2
ratings[182] = 4
ratings[225] = 5
ratings[354] = 5
ratings.shape
(1682, 1)
Y=data['Y']
Y=np.append(ratings,Y,axis=1)
print(Y[:,[0]])
[[4.]
[0.]
[0.]
...
[0.]
[0.]
[0.]]
R=data['R']
R=np.append(ratings!=0,R,axis=1)
R.shape
(1682, 944)
movies=Y.shape[0]
users=Y.shape[1]
features=10
learning_rate=10  # passed as `l` to regularized_cost/gradient, i.e. the regularization strength lambda
# Random (0, 1) initialization so that the feature columns are not symmetric.
X=np.random.random(size=(movies,features))
theta=np.random.random(size=(users,features))
params=serialize(X,theta)
X.shape,theta.shape,params.shape
((1682, 10), (944, 10), (26260,))
对每部电影做均值归一化:这样即使某个用户对所有电影都未评分(其特征向量学成全 0),对他的预测也相当于填入该电影的平均分。
# Subtract each movie's row mean (axis=1 = across users) so predictions are
# centered per movie.
# NOTE(review): this mean includes the zero entries of unrated movies; the
# exercise normally averages only where R == 1 — confirm intent.
Y_norm = Y - Y.mean(axis=1,keepdims=True)
Y_norm.mean()
4.600291296941337e-18
3.3 模型训练以及预测
from scipy.optimize import minimize
# Truncated-Newton (TNC) minimization of the regularized cost over the whole
# serialized parameter vector, using the analytic gradient as `jac`.
fmin = minimize(fun=regularized_cost, x0=params, args=(Y_norm, R, features, learning_rate),
method='TNC', jac=regularized_gradient)
fmin
fun: 68305.08381534365
jac: array([ 1.66782668e-06, 3.49600322e-05, -1.32693716e-05, ...,
7.40374323e-07, 3.75073592e-08, 1.66390760e-06])
message: 'Converged (|f_n-f_(n-1)| ~= 0)'
nfev: 1218
nit: 46
status: 1
success: True
x: array([-0.04078884, 0.11413321, -0.13640777, ..., 0.49293267,
0.16574887, -0.18024391])
X_trained,theta_trained=deserialize(fmin.x,movies,users,features)
X_trained.shape,theta_trained.shape
((1682, 10), (944, 10))
prediction=X_trained@theta_trained.T
# Column 0 is "my" user (the one prepended via np.append above).
# NOTE(review): training used Y_norm = Y - Y.mean(axis=1, keepdims=True)
# (per-movie means), so adding back the single scalar Y.mean() here is
# inconsistent — Y.mean(axis=1) is likely intended. Confirm against the exercise.
my_preds=prediction[:,0]+Y.mean()
idx=np.argsort(my_preds)[::-1]
idx.shape
(1682,)
my_preds[idx][:10]
array([3.2540253 , 3.08521411, 3.08039095, 3.02269888, 2.99702803,
2.98084317, 2.95237292, 2.94640996, 2.93994341, 2.93699705])
因随机初始化向量可能不同,导致预测结果可能稍微有一些不同
for m in movie_list[idx][:10]:
print(m)
As Good As It Gets (1997)
Good Will Hunting (1997)
Wrong Trousers, The (1993)
Rainmaker, The (1997)
Amistad (1997)
Great Escape, The (1963)
Apt Pupil (1998)
Some Folks Call It a Sling Blade (1993)
Wallace & Gromit: The Best of Aardman Animation (1996)
Close Shave, A (1995)
总结
- plt.contour 这个函数主要对网格中每个点的值等于一系列值的时候做出一条条轮廓线,类似于等高线;
- scipy.stats.norm函数 可以实现正态分布(也就是高斯分布);
- join是字符串操作函数,操作的也是字符串,其作用结合字符串使用,常常用于字符连接操作。