吴恩达《机器学习》ex8

简介

本文是吴恩达《机器学习》习题八的python解答,包含异常检测和推荐系统。

异常检测代码

我们的第一个任务是使用高斯模型来检测数据集中未标记的示例是否应被视为异常。 我们先从简单的二维数据集开始。

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.io as sio

data = sio.loadmat('../data_sets/ex8data1.mat')
X1 = data['X']
fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(X1[:,0],X1[:,1])
plt.show()

在这里插入图片描述

# check distribution of each feature
plt.hist(X1[:,0],bins=50)
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Distribution of x')
plt.show()

在这里插入图片描述

plt.hist(X1[:,1],bins=50)
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Distribution of y')
plt.show()

在这里插入图片描述
可以看到两个维度的分布,大致满足正态分布。如果不满足的话,可以对数据进行变换,尽量拟合正态分布。

def compute_gaussian_separately(X):
    mu = np.mean(X,axis=0)
    sigma = np.var(X,axis=0)
    return mu,sigma

mu1, sigma1 = compute_gaussian_separately(X1)
mu1,sigma1
# (array([14.11222578, 14.99771051]), array([1.83263141, 1.70974533]))

#  probability
def compute_res(x,y,mu,sigma):
    return np.exp((-0.5)*(x-mu[0])**2/sigma[0] + (-0.5)*(y-mu[1])**2/sigma[1])

xplot = np.linspace(0,25,100)
yplot = np.linspace(0,25,100)
Xplot, Yplot = np.meshgrid(xplot,yplot)
Z = compute_res(Xplot,Yplot,mu1,sigma1)

fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(X1[:,0],X1[:,1])
ax.contour(Xplot,Yplot,Z,[10**-11,10**-7,10**-5,10**-3,10**-2,0.1,0.5,0.9])
plt.show()

在这里插入图片描述

from scipy import stats

X1val = data['Xval']
y1val = data['yval']
pval = np.zeros((X1val.shape[0],X1val.shape[1]))
pval[:,0] = stats.norm(mu1[0],sigma1[0]).pdf(X1val[:,0])
pval[:,1] = stats.norm(mu1[1],sigma1[1]).pdf(X1val[:,1])

def find_best_delta(p,y):
    recall = 0
    best_f1 = 0
    best_delta = 0
    final_presicion = 0
    final_recall = 0
    step_size = p.max() / 10000
    for delta in np.arange(0, p.max(), step_size):
        y_pred = (p<delta).astype(int)
        y_pred = y_pred[:,0] + y_pred[:,1]
        y_pred = y_pred.reshape(len(y_pred),1)
        tp = np.sum((y==1)&(y_pred > 0))
        fp = np.sum((y==0)&(y_pred > 0))
        fn = np.sum((y==1)&(y_pred == 0))
        precision = tp/(tp+fp)
        recall = tp/(tp+fn)
        f1 = 2*precision*recall/(precision+recall)
        if f1 > best_f1:
            best_f1 = f1
            best_delta = delta
            final_presicion = precision
            final_recall = recall
    return best_delta,best_f1,final_presicion,final_recall

def find_best_delta_with_arr(p,y,arr):
    recall = 0
    best_f1 = 0
    best_delta = 0
    final_presicion = 0
    final_recall = 0
    for delta in arr:
        y_pred = (p<delta).astype(int)
        y_pred = y_pred.reshape(len(y_pred),1)
        y = y.reshape(len(y),1)
        tp = np.sum((y==1)&(y_pred==1))
        fp = np.sum((y==0)&(y_pred==1))
        fn = np.sum((y==1)&(y_pred==0))
        precision = tp/(tp+fp)
        recall = tp/(tp+fn)
        f1 = 2*precision*recall/(precision+recall)
        if f1 > best_f1:
            best_f1 = f1
            best_delta = delta
            final_presicion = precision
            final_recall = recall
    return best_delta,best_f1,final_presicion,final_recall

delta,f1,precision,recall = find_best_delta(pval,y1val)
delta,f1,precision,recall
# (0.00590335760854941, 0.8750000000000001, 1.0, 0.7777777777777778)

# plot result on validate dataset
val_pred = (pval<delta).astype(int)
val_pred = val_pred[:,0] + val_pred[:,1]
val_pred = val_pred.reshape(len(val_pred),1)
outliers = np.where(val_pred > 0)
fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(X1val[:,0],X1val[:,1])
ax.scatter(X1val[outliers[0],0],X1val[outliers[0],1],c='y')  
miss = np.where((y1val==1)&(val_pred==0))
ax.scatter(X1val[miss[0],0],X1val[miss[0],1],s=50,marker='o',facecolors='none',edgecolors='r')
ax.legend(['Normal','Anomaly','Missed Anomaly'])
plt.show()

在这里插入图片描述

# plot result on train dataset
p = np.zeros((X1.shape[0],X1.shape[1]))
p[:,0] = stats.norm(mu1[0],sigma1[0]).pdf(X1[:,0])
p[:,1] = stats.norm(mu1[1],sigma1[1]).pdf(X1[:,1])
p = (p < delta).astype(int)
p = p[:,0] + p[:,1]
outliers = np.where(p > 0)
fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(X1[:,0],X1[:,1])
ax.scatter(X1[outliers[0],0],X1[outliers[0],1],s=50,marker='o',facecolors='none',edgecolors='r')  
ax.legend(['Normal','Anomaly'])
plt.show()

在这里插入图片描述
上文我们是一个特征一个特征的去判断,当出现一个特征的值出现的概率低与阈值我们就认为是异常值。接下来我们整体一起考虑。

mu = np.mean(X1,axis=0)
cov = np.cov(X1.T)
cov
# array([[ 1.83862041, -0.22786456],
#       [-0.22786456,  1.71533273]])

from scipy.stats import multivariate_normal
def multivariate_gaussian(X,mu,cov):
    p = multivariate_normal.pdf(X,mean=mu,cov=cov)  
    return p

pval = multivariate_gaussian(X1val, mu, cov)

pval.min(),pval.mean(),pval.max()
# (9.237683290526336e-41, 0.05861430429098488, 0.09036240676156386)

arr = np.linspace(-41,-1,40)
arr = [1 * 10**i for i in arr]

delta,f1,precision,recall = find_best_delta_with_arr(pval,y1val,arr)
delta,f1,precision,recall
# (8.376776400682856e-05, 0.8750000000000001, 1.0, 0.7777777777777778)

pval.shape
# (307,)

# plot result
outliers = np.where(pval < delta)
fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(X1val[:,0],X1val[:,1])
ax.scatter(X1val[outliers[0],0],X1val[outliers[0],1],s=50,c='y')  
miss = (pval < delta).astype(int) - y1val.ravel()
miss = np.where(miss==-1)
ax.scatter(X1val[miss[0],0],X1val[miss[0],1],s=50,marker='o',facecolors='none',edgecolors='r')
wrong = (pval < delta).astype(int) - y1val.ravel()
wrong = np.where(wrong==1)
ax.scatter(X1val[wrong[0],0],X1val[wrong[0],1],s=50,marker='o',facecolors='none',edgecolors='g')
ax.legend(['Normal','Anomaly','Missed Anomaly','Wrongly Labeled Normal'])
plt.show()

在这里插入图片描述
可以看到两种方法得到了一样的结果。根据分布概率来判断异常无法找到在簇中心附近的异常值,需要使用别的二分类方法才可以。

电影推荐

推荐引擎使用基于项目和用户的相似性度量来检查用户的历史偏好,以便为用户可能感兴趣的新“事物”提供建议。在本练习中,我们将实现一种称为协作过滤的特定推荐系统算法,并将其应用于电影评分的数据集。

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import loadmat

data = loadmat('../data_sets/ex8_movies.mat')
# Y is the rate matrix
# R[m,u]  marks whether user_u rated movie_m, equals 1 if true, else 0.
Y = data['Y']
R = data['R']
Y.shape, R.shape
# ((1682, 943), (1682, 943))

# flat params
def seriliaze(Y, R):
    return np.hstack((Y.flatten(), R.flatten()))

# reshape parems
def deserialize(params, n_movies, n_users, n_features):
    X = params[:n_movies * n_features].reshape(n_movies, n_features)
    theta = params[n_movies * n_features:].reshape(n_users, n_features)
    return X, theta

def cost(params, Y, R, n_features,lambd):
    X,theta = deserialize(params, Y.shape[0], Y.shape[1], n_features)
    # only rated values matter
    inner = np.multiply(X @ theta.T - Y, R)
    lambd = (lambd / 2) * (np.sum(X**2) + np.sum(theta**2))
    return np.sum(inner**2) / 2 + lambd

def gradient(params,Y,R,n_features,lambd):
    X,theta = deserialize(params, Y.shape[0], Y.shape[1], n_features)
    inner = np.multiply(X @ theta.T - Y, R)
    X_grad = inner @ theta + lambd * X
    theta_grad = inner.T @ X + lambd * theta
    return seriliaze(X_grad, theta_grad)

movies = []
f = open('../data_sets/movie_ids_mod.txt')
for line in f.readlines():
    tokens = line.strip().split(' ')
    movies.append(' '.join(tokens[1:]))
movies = np.array(movies)
movies_list = movies

# add a new user
ratings = np.zeros((1682, 1))
ratings[0] = 4
ratings[6] = 3
ratings[11] = 5
ratings[53] = 4
ratings[63] = 5
ratings[65] = 3
ratings[68] = 5
ratings[97] = 2
ratings[182] = 4
ratings[225] = 5
ratings[354] = 5

Y = data['Y']
Y = np.append(ratings,Y, axis=1)
print(Y.shape)
# (1682, 944)

R = data['R']
R = np.append( ratings != 0, R,axis=1)
R.shape
# (1682, 944)

movies = Y.shape[0]  # 1682
users = Y.shape[1]  # 944
features = 10

X = np.random.random(size=(movies, features))
theta = np.random.random(size=(users, features))
params = seriliaze(X, theta)

X.shape, theta.shape, params.shape
# ((1682, 10), (944, 10), (26260,))

#  mean normalization, treat missing value in Y as mean value
means = np.zeros((Y.shape[0],1))
for i in range(len(means)):
    non_zero = np.where(R[i] == 1)[0]
    means[i] = Y[i, non_zero].mean()

Y = Y - means
lambd = 10

from scipy.optimize import minimize
fmin = minimize(fun=cost, x0=params, args=(Y, R, features, lambd), 
                method='TNC', jac=gradient)
fmin
# message: Converged (|f_n-f_(n-1)| ~= 0)
# success: True
#  status: 1
#     fun: 38951.84755999088
#       x: [ 4.477e-01  7.687e-01 ... -8.899e-01 -4.198e-01]
#     nit: 35
#     jac: [ 3.041e-06  4.583e-06 ...  5.078e-07 -3.875e-10]
#    nfev: 440

params = fmin.x
X, theta = deserialize(params, movies, users, features)

predictions = X @ theta.T
# predicted rates of generated user
my_preds = predictions[:, 0] + means.flatten()
# sort by predicted rates
idx = np.argsort(my_preds)[::-1]

# top10 for the generated user
top10 = idx[:10]

for i in top10:
    print('Predicting rating %.1f for movie %s.' % (my_preds[i], movies_list[i]))
# Predicting rating 5.0 for movie Entertaining Angels: The Dorothy Day Story (1996).
# Predicting rating 5.0 for movie Someone Else's America (1995).
# Predicting rating 5.0 for movie Marlene Dietrich: Shadow and Light (1996).
# Predicting rating 5.0 for movie Santa with Muscles (1996).
# Predicting rating 5.0 for movie They Made Me a Criminal (1939).
# Predicting rating 5.0 for movie Saint of Fort Washington, The (1993).
# Predicting rating 5.0 for movie Star Kid (1997).
# Predicting rating 5.0 for movie Great Day in Harlem, A (1994).
# Predicting rating 5.0 for movie Prefontaine (1997).
# Predicting rating 5.0 for movie Aiqing wansui (1994).

数据集

链接: https://pan.baidu.com/s/1zteJBsMJ0GRwqRb5opOgwg 提取码: 78ah

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值