吴恩达《机器学习》ex8

1emonade

于 2023-09-01 10:24:47 发布

阅读量71

点赞数

分类专栏：吴恩达机器学习文章标签：机器学习人工智能

本文链接：https://blog.csdn.net/weixin_62040971/article/details/132605918

版权

吴恩达机器学习专栏收录该内容

8 篇文章 0 订阅

订阅专栏

简介

本文是吴恩达《机器学习》习题八的python解答，包含异常检测和推荐系统。

异常检测代码

我们的第一个任务是使用高斯模型来检测数据集中未标记的示例是否应被视为异常。我们先从简单的二维数据集开始。

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.io as sio

data = sio.loadmat('../data_sets/ex8data1.mat')
X1 = data['X']
fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(X1[:,0],X1[:,1])
plt.show()

在这里插入图片描述

# check distribution of each feature
plt.hist(X1[:,0],bins=50)
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Distribution of x')
plt.show()

在这里插入图片描述

plt.hist(X1[:,1],bins=50)
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Distribution of y')
plt.show()

在这里插入图片描述
可以看到两个维度的分布，大致满足正态分布。如果不满足的话，可以对数据进行变换，尽量拟合正态分布。

def compute_gaussian_separately(X):
    mu = np.mean(X,axis=0)
    sigma = np.var(X,axis=0)
    return mu,sigma

mu1, sigma1 = compute_gaussian_separately(X1)
mu1,sigma1
# (array([14.11222578, 14.99771051]), array([1.83263141, 1.70974533]))

#  probability
def compute_res(x,y,mu,sigma):
    return np.exp((-0.5)*(x-mu[0])**2/sigma[0] + (-0.5)*(y-mu[1])**2/sigma[1])

xplot = np.linspace(0,25,100)
yplot = np.linspace(0,25,100)
Xplot, Yplot = np.meshgrid(xplot,yplot)
Z = compute_res(Xplot,Yplot,mu1,sigma1)

fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(X1[:,0],X1[:,1])
ax.contour(Xplot,Yplot,Z,[10**-11,10**-7,10**-5,10**-3,10**-2,0.1,0.5,0.9])
plt.show()

在这里插入图片描述

from scipy import stats

X1val = data['Xval']
y1val = data['yval']
pval = np.zeros((X1val.shape[0],X1val.shape[1]))
pval[:,0] = stats.norm(mu1[0],sigma1[0]).pdf(X1val[:,0])
pval[:,1] = stats.norm(mu1[1],sigma1[1]).pdf(X1val[:,1])

def find_best_delta(p,y):
    recall = 0
    best_f1 = 0
    best_delta = 0
    final_presicion = 0
    final_recall = 0
    step_size = p.max() / 10000
    for delta in np.arange(0, p.max(), step_size):
        y_pred = (p<delta).astype(int)
        y_pred = y_pred[:,0] + y_pred[:,1]
        y_pred = y_pred.reshape(len(y_pred),1)
        tp = np.sum((y==1)&(y_pred > 0))
        fp = np.sum((y==0)&(y_pred > 0))
        fn = np.sum((y==1)&(y_pred == 0))
        precision = tp/(tp+fp)
        recall = tp/(tp+fn)
        f1 = 2*precision*recall/(precision+recall)
        if f1 > best_f1:
            best_f1 = f1
            best_delta = delta
            final_presicion = precision
            final_recall = recall
    return best_delta,best_f1,final_presicion,final_recall

def find_best_delta_with_arr(p,y,arr):
    recall = 0
    best_f1 = 0
    best_delta = 0
    final_presicion = 0
    final_recall = 0
    for delta in arr:
        y_pred = (p<delta).astype(int)
        y_pred = y_pred.reshape(len(y_pred),1)
        y = y.reshape(len(y),1)
        tp = np.sum((y==1)&(y_pred==1))
        fp = np.sum((y==0)&(y_pred==1))
        fn = np.sum((y==1)&(y_pred==0))
        precision = tp/(tp+fp)
        recall = tp/(tp+fn)
        f1 = 2*precision*recall/(precision+recall)
        if f1 > best_f1:
            best_f1 = f1
            best_delta = delta
            final_presicion = precision
            final_recall = recall
    return best_delta,best_f1,final_presicion,final_recall

delta,f1,precision,recall = find_best_delta(pval,y1val)
delta,f1,precision,recall
# (0.00590335760854941, 0.8750000000000001, 1.0, 0.7777777777777778)

# plot result on validate dataset
val_pred = (pval<delta).astype(int)
val_pred = val_pred[:,0] + val_pred[:,1]
val_pred = val_pred.reshape(len(val_pred),1)
outliers = np.where(val_pred > 0)
fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(X1val[:,0],X1val[:,1])
ax.scatter(X1val[outliers[0],0],X1val[outliers[0],1],c='y')  
miss = np.where((y1val==1)&(val_pred==0))
ax.scatter(X1val[miss[0],0],X1val[miss[0],1],s=50,marker='o',facecolors='none',edgecolors='r')
ax.legend(['Normal','Anomaly','Missed Anomaly'])
plt.show()

在这里插入图片描述

# plot result on train dataset
p = np.zeros((X1.shape[0],X1.shape[1]))
p[:,0] = stats.norm(mu1[0],sigma1[0]).pdf(X1[:,0])
p[:,1] = stats.norm(mu1[1],sigma1[1]).pdf(X1[:,1])
p = (p < delta).astype(int)
p = p[:,0] + p[:,1]
outliers = np.where(p > 0)
fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(X1[:,0],X1[:,1])
ax.scatter(X1[outliers[0],0],X1[outliers[0],1],s=50,marker='o',facecolors='none',edgecolors='r')  
ax.legend(['Normal','Anomaly'])
plt.show()

在这里插入图片描述
上文我们是一个特征一个特征的去判断，当出现一个特征的值出现的概率低与阈值我们就认为是异常值。接下来我们整体一起考虑。

mu = np.mean(X1,axis=0)
cov = np.cov(X1.T)
cov
# array([[ 1.83862041, -0.22786456],
#       [-0.22786456,  1.71533273]])

from scipy.stats import multivariate_normal
def multivariate_gaussian(X,mu,cov):
    p = multivariate_normal.pdf(X,mean=mu,cov=cov)  
    return p

pval = multivariate_gaussian(X1val, mu, cov)

pval.min(),pval.mean(),pval.max()
# (9.237683290526336e-41, 0.05861430429098488, 0.09036240676156386)

arr = np.linspace(-41,-1,40)
arr = [1 * 10**i for i in arr]

delta,f1,precision,recall = find_best_delta_with_arr(pval,y1val,arr)
delta,f1,precision,recall
# (8.376776400682856e-05, 0.8750000000000001, 1.0, 0.7777777777777778)

pval.shape
# (307,)

# plot result
outliers = np.where(pval < delta)
fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(X1val[:,0],X1val[:,1])
ax.scatter(X1val[outliers[0],0],X1val[outliers[0],1],s=50,c='y')  
miss = (pval < delta).astype(int) - y1val.ravel()
miss = np.where(miss==-1)
ax.scatter(X1val[miss[0],0],X1val[miss[0],1],s=50,marker='o',facecolors='none',edgecolors='r')
wrong = (pval < delta).astype(int) - y1val.ravel()
wrong = np.where(wrong==1)
ax.scatter(X1val[wrong[0],0],X1val[wrong[0],1],s=50,marker='o',facecolors='none',edgecolors='g')
ax.legend(['Normal','Anomaly','Missed Anomaly','Wrongly Labeled Normal'])
plt.show()

在这里插入图片描述
可以看到两种方法得到了一样的结果。根据分布概率来判断异常无法找到在簇中心附近的异常值，需要使用别的二分类方法才可以。

电影推荐

推荐引擎使用基于项目和用户的相似性度量来检查用户的历史偏好，以便为用户可能感兴趣的新“事物”提供建议。在本练习中，我们将实现一种称为协作过滤的特定推荐系统算法，并将其应用于电影评分的数据集。

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import loadmat

data = loadmat('../data_sets/ex8_movies.mat')
# Y is the rate matrix
# R[m,u]  marks whether user_u rated movie_m, equals 1 if true, else 0.
Y = data['Y']
R = data['R']
Y.shape, R.shape
# ((1682, 943), (1682, 943))

# flat params
def seriliaze(Y, R):
    return np.hstack((Y.flatten(), R.flatten()))

# reshape parems
def deserialize(params, n_movies, n_users, n_features):
    X = params[:n_movies * n_features].reshape(n_movies, n_features)
    theta = params[n_movies * n_features:].reshape(n_users, n_features)
    return X, theta

def cost(params, Y, R, n_features,lambd):
    X,theta = deserialize(params, Y.shape[0], Y.shape[1], n_features)
    # only rated values matter
    inner = np.multiply(X @ theta.T - Y, R)
    lambd = (lambd / 2) * (np.sum(X**2) + np.sum(theta**2))
    return np.sum(inner**2) / 2 + lambd

def gradient(params,Y,R,n_features,lambd):
    X,theta = deserialize(params, Y.shape[0], Y.shape[1], n_features)
    inner = np.multiply(X @ theta.T - Y, R)
    X_grad = inner @ theta + lambd * X
    theta_grad = inner.T @ X + lambd * theta
    return seriliaze(X_grad, theta_grad)

movies = []
f = open('../data_sets/movie_ids_mod.txt')
for line in f.readlines():
    tokens = line.strip().split(' ')
    movies.append(' '.join(tokens[1:]))
movies = np.array(movies)
movies_list = movies

# add a new user
ratings = np.zeros((1682, 1))
ratings[0] = 4
ratings[6] = 3
ratings[11] = 5
ratings[53] = 4
ratings[63] = 5
ratings[65] = 3
ratings[68] = 5
ratings[97] = 2
ratings[182] = 4
ratings[225] = 5
ratings[354] = 5

Y = data['Y']
Y = np.append(ratings,Y, axis=1)
print(Y.shape)
# (1682, 944)

R = data['R']
R = np.append( ratings != 0, R,axis=1)
R.shape
# (1682, 944)

movies = Y.shape[0]  # 1682
users = Y.shape[1]  # 944
features = 10

X = np.random.random(size=(movies, features))
theta = np.random.random(size=(users, features))
params = seriliaze(X, theta)

X.shape, theta.shape, params.shape
# ((1682, 10), (944, 10), (26260,))

#  mean normalization, treat missing value in Y as mean value
means = np.zeros((Y.shape[0],1))
for i in range(len(means)):
    non_zero = np.where(R[i] == 1)[0]
    means[i] = Y[i, non_zero].mean()

Y = Y - means
lambd = 10

from scipy.optimize import minimize
fmin = minimize(fun=cost, x0=params, args=(Y, R, features, lambd), 
                method='TNC', jac=gradient)
fmin
# message: Converged (|f_n-f_(n-1)| ~= 0)
# success: True
#  status: 1
#     fun: 38951.84755999088
#       x: [ 4.477e-01  7.687e-01 ... -8.899e-01 -4.198e-01]
#     nit: 35
#     jac: [ 3.041e-06  4.583e-06 ...  5.078e-07 -3.875e-10]
#    nfev: 440

params = fmin.x
X, theta = deserialize(params, movies, users, features)

predictions = X @ theta.T
# predicted rates of generated user
my_preds = predictions[:, 0] + means.flatten()
# sort by predicted rates
idx = np.argsort(my_preds)[::-1]

# top10 for the generated user
top10 = idx[:10]

for i in top10:
    print('Predicting rating %.1f for movie %s.' % (my_preds[i], movies_list[i]))
# Predicting rating 5.0 for movie Entertaining Angels: The Dorothy Day Story (1996).
# Predicting rating 5.0 for movie Someone Else's America (1995).
# Predicting rating 5.0 for movie Marlene Dietrich: Shadow and Light (1996).
# Predicting rating 5.0 for movie Santa with Muscles (1996).
# Predicting rating 5.0 for movie They Made Me a Criminal (1939).
# Predicting rating 5.0 for movie Saint of Fort Washington, The (1993).
# Predicting rating 5.0 for movie Star Kid (1997).
# Predicting rating 5.0 for movie Great Day in Harlem, A (1994).
# Predicting rating 5.0 for movie Prefontaine (1997).
# Predicting rating 5.0 for movie Aiqing wansui (1994).