简介
本文是吴恩达《机器学习》习题八的python解答,包含异常检测和推荐系统。
异常检测代码
我们的第一个任务是使用高斯模型来检测数据集中未标记的示例是否应被视为异常。 我们先从简单的二维数据集开始。
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.io as sio
data = sio.loadmat('../data_sets/ex8data1.mat')
X1 = data['X']
fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(X1[:,0],X1[:,1])
plt.show()
# check distribution of each feature
plt.hist(X1[:,0],bins=50)
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Distribution of x')
plt.show()
plt.hist(X1[:,1],bins=50)
plt.xlabel('Value')
plt.ylabel('Frequency')
plt.title('Distribution of y')
plt.show()
可以看到两个维度的分布,大致满足正态分布。如果不满足的话,可以对数据进行变换,尽量拟合正态分布。
def compute_gaussian_separately(X):
mu = np.mean(X,axis=0)
sigma = np.var(X,axis=0)
return mu,sigma
mu1, sigma1 = compute_gaussian_separately(X1)
mu1,sigma1
# (array([14.11222578, 14.99771051]), array([1.83263141, 1.70974533]))
# probability
def compute_res(x,y,mu,sigma):
return np.exp((-0.5)*(x-mu[0])**2/sigma[0] + (-0.5)*(y-mu[1])**2/sigma[1])
xplot = np.linspace(0,25,100)
yplot = np.linspace(0,25,100)
Xplot, Yplot = np.meshgrid(xplot,yplot)
Z = compute_res(Xplot,Yplot,mu1,sigma1)
fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(X1[:,0],X1[:,1])
ax.contour(Xplot,Yplot,Z,[10**-11,10**-7,10**-5,10**-3,10**-2,0.1,0.5,0.9])
plt.show()
from scipy import stats
X1val = data['Xval']
y1val = data['yval']
pval = np.zeros((X1val.shape[0],X1val.shape[1]))
pval[:,0] = stats.norm(mu1[0],sigma1[0]).pdf(X1val[:,0])
pval[:,1] = stats.norm(mu1[1],sigma1[1]).pdf(X1val[:,1])
def find_best_delta(p,y):
recall = 0
best_f1 = 0
best_delta = 0
final_presicion = 0
final_recall = 0
step_size = p.max() / 10000
for delta in np.arange(0, p.max(), step_size):
y_pred = (p<delta).astype(int)
y_pred = y_pred[:,0] + y_pred[:,1]
y_pred = y_pred.reshape(len(y_pred),1)
tp = np.sum((y==1)&(y_pred > 0))
fp = np.sum((y==0)&(y_pred > 0))
fn = np.sum((y==1)&(y_pred == 0))
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1 = 2*precision*recall/(precision+recall)
if f1 > best_f1:
best_f1 = f1
best_delta = delta
final_presicion = precision
final_recall = recall
return best_delta,best_f1,final_presicion,final_recall
def find_best_delta_with_arr(p,y,arr):
recall = 0
best_f1 = 0
best_delta = 0
final_presicion = 0
final_recall = 0
for delta in arr:
y_pred = (p<delta).astype(int)
y_pred = y_pred.reshape(len(y_pred),1)
y = y.reshape(len(y),1)
tp = np.sum((y==1)&(y_pred==1))
fp = np.sum((y==0)&(y_pred==1))
fn = np.sum((y==1)&(y_pred==0))
precision = tp/(tp+fp)
recall = tp/(tp+fn)
f1 = 2*precision*recall/(precision+recall)
if f1 > best_f1:
best_f1 = f1
best_delta = delta
final_presicion = precision
final_recall = recall
return best_delta,best_f1,final_presicion,final_recall
delta,f1,precision,recall = find_best_delta(pval,y1val)
delta,f1,precision,recall
# (0.00590335760854941, 0.8750000000000001, 1.0, 0.7777777777777778)
# plot result on validate dataset
val_pred = (pval<delta).astype(int)
val_pred = val_pred[:,0] + val_pred[:,1]
val_pred = val_pred.reshape(len(val_pred),1)
outliers = np.where(val_pred > 0)
fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(X1val[:,0],X1val[:,1])
ax.scatter(X1val[outliers[0],0],X1val[outliers[0],1],c='y')
miss = np.where((y1val==1)&(val_pred==0))
ax.scatter(X1val[miss[0],0],X1val[miss[0],1],s=50,marker='o',facecolors='none',edgecolors='r')
ax.legend(['Normal','Anomaly','Missed Anomaly'])
plt.show()
# plot result on train dataset
p = np.zeros((X1.shape[0],X1.shape[1]))
p[:,0] = stats.norm(mu1[0],sigma1[0]).pdf(X1[:,0])
p[:,1] = stats.norm(mu1[1],sigma1[1]).pdf(X1[:,1])
p = (p < delta).astype(int)
p = p[:,0] + p[:,1]
outliers = np.where(p > 0)
fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(X1[:,0],X1[:,1])
ax.scatter(X1[outliers[0],0],X1[outliers[0],1],s=50,marker='o',facecolors='none',edgecolors='r')
ax.legend(['Normal','Anomaly'])
plt.show()
上文我们是一个特征一个特征的去判断,当出现一个特征的值出现的概率低与阈值我们就认为是异常值。接下来我们整体一起考虑。
mu = np.mean(X1,axis=0)
cov = np.cov(X1.T)
cov
# array([[ 1.83862041, -0.22786456],
# [-0.22786456, 1.71533273]])
from scipy.stats import multivariate_normal
def multivariate_gaussian(X,mu,cov):
p = multivariate_normal.pdf(X,mean=mu,cov=cov)
return p
pval = multivariate_gaussian(X1val, mu, cov)
pval.min(),pval.mean(),pval.max()
# (9.237683290526336e-41, 0.05861430429098488, 0.09036240676156386)
arr = np.linspace(-41,-1,40)
arr = [1 * 10**i for i in arr]
delta,f1,precision,recall = find_best_delta_with_arr(pval,y1val,arr)
delta,f1,precision,recall
# (8.376776400682856e-05, 0.8750000000000001, 1.0, 0.7777777777777778)
pval.shape
# (307,)
# plot result
outliers = np.where(pval < delta)
fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(X1val[:,0],X1val[:,1])
ax.scatter(X1val[outliers[0],0],X1val[outliers[0],1],s=50,c='y')
miss = (pval < delta).astype(int) - y1val.ravel()
miss = np.where(miss==-1)
ax.scatter(X1val[miss[0],0],X1val[miss[0],1],s=50,marker='o',facecolors='none',edgecolors='r')
wrong = (pval < delta).astype(int) - y1val.ravel()
wrong = np.where(wrong==1)
ax.scatter(X1val[wrong[0],0],X1val[wrong[0],1],s=50,marker='o',facecolors='none',edgecolors='g')
ax.legend(['Normal','Anomaly','Missed Anomaly','Wrongly Labeled Normal'])
plt.show()
可以看到两种方法得到了一样的结果。根据分布概率来判断异常无法找到在簇中心附近的异常值,需要使用别的二分类方法才可以。
电影推荐
推荐引擎使用基于项目和用户的相似性度量来检查用户的历史偏好,以便为用户可能感兴趣的新“事物”提供建议。在本练习中,我们将实现一种称为协作过滤的特定推荐系统算法,并将其应用于电影评分的数据集。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.io import loadmat
data = loadmat('../data_sets/ex8_movies.mat')
# Y is the rate matrix
# R[m,u] marks whether user_u rated movie_m, equals 1 if true, else 0.
Y = data['Y']
R = data['R']
Y.shape, R.shape
# ((1682, 943), (1682, 943))
# flat params
def seriliaze(Y, R):
return np.hstack((Y.flatten(), R.flatten()))
# reshape parems
def deserialize(params, n_movies, n_users, n_features):
X = params[:n_movies * n_features].reshape(n_movies, n_features)
theta = params[n_movies * n_features:].reshape(n_users, n_features)
return X, theta
def cost(params, Y, R, n_features,lambd):
X,theta = deserialize(params, Y.shape[0], Y.shape[1], n_features)
# only rated values matter
inner = np.multiply(X @ theta.T - Y, R)
lambd = (lambd / 2) * (np.sum(X**2) + np.sum(theta**2))
return np.sum(inner**2) / 2 + lambd
def gradient(params,Y,R,n_features,lambd):
X,theta = deserialize(params, Y.shape[0], Y.shape[1], n_features)
inner = np.multiply(X @ theta.T - Y, R)
X_grad = inner @ theta + lambd * X
theta_grad = inner.T @ X + lambd * theta
return seriliaze(X_grad, theta_grad)
movies = []
f = open('../data_sets/movie_ids_mod.txt')
for line in f.readlines():
tokens = line.strip().split(' ')
movies.append(' '.join(tokens[1:]))
movies = np.array(movies)
movies_list = movies
# add a new user
ratings = np.zeros((1682, 1))
ratings[0] = 4
ratings[6] = 3
ratings[11] = 5
ratings[53] = 4
ratings[63] = 5
ratings[65] = 3
ratings[68] = 5
ratings[97] = 2
ratings[182] = 4
ratings[225] = 5
ratings[354] = 5
Y = data['Y']
Y = np.append(ratings,Y, axis=1)
print(Y.shape)
# (1682, 944)
R = data['R']
R = np.append( ratings != 0, R,axis=1)
R.shape
# (1682, 944)
movies = Y.shape[0] # 1682
users = Y.shape[1] # 944
features = 10
X = np.random.random(size=(movies, features))
theta = np.random.random(size=(users, features))
params = seriliaze(X, theta)
X.shape, theta.shape, params.shape
# ((1682, 10), (944, 10), (26260,))
# mean normalization, treat missing value in Y as mean value
means = np.zeros((Y.shape[0],1))
for i in range(len(means)):
non_zero = np.where(R[i] == 1)[0]
means[i] = Y[i, non_zero].mean()
Y = Y - means
lambd = 10
from scipy.optimize import minimize
fmin = minimize(fun=cost, x0=params, args=(Y, R, features, lambd),
method='TNC', jac=gradient)
fmin
# message: Converged (|f_n-f_(n-1)| ~= 0)
# success: True
# status: 1
# fun: 38951.84755999088
# x: [ 4.477e-01 7.687e-01 ... -8.899e-01 -4.198e-01]
# nit: 35
# jac: [ 3.041e-06 4.583e-06 ... 5.078e-07 -3.875e-10]
# nfev: 440
params = fmin.x
X, theta = deserialize(params, movies, users, features)
predictions = X @ theta.T
# predicted rates of generated user
my_preds = predictions[:, 0] + means.flatten()
# sort by predicted rates
idx = np.argsort(my_preds)[::-1]
# top10 for the generated user
top10 = idx[:10]
for i in top10:
print('Predicting rating %.1f for movie %s.' % (my_preds[i], movies_list[i]))
# Predicting rating 5.0 for movie Entertaining Angels: The Dorothy Day Story (1996).
# Predicting rating 5.0 for movie Someone Else's America (1995).
# Predicting rating 5.0 for movie Marlene Dietrich: Shadow and Light (1996).
# Predicting rating 5.0 for movie Santa with Muscles (1996).
# Predicting rating 5.0 for movie They Made Me a Criminal (1939).
# Predicting rating 5.0 for movie Saint of Fort Washington, The (1993).
# Predicting rating 5.0 for movie Star Kid (1997).
# Predicting rating 5.0 for movie Great Day in Harlem, A (1994).
# Predicting rating 5.0 for movie Prefontaine (1997).
# Predicting rating 5.0 for movie Aiqing wansui (1994).
数据集
链接: https://pan.baidu.com/s/1zteJBsMJ0GRwqRb5opOgwg 提取码: 78ah