# Anomaly Detection — ex8.py
from scipy.io import loadmat
from ex8Modules import *
import numpy as np
# Part 1: anomaly detection on a low-dimensional (2-feature) example.
# Fit a Gaussian to server latency/throughput data, choose the threshold
# epsilon by F1 score on the cross-validation set, then flag outliers.
data=loadmat('ex8data1.mat')
X=data['X']  # training set; X.shape (307,2)
Xval=data['Xval']  # cross-validation features; Xval.shape (307,2)
yval=data['yval']  # CV labels, 1 = anomaly; yval.shape (307,1)
visualize(X)
mu,sigma=estimateGaussian(X)  # per-feature mean and variance (sigma here is sigma^2)
visualizeFit(X,mu,sigma,0,0)  # flag=0: contours only, outliers argument unused
p=multiGaussian(X,mu,sigma)  # density of each training example
pval=multiGaussian(Xval,mu,sigma)  # density of each CV example
_,epsilon=selectThreshold(yval,pval)  # best threshold by F1 on the CV set
outliers=np.where(p<epsilon)  # indices of training points flagged as anomalies
visualizeFit(X,mu,sigma,outliers,1)  # flag=1: highlight outliers in red
# Part 2: the same pipeline on a high-dimensional example
data=loadmat('ex8data2.mat')
X=data['X']
Xval=data['Xval']
yval=data['yval']
mu,sigma=estimateGaussian(X)
p=multiGaussian(X,mu,sigma)
pval=multiGaussian(Xval,mu,sigma)
F1,epsilon=selectThreshold(yval,pval)
print("Best epsilon found using cv-sets: ",epsilon)
print("Best F1 score on cv-sets: ",F1)
print("Outliers found:",np.sum(p<epsilon))
# Recommender System — ex8cofi.py
from scipy.io import loadmat
import numpy as np
import matplotlib.pyplot as plt
import scipy.optimize as op
from ex8Modules import *
# Part 1: load the movie ratings dataset.
# Y (num_movies x num_users) holds ratings 1-5, with 0 for unrated;
# R is the indicator matrix: R[i,j] = 1 iff user j rated movie i.
moviedata=loadmat('ex8_movies.mat')
Y=moviedata['Y']
R=moviedata['R']
ysum=np.sum(Y[0,:])  # sum of all ratings for movie 1 (unrated entries are 0)
rsum=np.where(R[0,:]==1)[0].shape[0]  # number of users who rated movie 1
print("Average rating for movie 1 (Toy Story): %f /5\n" % (ysum/rsum))
plt.imshow(Y,aspect='auto')  # visualize the whole ratings matrix as an image
plt.ylabel("Movies")
plt.xlabel("Users")
plt.show()
# Part 2: collaborative filtering cost function and gradient.
# Sanity-check the implementation on a small slice of pre-trained parameters.
params=loadmat('ex8_movieParams.mat')
X=params['X']  # movie feature matrix
Theta=params['Theta']  # user preference matrix
num_users=4
num_movies=5
num_features=3
X_test=X[:num_movies,:num_features]
Theta_test=Theta[:num_users,:num_features]
Y_test=Y[:num_movies,:num_users]
R_test=R[:num_movies,:num_users]
param=np.r_[X_test,Theta_test].ravel()  # pack (X, Theta) into one flat C-order vector
J=cofiCostFunc(param,Y_test,R_test,num_users,num_movies,num_features,lamda=0)
print("Cost at loaded parameters with lambda=0: %f (should be about 22.22)" % J)
J=cofiCostFunc(param,Y_test,R_test,num_users,num_movies,num_features,lamda=1.5)
print("Cost at loaded parameters with lambda=1.5: %f (should be about 34.34)" % J)
# Part 3: enter ratings for a new user (indices are 0-based movie ids
# into movieList; value is the 1-5 star rating, 0 = unrated).
movieList=loadMovieList()
my_ratings=np.zeros((len(movieList),1))
my_ratings[0]=4
my_ratings[97]=2
my_ratings[6]=3
my_ratings[11]= 5
my_ratings[53] = 4
my_ratings[63]= 5
my_ratings[65]= 3
my_ratings[68] = 5
my_ratings[182] = 4
my_ratings[225] = 5
my_ratings[354]= 5
print("\nNew user ratings: ")
for i in range(len(movieList)):
    if my_ratings[i]>0:
        print("Rated %d for %s" % (my_ratings[i],movieList[i]))
# Part 4: learn movie features and user preferences with collaborative filtering.
Y=np.column_stack((my_ratings,Y))  # prepend the new user's ratings as column 0
R=np.column_stack(((my_ratings!=0)+0,R))  # ...and the matching indicator column
ymean,ynorm=normalizeRatings(Y,R)  # mean-normalize each movie's rated entries
num_users=Y.shape[1]
num_movies=Y.shape[0]
num_features=10
lamda=10  # regularization strength
# Initial parameters: pre-generated random matrices loaded from .mat files
# (see the note further below about np.random.randn initialization scale).
X=loadmat('X.mat')['X']
Theta=loadmat('Theta.mat')['Theta']
init_params=np.r_[X,Theta].ravel()  # flat C-order packing of (X, Theta)
print("\nTraining collaborative filtering...")
results=op.fmin_cg(f=cofiCostFunc,x0=init_params,\
args=(ynorm,R,num_users,num_movies,num_features,lamda),fprime=cofiGrads)
# Unpack the optimized flat parameter vector back into X and Theta
X=results[:num_movies*num_features].reshape((num_movies,num_features))
Theta=results[num_movies*num_features:].reshape((num_users,num_features))
print("Recommender system learning completed.")
# Part 5: recommendations for the new user.
p=X.dot(Theta.T)  # predicted (mean-normalized) ratings for every movie/user pair
my_pred=p[:,0]+ymean.ravel()  # column 0 is the new user; add the per-movie means back
pred_idx=np.argsort(-my_pred)  # movie indices sorted best-first
pred_score=np.sort(-my_pred)  # negated scores: -pred_score[i] is the i-th best rating
print("\nTop recommendations for you :")
for i in range(10):
    print('Predicting rating %.1f for movie %s' % (-pred_score[i],movieList[pred_idx[i]]))
print("\nOriginal user ratings: ")
for i in range(len(movieList)):
    if my_ratings[i]>0:
        print("Rated %d for %s" % (my_ratings[i],movieList[i]))
print()
# Part 6: for each rated movie, list the k most similar movies in feature space
findCloestMovies(my_ratings,X,movieList,10)
这里在初始化电影特征矩阵X和用户喜好矩阵Theta的时候,直接用了matlab里面randn生成并保存的mat矩阵。如果直接用numpy.random.randn(标准差为1),初始cost非常大(约60万),评分结果也不对;乘以一个很小的系数(比如0.1,相当于减小标准差)之后结果就正常了。原因并不是MATLAB和NumPy生成标准正态分布的方式不同——两者都是标准正态分布;而是初始参数的尺度太大,优化从一个很差的起点出发难以收敛,保存下来的mat矩阵恰好尺度较小。
# 方案一:加载matlab预先生成的初始矩阵
X=loadmat('X.mat')['X']
Theta=loadmat('Theta.mat')['Theta']
# 方案二:标准正态初始化 np.random.randn(初始cost很大)
X=np.random.randn(num_movies,num_features)
Theta=np.random.randn(num_users,num_features)
# 方案三:缩放后的随机初始化 np.random.randn*0.1(推荐)
X=np.random.randn(num_movies,num_features)*0.1
Theta=np.random.randn(num_users,num_features)*0.1
最后,写了个找到与评分电影相似的k个电影:
ex8Modules.py
import numpy as np
import math
import matplotlib.pyplot as plt
def estimateGaussian(X):
    """Estimate per-feature Gaussian parameters of the dataset.

    X : (m, n) data matrix, one example per row.
    Returns (mu, sigma2), each of shape (n, 1): the per-feature mean and
    the (biased, divide-by-m) per-feature variance, matching the course's
    maximum-likelihood estimate.
    """
    m = X.shape[0]
    # Vectorized over features; the old per-column Python loop gave the
    # same values but iterated in the interpreter.
    mu = np.mean(X, axis=0).reshape(-1, 1)
    sigma2 = (np.sum(np.square(X - mu.T), axis=0) / m).reshape(-1, 1)
    return mu, sigma2
def multiGaussian(X, mu, sigma2):
    """Evaluate the multivariate Gaussian density at every row of X.

    X      : (m, n) points to evaluate.
    mu     : (n, 1) column vector of means.
    sigma2 : (n, 1) vector of per-feature variances (treated as a diagonal
             covariance) or a full (n, n) covariance matrix.
    Returns a length-m array of densities.
    """
    rows, cols = mu.shape
    center = mu.reshape((cols, rows))  # row vector, broadcastable against X
    # A vector of variances stands for a diagonal covariance matrix.
    if sigma2.shape[0] == 1 or sigma2.shape[1] == 1:
        sigma2 = np.diag(sigma2.ravel())
    diff = X - center
    quad = np.sum(diff.dot(np.linalg.pinv(sigma2)) * diff, axis=1)
    coef = (2 * math.pi) ** (-rows / 2) * np.linalg.det(sigma2) ** (-0.5)
    return coef * np.exp(-0.5 * quad)
def visualize(X):
    """Scatter-plot the two-feature dataset (latency vs. throughput)."""
    latency, throughput = X[:, 0], X[:, 1]
    plt.scatter(latency, throughput, marker='x', c='b')
    # Fixed axes so successive plots of this dataset are comparable.
    plt.xlim(0, 30)
    plt.ylim(0, 30)
    plt.xlabel('Latency (ms)')
    plt.ylabel('Throughput (mb/s)')
    plt.show()
def visualizeFit(X,mu,sigma2,outliers,flag):
    """Plot the data with iso-density contours of the fitted Gaussian.

    X        : (m, 2) data matrix (latency, throughput).
    mu       : (2, 1) feature means.
    sigma2   : (2, 1) feature variances (or a full covariance matrix).
    outliers : np.where-style index tuple of anomalous rows of X
               (ignored unless flag == 1; callers pass 0 in that case).
    flag     : 1 to highlight the outliers in red, anything else to skip them.
    """
    # 71x71 evaluation grid over [0, 35] with 0.5 spacing.
    X1, X2 = np.meshgrid(np.arange(0, 35.5, .5), np.arange(0, 35.5, .5))
    # Build the grid points explicitly, block by block: rows i*71:(i+1)*71
    # all share first coordinate i*0.5 and sweep the second coordinate.
    XX = np.zeros((71 * 71, 2))
    for i in range(71):
        XX[i * 71:(i + 1) * 71, 0] = i * 0.5
        XX[i * 71:(i + 1) * 71, 1] = np.arange(0, 35.5, .5)
    # Density at every grid point; reshape+transpose matches meshgrid layout.
    Z = multiGaussian(XX, mu, sigma2).reshape(71,-1).T
    plt.scatter(X[:, 0], X[:, 1], marker='x', c='b')
    # Truthy when at least one grid density is finite — only then draw contours.
    if (np.sum(np.isinf(Z) == 0)):
        # Logarithmically spaced contour levels 1e-20 .. 1e-2.
        plt.contour(X1, X2, Z, 10.0 ** np.arange(-20, 0, 3))
    if(flag==1):
        # Mark the detected anomalies in red on top of the blue data points.
        plt.scatter(X[outliers[0],0],X[outliers[0],1],marker='x',c='r')
    plt.xlabel('Latency (ms)')
    plt.ylabel('Throughput (mb/s)')
    plt.xlim(0,30)
    plt.ylim(0,30)
    plt.show()
def selectThreshold(yval, pval):
    """Choose the threshold epsilon that maximizes F1 on the CV set.

    yval : (m, 1) or (m,) ground-truth labels, 1 = anomaly.
    pval : (m,) densities of the CV examples under the fitted Gaussian.
    Returns (bestF1, bestEpsilon); an example is flagged anomalous when
    its density is below epsilon.
    """
    y = (yval.ravel() == 1)
    p = pval.ravel()
    bestEpsilon = 0
    bestF1 = 0
    # Sweep 1000 evenly spaced candidate thresholds, as in the course.
    stepsize = (np.max(p) - np.min(p)) / 1000
    for epsilon in np.arange(np.min(p), np.max(p), stepsize):
        # Vectorized prediction replaces the old per-example Python loop
        # (same counts, ~1000x fewer interpreter iterations).
        pred = p < epsilon
        tp = np.sum(y & pred)
        fp = np.sum(~y & pred)
        fn = np.sum(y & ~pred)
        # Tiny additive constants guard against 0/0, as before.
        prec = tp / (tp + fp + 1e-20)
        rec = tp / (tp + fn + 1e-20)
        F1 = 2 * prec * rec / (prec + rec + 1e-20)
        if F1 > bestF1:
            bestF1 = F1
            bestEpsilon = epsilon
    return bestF1, bestEpsilon
def cofiCostFunc(params, Y, R, num_users, num_movies, num_features, lamda):
    """Regularized collaborative-filtering cost.

    params : flat C-order concatenation of X (num_movies x num_features)
             followed by Theta (num_users x num_features).
    Y      : (num_movies x num_users) ratings matrix.
    R      : (num_movies x num_users) indicator; R[i,j]=1 iff user j rated movie i.
    lamda  : regularization strength.
    Returns the scalar cost J.
    """
    X = params[:num_movies * num_features].reshape((num_movies, num_features))
    Theta = params[num_movies * num_features:].reshape((num_users, num_features))
    # Mask the *residual* with R so unrated entries never contribute.
    # The previous form R*X.dot(Theta.T) - Y only matched this when Y was
    # zero wherever R was zero (true for this dataset, not in general).
    err = R * (X.dot(Theta.T) - Y)
    J = np.sum(np.square(err)) / 2 \
        + lamda / 2 * (np.sum(np.square(Theta)) + np.sum(np.square(X)))
    return J
def cofiGrads(params, Y, R, num_users, num_movies, num_features, lamda):
    """Gradient of the collaborative-filtering cost w.r.t. the flat params.

    Same arguments as cofiCostFunc; returns a 1-D gradient vector whose
    layout matches params: X gradients first, then Theta gradients, both
    flattened in C order.
    """
    X = params[:num_movies * num_features].reshape((num_movies, num_features))
    Theta = params[num_movies * num_features:].reshape((num_users, num_features))
    # Masked residual, consistent with the cost function.
    err = R * (X.dot(Theta.T) - Y)
    X_grads = err.dot(Theta) + lamda * X
    Theta_grads = err.T.dot(X) + lamda * Theta
    # Flatten in C order to mirror how params is packed/unpacked
    # (np.r_[X, Theta].ravel() and .reshape(...) both default to C order).
    # The previous ravel('F') returned components misaligned with the
    # parameter vector, corrupting the search direction in fmin_cg.
    return np.r_[X_grads.ravel(), Theta_grads.ravel()]
def loadMovieList():
    """Read movie titles from movie_ids.txt in the current directory.

    Each line is "<id> <title>"; the list is returned in file order so the
    0-based list index corresponds to id-1. The file ships with the course
    dataset in ISO-8859-1 (Latin-1) encoding.

    Returns a list of title strings.
    """
    movies = []
    # Decode as Latin-1 text instead of the old str(bytes) repr hack,
    # which mangled accented characters and stripped quotes/apostrophes
    # out of titles.
    with open('movie_ids.txt', encoding='latin-1') as f:
        for line in f:
            parts = line.rstrip('\n').split(' ', 1)
            # Drop the numeric id; keep everything after the first space.
            movies.append(parts[1] if len(parts) > 1 else '')
    return movies
def normalizeRatings(Y, R):
    """Mean-normalize each movie's ratings over its *rated* entries only.

    Y : (num_movies x num_users) ratings matrix.
    R : (num_movies x num_users) indicator of which entries are rated.
    Returns (ymean, ynorm): ymean is (num_movies, 1) per-movie means;
    ynorm has the mean subtracted at rated positions and 0 elsewhere.
    """
    ymean = np.zeros((Y.shape[0], 1))
    ynorm = np.zeros(Y.shape)
    for i in range(Y.shape[0]):
        idx = np.where(R[i, :] == 1)[0]
        # Guard movies nobody rated (the old code divided by zero), and
        # average only the rated entries — the old np.sum(Y[i, :]) summed
        # the whole row, which was only correct because unrated entries
        # happened to be zero in this dataset.
        if idx.size:
            ymean[i] = np.mean(Y[i, idx])
            ynorm[i, idx] = Y[i, idx] - ymean[i]
    return ymean, ynorm
def findCloestMovies(my_ratings, X, movieList, num):
    """Print, for each movie the user rated, the num nearest movies.

    Nearness is Euclidean distance between learned feature rows of X.
    my_ratings : (num_movies, 1) user ratings, 0 = unrated.
    X          : (num_movies, num_features) learned movie features.
    movieList  : movie titles indexed consistently with X.
    num        : how many similar movies to print per rated movie.
    """
    rated_idx = np.where(my_ratings > 0)[0]
    neighbors = np.zeros((rated_idx.shape[0], num), dtype=int)
    for j, movie in enumerate(X[rated_idx, :]):
        # Distance from this movie to every movie, vectorized over rows.
        dist = np.linalg.norm(X - movie, axis=1)
        # Ascending distance = closest first. The old argsort(-dist)
        # sorted *descending* and therefore returned the farthest movies.
        # Slot 0 is the movie itself (distance 0), so skip it; assign the
        # single row j rather than the old recmvList[j:] slice, which
        # overwrote every remaining row and was only correct by accident.
        neighbors[j, :] = np.argsort(dist)[1:num + 1]
    for j, i in enumerate(rated_idx):
        print("%d movies similar to movie %s is" % (num, movieList[i]))
        for k in range(num):
            print(" %d %s" % (k + 1, movieList[int(neighbors[j, k])]))