1.写在前面
吴恩达机器学习的课后作业及数据可以在coursera平台上进行下载,只要注册一下就可以添加课程了。所以这里就不写题目和数据了,有需要的小伙伴自行去下载就可以了。
作业及数据下载网址:吴恩达机器学习课程
2.异常检测作业1
作业一中主要是异常检测,并标注异常点,分别采用单变量高斯分布和多变量高斯分布。
下面附上代码,有详细的注释,这里就不一一解释了。
# author:FLC
# time:2021/7/10
import numpy as np
import scipy.io as scio
import math
import matplotlib.pyplot as plt
# 用于导入数据的函数
def input_data():
    """Load the ex8data1 dataset.

    Returns (X, X_val, y_val): the unlabeled training set and the
    labeled cross-validation set used for threshold selection.
    """
    path = 'machine-learning-ex8\\machine-learning-ex8\\ex8\\ex8data1.mat'
    mat = scio.loadmat(path)
    return mat['X'], mat['Xval'], mat['yval']
# 用于计算单变量高斯分布
def univariate_gaussian_distribution(X, mean, variance):
    """Density of each row of X under independent per-feature Gaussians.

    Bug fix: the parameter was misspelled `varinace` while the body read
    `variance`, so the function silently used a module-level global instead
    of its argument. Renaming the parameter (callers pass positionally)
    makes the function self-contained.

    X: (m, n) samples; mean, variance: (n,) per-feature statistics.
    Returns an (m,) array: the product of the per-feature densities
    (features assumed independent).
    """
    p = np.exp(-np.power(X - mean, 2) / (2 * variance)) / np.sqrt(2 * math.pi * variance)
    # Independence assumption: total density is the product over features.
    return np.prod(p, axis=1)
# 用于计算多变量高斯分布
def multivariate_gaussian_distribution(X, mean, covariance):
    """Multivariate Gaussian density of each row of X.

    X: (m, n) samples; mean: (n,); covariance: (n, n).
    Returns an (m,) array of densities.

    Improvement: the original built the full (m, m) matrix
    `(X-mean) @ inv @ (X-mean).T` only to take its diagonal — O(m^2)
    memory. The row-wise quadratic form below is equivalent and O(m*n).
    """
    d = X - mean
    quad = np.sum(d @ np.linalg.inv(covariance) * d, axis=1)  # row-wise (x-mu)^T S^-1 (x-mu)
    norm = np.power(2 * math.pi, 0.5 * X.shape[1]) * np.sqrt(np.linalg.det(covariance))
    return np.exp(-0.5 * quad) / norm
# 用于选取阈值的函数
def select_threshold(X_val, y_val, mean, variance, covariance):
    """Pick the anomaly threshold epsilon that maximizes F1 on the validation set.

    Scans 1000 candidate thresholds between min and max validation density;
    a point is flagged anomalous when its density is below epsilon.
    Returns (best_epsilon, best_f1).

    Improvement: the per-point Python inner loop was replaced by vectorized
    boolean masks — same counts, O(m) NumPy work per candidate threshold.
    """
    p = univariate_gaussian_distribution(X_val, mean, variance)  # univariate model
    # p = multivariate_gaussian_distribution(X_val, mean, covariance)  # multivariate alternative
    y = y_val.ravel()  # flatten the (m, 1) label column for elementwise comparison
    choose_epsilon = 0  # best threshold so far
    f = 0  # best F1 so far
    for epsilon in np.linspace(min(p), max(p), 1000):
        pred = p < epsilon  # predicted anomalies at this threshold
        tp = np.sum(pred & (y == 1))   # true positives
        fp = np.sum(pred & (y == 0))   # false positives
        fn = np.sum(~pred & (y == 1))  # false negatives
        prec = tp / (tp + fp) if (tp + fp) != 0 else 0  # precision
        rec = tp / (tp + fn) if (tp + fn) != 0 else 0   # recall
        com_f = 2 * prec * rec / (prec + rec) if (prec + rec) != 0 else 0  # F1
        if com_f > f:
            choose_epsilon = epsilon
            f = com_f
    return choose_epsilon, f
# 用于绘制等高线的函数
def plotContour(X, X_val, y_val, mean, variance, covariance):
    """Scatter the training data, mark points below the selected threshold as
    anomalies, and overlay density contours of the fitted Gaussian model."""
    choose_epsilon, f = select_threshold(X_val, y_val, mean, variance, covariance)  # best threshold from the validation set
    p = univariate_gaussian_distribution(X, mean, variance)  # density of each training point (univariate model)
    # p = multivariate_gaussian_distribution(X, mean, covariance)  # alternative: multivariate model
    # Scatter the raw training data
    fig, ax = plt.subplots(1, 1)
    ax.scatter(X[:, 0], X[:, 1])  # raw data points
    anoms = np.array([X[i] for i in range(0, X.shape[0]) if p[i] < choose_epsilon])  # points flagged as anomalies
    ax.scatter(anoms[:, 0], anoms[:, 1], color='r', marker='x')  # mark anomalies with red crosses
    # Evaluate the density on a regular grid to draw contour lines
    x_min, x_max = 0, 30
    y_min, y_max = 0, 30
    x = np.arange(x_min, x_max, 0.3)
    y = np.arange(y_min, y_max, 0.3)
    xx, yy = np.meshgrid(x, y)  # grid of evaluation points
    z = univariate_gaussian_distribution(np.c_[xx.ravel(), yy.ravel()], mean, variance)  # univariate density on the grid
    # z = multivariate_gaussian_distribution(np.c_[xx.ravel(), yy.ravel()], mean, covariance)  # alternative: multivariate
    zz = z.reshape(xx.shape)  # back to the grid's 2-D shape
    cont_levels = [10 ** h for h in range(-20, 0, 3)]  # logarithmically spaced contour levels
    ax.contour(xx, yy, zz, cont_levels)  # draw the density contours
    plt.show()
X, X_val, y_val = input_data()  # load training and labeled validation data
# NOTE: the contour plot must use the TRAINING data's mean/variance/covariance
mean = np.mean(X, axis=0)  # per-feature mean of the training set
variance = np.var(X, axis=0)  # per-feature variance, for the univariate Gaussian
covariance = np.cov(X, rowvar=False)  # covariance matrix, for the multivariate Gaussian
choose_epsilon, f = select_threshold(X_val, y_val, mean, variance, covariance)  # pick threshold on the validation set
print('choose_epsilon:', choose_epsilon, ' f:', f)  # report chosen threshold and its F1 score
plotContour(X, X_val, y_val, mean, variance, covariance)  # visualize data, anomalies and density contours
结果展示:
单变量高斯分布:
多变量高斯分布:
3.异常检测作业2
作业二中主要是异常检测,实际上只是对作业1升高了数据的维度(并没有什么区别),分别采用单变量高斯分布和多变量高斯分布。
下面附上代码,有详细的注释,这里就不一一解释了。
# author:FLC
# time:2021/7/11
import numpy as np
import scipy.io as scio
import math
import matplotlib.pyplot as plt
# 用于导入数据的函数
def input_data():
    """Load the ex8data2 dataset.

    Returns (X, X_val, y_val): the unlabeled training set and the
    labeled cross-validation set used for threshold selection.
    """
    path = 'machine-learning-ex8\\machine-learning-ex8\\ex8\\ex8data2.mat'
    mat = scio.loadmat(path)
    return mat['X'], mat['Xval'], mat['yval']
# 用于计算单变量高斯分布
def univariate_gaussian_distribution(X, mean, variance):
    """Density of each row of X under independent per-feature Gaussians.

    Bug fix: the parameter was misspelled `varinace` while the body read
    `variance`, so the function silently used a module-level global instead
    of its argument. Renaming the parameter (callers pass positionally)
    makes the function self-contained.

    X: (m, n) samples; mean, variance: (n,) per-feature statistics.
    Returns an (m,) array: the product of the per-feature densities
    (features assumed independent).
    """
    p = np.exp(-np.power(X - mean, 2) / (2 * variance)) / np.sqrt(2 * math.pi * variance)
    # Independence assumption: total density is the product over features.
    return np.prod(p, axis=1)
# 用于计算多变量高斯分布
def multivariate_gaussian_distribution(X, mean, covariance):
    """Multivariate Gaussian density of each row of X.

    X: (m, n) samples; mean: (n,); covariance: (n, n).
    Returns an (m,) array of densities.

    Improvement: the original built the full (m, m) matrix
    `(X-mean) @ inv @ (X-mean).T` only to take its diagonal — O(m^2)
    memory. The row-wise quadratic form below is equivalent and O(m*n).
    """
    d = X - mean
    quad = np.sum(d @ np.linalg.inv(covariance) * d, axis=1)  # row-wise (x-mu)^T S^-1 (x-mu)
    norm = np.power(2 * math.pi, 0.5 * X.shape[1]) * np.sqrt(np.linalg.det(covariance))
    return np.exp(-0.5 * quad) / norm
# 用于选取阈值的函数
def select_threshold(X_val, y_val, mean, variance, covariance):
    """Pick the anomaly threshold epsilon that maximizes F1 on the validation set.

    Scans 1000 candidate thresholds between min and max validation density;
    a point is flagged anomalous when its density is below epsilon.
    Returns (best_epsilon, best_f1).

    Improvement: the per-point Python inner loop was replaced by vectorized
    boolean masks — same counts, O(m) NumPy work per candidate threshold.
    """
    p = univariate_gaussian_distribution(X_val, mean, variance)  # univariate model
    # p = multivariate_gaussian_distribution(X_val, mean, covariance)  # multivariate alternative
    y = y_val.ravel()  # flatten the (m, 1) label column for elementwise comparison
    choose_epsilon = 0  # best threshold so far
    f = 0  # best F1 so far
    for epsilon in np.linspace(min(p), max(p), 1000):
        pred = p < epsilon  # predicted anomalies at this threshold
        tp = np.sum(pred & (y == 1))   # true positives
        fp = np.sum(pred & (y == 0))   # false positives
        fn = np.sum(~pred & (y == 1))  # false negatives
        prec = tp / (tp + fp) if (tp + fp) != 0 else 0  # precision
        rec = tp / (tp + fn) if (tp + fn) != 0 else 0   # recall
        com_f = 2 * prec * rec / (prec + rec) if (prec + rec) != 0 else 0  # F1
        if com_f > f:
            choose_epsilon = epsilon
            f = com_f
    return choose_epsilon, f
X, X_val, y_val = input_data()  # load training and labeled validation data
mean = np.mean(X, axis=0)  # per-feature mean of the training set
variance = np.var(X, axis=0)  # per-feature variance (univariate model)
covariance = np.cov(X, rowvar=False)  # covariance matrix (multivariate model)
choose_epsilon, f = select_threshold(X_val, y_val, mean, variance, covariance)  # threshold chosen on the validation set
# p = univariate_gaussian_distribution(X, mean, variance)
p = multivariate_gaussian_distribution(X, mean, covariance)  # density of each training point
# NOTE(review): select_threshold currently scores the validation set with the
# UNIVARIATE model while the threshold is applied here to MULTIVARIATE
# densities — the commented toggles should be switched consistently in both places.
num = sum(p < choose_epsilon)  # number of points flagged as anomalies
# print('单变量高斯分布')
print('多变量高斯分布')
print('异常点个数', num)
print('阈值', choose_epsilon)
结果展示:
单变量高斯分布:
多变量高斯分布:
4.推荐系统作业
推荐系统作业就是对用户进行电影的推荐,和课程视频中讲解的案例是一样的。
下面附上代码,有详细的注释,这里就不一一解释了。
# author:FLC
# time:2021/7/11
import numpy as np
import scipy.io as scio
from scipy.optimize import minimize # 导入优化和拟合库中的最小优化
# 用于导入数据的函数
def input_data():
    """Load the ex8_movies dataset.

    Returns (Y, R): the movie-by-user ratings matrix and the binary
    indicator matrix of which entries were actually rated.
    """
    path = 'machine-learning-ex8\\machine-learning-ex8\\ex8\\ex8_movies.mat'
    mat = scio.loadmat(path)
    return mat['Y'], mat['R']
# 用于计算代价函数
def computeCosts(theta_x, Y, R, lamda, n_features=10):
    """Regularized collaborative-filtering cost.

    theta_x: flat parameter vector (theta first, then x, as produced by serialize).
    Y: (movies, users) ratings matrix; R: same-shape indicator of rated entries.
    lamda: regularization strength.
    n_features: number of latent features — parameterizes the previously
    hard-coded constant 10 (default keeps the original behavior).
    Returns the scalar cost.
    """
    theta, x = deserialize(theta_x, Y.shape[1], n_features, Y.shape[0], n_features)
    # Squared prediction error, counted only on rated entries (masked by R).
    fit_cost = 0.5 * np.sum(np.power((x @ theta.T - Y) * R, 2))
    # L2 penalty on both factor matrices.
    reg_cost = lamda * (np.sum(np.power(x, 2)) + np.sum(np.power(theta, 2))) / 2
    return fit_cost + reg_cost
# 用于计算梯度的函数
def computeGradient(theta_x, Y, R, lamda, n_features=10):
    """Gradient of the regularized collaborative-filtering cost.

    Parameters mirror computeCosts; n_features parameterizes the previously
    hard-coded latent-feature count 10 (default keeps the original behavior).
    Returns the gradients serialized in the same order (theta first, then x)
    expected by scipy.optimize.minimize.
    """
    theta, x = deserialize(theta_x, Y.shape[1], n_features, Y.shape[0], n_features)
    # Prediction error, zeroed on unrated entries; hoisted because it is
    # shared by both gradient terms (the original computed it twice).
    err = (x @ theta.T - Y) * R
    gradient_x = err @ theta + lamda * x          # d(cost)/d(x)
    gradient_theta = err.T @ x + lamda * theta    # d(cost)/d(theta)
    return serialize(gradient_theta, gradient_x)
# 计算均值化的函数
def normalize(Y, R):
    """Mean-normalize the ratings matrix per movie.

    Computes each movie's mean over its *rated* entries only, then
    subtracts it; unrated entries stay at zero because of the R mask.
    Returns (u, Y_norm) where u is the (movies, 1) column of means.
    """
    rated_counts = np.sum(R == 1, axis=1) + 1e-6  # epsilon guards division by zero for movies with no ratings
    u = (np.sum(Y, axis=1) / rated_counts).reshape(-1, 1)  # column vector for broadcasting
    return u, (Y - u) * R
# 进行序列化的函数
def serialize(theta, x):
    """Flatten theta and x and concatenate them into one 1-D vector (theta first)."""
    return np.concatenate((theta.ravel(), x.ravel()))
# 进行反序列化的函数
def deserialize(serialize_data, theta1, theta2, x1, x2):
    """Split a flat parameter vector back into theta (theta1 x theta2) and x (x1 x x2)."""
    split_at = theta1 * theta2  # theta occupies the leading segment
    theta = serialize_data[:split_at].reshape(theta1, theta2)
    x = serialize_data[split_at:].reshape(x1, x2)
    return theta, x
# 用于学习参数的函数
def learning_param(theta, x, Y, R, lamda):
    """Fit the user factors (theta) and movie factors (x) by minimizing the
    collaborative-filtering cost with the TNC optimizer (max 100 iterations).

    Returns the optimized (theta, x) pair with their original shapes.
    """
    packed = serialize(theta, x)  # optimizer works on one flat vector
    result = minimize(fun=computeCosts, x0=packed, args=(Y, R, lamda),
                      method='TNC', jac=computeGradient, options={'maxiter': 100})
    return deserialize(result.x, Y.shape[1], 10, Y.shape[0], 10)
lamda = 1  # regularization strength
Y, R = input_data()  # ratings matrix (movies x users) and rated-entry indicator
# Append a new user who has not rated anything yet.
my_rating = np.zeros(Y.shape[0])
Y = np.insert(Y, Y.shape[1], my_rating, axis=1)
R = np.insert(R, R.shape[1], my_rating, axis=1)
u, Y = normalize(Y, R)  # mean-normalize per movie; u holds the removed means
x = np.random.rand(Y.shape[0], 10)      # random init of movie features
theta = np.random.rand(Y.shape[1], 10)  # random init of user features
theta, x = learning_param(theta, x, Y, R, lamda)  # learn the factorization
# Bug fix: training used mean-normalized ratings, so the per-movie means must
# be added back to obtain actual predicted scores (this also gives the new
# user each movie's average rating rather than ~0).
predict_Y = x @ theta.T + u
# Bug fix: np.argsort is ascending, so the original [:10] picked the ten
# LOWEST predictions; reverse first to report the highest-rated movies.
print('预测评分最高的十部电影序号:', np.argsort(predict_Y[:, -1])[::-1][:10] + 1)
结果展示: