HIT Machine Learning Lab 2: Logistic Regression via Gradient Descent and Newton's Method
1. Data Preparation
1.1 Generating the Data
def Data(N, naive_bayes=True, true_rate=0.5):
    # generate N two-dimensional Gaussian samples: positives around miu_true,
    # negatives around miu_false; cov12 is the covariance between the features
    number_true = np.ceil(N * true_rate).astype(np.int32)
    sigma = [0.2, 0.3]
    cov12 = 0.2
    miu_true = [0.9, 1.1]
    miu_false = [-1, -1.3]
    x = np.zeros((N, 2))
    y = np.zeros(N).astype(np.int32)
    if naive_bayes:
        # diagonal covariance: the two features are independent
        x[:number_true, :] = np.random.multivariate_normal(
            mean=miu_true, cov=[[sigma[0], 0], [0, sigma[1]]], size=number_true
        )
        x[number_true:, :] = np.random.multivariate_normal(
            mean=miu_false, cov=[[sigma[0], 0], [0, sigma[1]]], size=N - number_true
        )
        y[:number_true] = 1
        y[number_true:] = 0
    else:
        # nonzero off-diagonal entries: the two features are correlated
        x[:number_true, :] = np.random.multivariate_normal(
            mean=miu_true, cov=[[sigma[0], cov12], [cov12, sigma[1]]], size=number_true
        )
        x[number_true:, :] = np.random.multivariate_normal(
            mean=miu_false,
            cov=[[sigma[0], cov12], [cov12, sigma[1]]],
            size=N - number_true,
        )
        y[:number_true] = 1
        y[number_true:] = 0
    return x, y
The key difference between data that satisfies the naive Bayes assumption and data that does not is whether cov12 is 0, i.e., whether the two features are independent of each other.
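A quick sanity check is to look at the empirical correlation of the generated features (a sketch, assuming numpy is imported as np and the Data function above):

x_nb, _ = Data(5000, naive_bayes=True)
x_corr, _ = Data(5000, naive_bayes=False)
print(np.corrcoef(x_nb.T)[0, 1])    # close to 0: independent features
print(np.corrcoef(x_corr.T)[0, 1])  # about 0.82 = 0.2 / sqrt(0.2 * 0.3): correlated features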
1.2 Plotting the Generated Data for Comparison
def plot_data(x, y):
    plt.figure(figsize=(8, 6))
    plt.scatter(x[y == 1, 0], x[y == 1, 1], color='red', label='Class 1', alpha=0.5)
    plt.scatter(x[y == 0, 0], x[y == 0, 1], color='blue', label='Class 0', alpha=0.5)
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('Data Visualization')
    plt.legend()
    plt.grid(True)
    plt.savefig("ml_lab2/data.png")  # forward slash: "\d" is an invalid escape sequence
    plt.show()
Data satisfying the naive Bayes assumption: (figure below)
Data violating the naive Bayes assumption: (figure below)
The data points form elongated clusters, which indicates a linear correlation between the two dimensions of X.
1.3 Splitting the Data
Split the data into training, validation, and test sets at a 6:2:2 ratio.
def split_data(X, Y, train_size=0.6, val_size=0.2, test_size=0.2):
    # split the dataset into training, validation, and test sets
    X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, train_size=train_size)
    X_val, X_test, Y_val, Y_test = train_test_split(
        X_temp, Y_temp, test_size=test_size / (val_size + test_size)
    )
    return X_train, X_val, X_test, Y_train, Y_val, Y_test
2. Solving the Problem
The loss function of logistic regression is:
$${loss}(w) = \sum_{l} \left( -Y^l {w}^T {X}^l + \ln\left(1 + \exp({w}^T {X}^l)\right) \right) + \frac{\lambda}{2} {w}^T {w}$$
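For a single sample, this is the cross-entropy loss in another form: with $z = w^T X^l$ and $p = \sigma(z)$,

$$-Y^l \ln p - (1 - Y^l)\ln(1 - p) = -Y^l z + \ln(1 + e^z),$$

which is why compute_loss below works with the predicted probabilities $p$ while the formula above works with $w^T X^l$ directly (the code additionally averages over the $m$ samples instead of summing).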
2.1 Gradient Descent
2.1.1 Without the Penalty Term
def compute_loss(X, Y, W):
    m = len(Y)
    predictions = sigmoid(np.dot(X, W))
    loss = -np.sum(Y * np.log(predictions) + (1 - Y) * np.log(1 - predictions)) / m
    return loss
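compute_loss and the functions below rely on two helpers that only appear in the full listing of section 2.3; for reference:

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def predict(X, W):
    return sigmoid(np.dot(X, W)) >= 0.5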
def gradient_descent(
    X_train, Y_train, X_test, Y_test, learning_rate=0.01, iterations=10000
):
    # initialize the weights
    W = np.zeros(X_train.shape[1])
    m = len(Y_train)
    for i in range(iterations):
        predictions = sigmoid(np.dot(X_train, W))
        updates = np.dot(X_train.T, predictions - Y_train) / m
        W -= learning_rate * updates
    # compute accuracy on the test set
    predictions = predict(X_test, W)
    accuracy = np.mean(predictions == Y_test)
    print("Accuracy:", accuracy)
    # plot the data and the decision boundary
    plot_data_boundary(X_test[:, 1:], Y_test, W)  # pass the features without the bias column
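In vector form, each pass of the loop performs the update

$$w \leftarrow w - \eta \cdot \frac{1}{m} X^T\left(\sigma(Xw) - Y\right),$$

where $\eta$ is the learning rate and $X$ is the $m \times n$ design matrix including the bias column.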
def plot_data_boundary(x, y, W):
    plt.figure(figsize=(8, 6))
    plt.scatter(x[y == 1, 0], x[y == 1, 1], color="red", label="Class 1", alpha=0.5)
    plt.scatter(x[y == 0, 0], x[y == 0, 1], color="blue", label="Class 0", alpha=0.5)
    # draw the decision boundary W0 + W1*x1 + W2*x2 = 0
    plot_x = np.linspace(x[:, 0].min() - 1, x[:, 0].max() + 1, 100)
    plot_y = -(W[0] + W[1] * plot_x) / W[2]
    plt.plot(plot_x, plot_y, label="Decision Boundary")
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.title("Data Visualization with Decision Boundary")
    plt.legend()
    plt.grid(True)
    plt.savefig("ml_lab2/data_with_boundary.png")
    plt.show()
With the naive Bayes assumption satisfied, the test-set accuracy is 1.0 (figure below).
With the naive Bayes assumption violated, the test-set accuracy is 0.985 (figure below).
2.1.2 With the Penalty Term
First use the validation set to find a suitable regularization parameter.
def find_best_lambda(X_train, Y_train, X_val, Y_val, learning_rate=0.01, iterations=10000):
    # candidate lambdas are exp(idx) for 50 evenly spaced exponents in [-30, 5]
    idx = np.linspace(-30, 5)
    lambda_candidates = np.exp(idx)
    losses = []
    for lambd in lambda_candidates:
        W = np.zeros(X_train.shape[1])
        m = len(Y_train)
        for i in range(iterations):
            predictions = sigmoid(np.dot(X_train, W))
            updates = np.dot(X_train.T, predictions - Y_train) / m
            updates += lambd * W / m
            W -= learning_rate * updates
        # loss on the validation set (uses the regularized compute_loss defined below)
        loss = compute_loss(X_val, Y_val, W, lambd)
        losses.append(loss)
    best_lambda = lambda_candidates[np.argmin(losses)]
    best_loss = min(losses)
    plt.figure(figsize=(10, 6))
    plt.plot(idx, losses, marker='o')
    plt.xlabel('ln(lambda)')  # the x axis is the exponent, not lambda itself
    plt.ylabel('Validation Loss')
    plt.title('Validation Loss vs. ln(Lambda)')
    plt.axvline(idx[np.argmin(losses)], color='r', linestyle='--', label='Best Lambda')
    plt.legend()
    plt.show()
    return best_lambda, best_loss
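A usage sketch (the main script in section 2.3 calls it the same way, just commented out):

best_lambda, best_loss = find_best_lambda(X_train, Y_train, X_val, Y_val)
print(best_lambda, best_loss)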
Choose $\lambda = \exp(-30)$, and add the regularization term to the loss computation and the parameter update:
def compute_loss(X, Y, W, lambd=0):
    m = len(Y)
    predictions = sigmoid(np.dot(X, W))
    loss = -np.sum(Y * np.log(predictions) + (1 - Y) * np.log(1 - predictions)) / m
    loss += lambd * np.sum(W**2) / (2 * m)  # L2 penalty
    return loss

def gradient_descent(
    X_train, Y_train, X_test, Y_test, learning_rate=0.01, iterations=10000, lambd=0
):
    # initialize the weights
    W = np.zeros(X_train.shape[1])
    m = len(Y_train)
    for i in range(iterations):
        predictions = sigmoid(np.dot(X_train, W))
        updates = np.dot(X_train.T, predictions - Y_train) / m
        updates += lambd * W / m  # gradient of the L2 penalty
        W -= learning_rate * updates
    # compute accuracy on the test set
    predictions = predict(X_test, W)
    accuracy = np.mean(predictions == Y_test)
    print("Accuracy:", accuracy)
    # plot the data and the decision boundary
    plot_data_boundary(X_test[:, 1:], Y_test, W)  # pass the features without the bias column
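With the penalty term, the update becomes

$$w \leftarrow w - \eta\left(\frac{1}{m} X^T(\sigma(Xw) - Y) + \frac{\lambda}{m}\, w\right),$$

which matches the extra lambd * W / m term in the code.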
With the naive Bayes assumption satisfied, the test-set accuracy is 0.998 (figure below).
With the naive Bayes assumption violated, the test-set accuracy is 0.992 (figure below).
2.2 Newton's Method
2.2.1 Derivation
Consider the unconstrained minimization problem $\min_{x} f(x)$.
Assume $f(x)$ has continuous second-order partial derivatives. If the value at the $k$-th iteration is $x^k$, then $f(x)$ can be expanded in a second-order Taylor series around $x^k$:
$$f(x) = f(x^k) + g_k^T (x - x^k) + \frac{1}{2} (x - x^k)^T H(x^k) (x - x^k)$$
where $g_k = \nabla f(x^k)$ is the gradient vector and $H(x)$ is the Hessian matrix:

$$H(x) = \left[ \frac{\partial^2 f}{\partial x_i \partial x_j} \right]_{n \times n}$$
At an extremum the gradient vanishes, $g_k = 0$; when $H(x)$ is positive definite there, the extremum of $f(x)$ is a minimum.
To find the point where the gradient vanishes, differentiate the expansion (writing $H_k = H(x^k)$):

$$\frac{df(x)}{dx} = g_k + H_k (x - x^k)$$
Setting this to zero at the extremum, $\frac{df(x)}{dx} = 0$, and solving for $x$ gives the Newton iteration:

$$x^{k+1} = x^k - H_k^{-1} g_k$$
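A one-step sanity check: for the quadratic $f(x) = x^2 - 2x$ we have $g = 2x - 2$ and $H = 2$, so from any starting point $x^0$

$$x^1 = x^0 - \frac{2x^0 - 2}{2} = 1,$$

the exact minimizer in a single step, because the second-order Taylor expansion of a quadratic is exact.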
Applied to our loss function, the iteration becomes:

$$w^{k+1} = w^k - \left(\frac{\partial^2 loss(w)}{\partial w \, \partial w^T}\right)^{-1} \frac{\partial loss(w)}{\partial w}$$
Our loss function is:

$${loss}(w) = \sum_{l} \left( -Y^l {w}^T {X}^l + \ln\left(1 + \exp({w}^T {X}^l)\right) \right) + \frac{\lambda}{2} {w}^T {w}$$
So we obtain:

$$\frac{\partial loss(w)}{\partial w} = -\sum_l X^l \left(Y^l - \sigma(w^T X^l)\right) + \lambda w$$

$$\frac{\partial^2 loss(w)}{\partial w \, \partial w^T} = \sum_l X^l (X^l)^T \, \sigma(w^T X^l)\, \sigma(-w^T X^l) + \lambda I$$

(using $\sigma(z)(1-\sigma(z)) = \sigma(z)\sigma(-z)$).
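Stacking the samples, both expressions can be written compactly in matrix form as

$$\frac{\partial loss(w)}{\partial w} = -X^T\left(Y - \sigma(Xw)\right) + \lambda w, \qquad \frac{\partial^2 loss(w)}{\partial w\,\partial w^T} = X^T S X + \lambda I,$$

where $X$ is the $m \times n$ design matrix and $S = \mathrm{diag}\left(\sigma(Xw) \odot (1 - \sigma(Xw))\right)$; a vectorized implementation would use this form (see the sketch after the class below).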
Here we wrap Newton's method in a class:
class LogisticRegressionNewton:
    def __init__(self, X, y, hyper=0.01, w_0=None, delta=1e-6):
        self.x = X
        self.y = y
        self.__m = len(y)          # number of samples
        self.__n = X.shape[1]      # number of features (including the bias column)
        self.hyper = hyper         # regularization strength lambda
        if w_0 is None:
            self.w_0 = np.zeros(self.__n)
        else:
            self.w_0 = w_0
        self.delta = delta         # convergence threshold on the gradient norm

    def __derivative(self, w):
        # gradient: -sum_l X^l (Y^l - sigma(w^T X^l)) + lambda * w
        result = np.zeros(self.__n)
        for i in range(self.__m):
            result += self.x[i] * (self.y[i] - sigmoid(w @ self.x[i]))
        return -1 * result + self.hyper * w

    def __second_derivative(self, w):
        # builds the Hessian sum_l X^l (X^l)^T s(1-s) + lambda*I and returns its pseudo-inverse
        ans = np.eye(self.__n) * self.hyper
        for i in range(self.__m):
            temp = sigmoid(w @ self.x[i])
            ans += self.x[i] * np.transpose([self.x[i]]) * temp * (1 - temp)
        return np.linalg.pinv(ans)

    def fit(self):
        w = self.w_0
        while True:
            gradient = self.__derivative(w)
            if np.linalg.norm(gradient) < self.delta:
                break
            hessian_inv = self.__second_derivative(w)
            w = w - hessian_inv @ gradient
        self.w_0 = w
        return w

    def predict(self, X):
        return sigmoid(np.dot(X, self.w_0)) >= 0.5
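The per-sample loops above are slow in Python; a vectorized sketch of a single Newton step using the matrix form derived earlier (newton_step is a hypothetical helper, assuming X of shape (m, n) and y of shape (m,)):

def newton_step(X, y, w, lambd=0.01):
    p = sigmoid(X @ w)                   # predicted probabilities, shape (m,)
    grad = -X.T @ (y - p) + lambd * w    # gradient of the regularized loss
    S = p * (1 - p)                      # diagonal of the weighting matrix
    H = X.T @ (X * S[:, None]) + lambd * np.eye(X.shape[1])  # Hessian
    return w - np.linalg.solve(H, grad)  # solve a linear system instead of inverting H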
2.2.2 Without the Penalty Term
With the naive Bayes assumption satisfied, the test-set accuracy is 0.998 (figure below).
With the naive Bayes assumption violated, the test-set accuracy is 0.987 (figure below).
2.2.3 With the Penalty Term
With the naive Bayes assumption satisfied, the test-set accuracy is 0.999 (figure below).
With the naive Bayes assumption violated, the test-set accuracy is 0.991 (figure below).
2.3 Full Code
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
def Data(N, naive_bayes=True, true_rate=0.5):
    number_true = np.ceil(N * true_rate).astype(np.int32)
    sigma = [0.2, 0.3]
    cov12 = 0.2
    miu_true = [0.9, 1.1]
    miu_false = [-1, -1.3]
    x = np.zeros((N, 2))
    y = np.zeros(N).astype(np.int32)
    if naive_bayes:
        x[:number_true, :] = np.random.multivariate_normal(
            mean=miu_true, cov=[[sigma[0], 0], [0, sigma[1]]], size=number_true
        )
        x[number_true:, :] = np.random.multivariate_normal(
            mean=miu_false, cov=[[sigma[0], 0], [0, sigma[1]]], size=N - number_true
        )
        y[:number_true] = 1
        y[number_true:] = 0
    else:
        x[:number_true, :] = np.random.multivariate_normal(
            mean=miu_true, cov=[[sigma[0], cov12], [cov12, sigma[1]]], size=number_true
        )
        x[number_true:, :] = np.random.multivariate_normal(
            mean=miu_false,
            cov=[[sigma[0], cov12], [cov12, sigma[1]]],
            size=N - number_true,
        )
        y[:number_true] = 1
        y[number_true:] = 0
    return x, y
def split_data(X, Y, train_size=0.6, val_size=0.2, test_size=0.2):
    # split the dataset into training, validation, and test sets
    X_train, X_temp, Y_train, Y_temp = train_test_split(X, Y, train_size=train_size)
    X_val, X_test, Y_val, Y_test = train_test_split(
        X_temp, Y_temp, test_size=test_size / (val_size + test_size)
    )
    return X_train, X_val, X_test, Y_train, Y_val, Y_test
def plot_data(x, y):
    plt.figure(figsize=(8, 6))
    plt.scatter(x[y == 1, 0], x[y == 1, 1], color="red", label="Class 1", alpha=0.5)
    plt.scatter(x[y == 0, 0], x[y == 0, 1], color="blue", label="Class 0", alpha=0.5)
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.title("Data Visualization")
    plt.legend()
    plt.grid(True)
    plt.savefig("ml_lab2/data.png")
    plt.show()
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def predict(X, W):
    return sigmoid(np.dot(X, W)) >= 0.5
def compute_loss(X, Y, W, lambd=0):
    m = len(Y)
    predictions = sigmoid(np.dot(X, W))
    loss = -np.sum(Y * np.log(predictions) + (1 - Y) * np.log(1 - predictions)) / m
    loss += lambd * np.sum(W**2) / (2 * m)
    return loss
def gradient_descent(
    X_train, Y_train, X_test, Y_test, learning_rate=0.01, iterations=10000, lambd=0
):
    # initialize the weights
    W = np.zeros(X_train.shape[1])
    m = len(Y_train)
    for i in range(iterations):
        predictions = sigmoid(np.dot(X_train, W))
        updates = np.dot(X_train.T, predictions - Y_train) / m
        updates += lambd * W / m
        W -= learning_rate * updates
    # compute accuracy on the test set
    predictions = predict(X_test, W)
    accuracy = np.mean(predictions == Y_test)
    print("Accuracy:", accuracy)
    # plot the data and the decision boundary
    plot_data_boundary(X_test[:, 1:], Y_test, W)  # pass the features without the bias column
def find_best_lambda(
    X_train, Y_train, X_val, Y_val, learning_rate=0.01, iterations=10000
):
    idx = np.linspace(-30, 5)
    lambda_candidates = np.exp(idx)
    losses = []
    for lambd in lambda_candidates:
        W = np.zeros(X_train.shape[1])
        m = len(Y_train)
        for i in range(iterations):
            predictions = sigmoid(np.dot(X_train, W))
            updates = np.dot(X_train.T, predictions - Y_train) / m
            updates += lambd * W / m
            W -= learning_rate * updates
        # loss on the validation set
        loss = compute_loss(X_val, Y_val, W, lambd)
        losses.append(loss)
    best_lambda = lambda_candidates[np.argmin(losses)]
    best_loss = min(losses)
    plt.figure(figsize=(10, 6))
    plt.plot(idx, losses, marker="o")
    plt.xlabel("ln(lambda)")  # the x axis is the exponent, not lambda itself
    plt.ylabel("Validation Loss")
    plt.title("Validation Loss vs. ln(Lambda)")
    plt.axvline(idx[np.argmin(losses)], color="r", linestyle="--", label="Best Lambda")
    plt.legend()
    plt.show()
    return best_lambda, best_loss
def plot_data_boundary(x, y, W):
    plt.figure(figsize=(8, 6))
    plt.scatter(x[y == 1, 0], x[y == 1, 1], color="red", label="Class 1", alpha=0.5)
    plt.scatter(x[y == 0, 0], x[y == 0, 1], color="blue", label="Class 0", alpha=0.5)
    # draw the decision boundary
    plot_x = np.linspace(x[:, 0].min() - 1, x[:, 0].max() + 1, 1000)
    plot_y = -(W[0] + W[1] * plot_x) / W[2]
    plt.plot(plot_x, plot_y, label="Decision Boundary")
    plt.xlabel("Feature 1")
    plt.ylabel("Feature 2")
    plt.title("Data Visualization with Decision Boundary")
    plt.legend()
    plt.grid(True)
    plt.savefig("ml_lab2/data_with_boundary.png")
    plt.show()
class LogisticRegressionNewton:
    def __init__(self, X, y, hyper=0.01, w_0=None, delta=1e-6):
        self.x = X
        self.y = y
        self.__m = len(y)
        self.__n = X.shape[1]
        self.hyper = hyper
        if w_0 is None:
            self.w_0 = np.zeros(self.__n)
        else:
            self.w_0 = w_0
        self.delta = delta

    def __derivative(self, w):
        result = np.zeros(self.__n)
        for i in range(self.__m):
            result += self.x[i] * (self.y[i] - sigmoid(w @ self.x[i]))
        return -1 * result + self.hyper * w

    def __second_derivative(self, w):
        ans = np.eye(self.__n) * self.hyper
        for i in range(self.__m):
            temp = sigmoid(w @ self.x[i])
            ans += self.x[i] * np.transpose([self.x[i]]) * temp * (1 - temp)
        return np.linalg.pinv(ans)

    def fit(self):
        w = self.w_0
        while True:
            gradient = self.__derivative(w)
            if np.linalg.norm(gradient) < self.delta:
                break
            hessian_inv = self.__second_derivative(w)
            w = w - hessian_inv @ gradient
        self.w_0 = w
        return w

    def predict(self, X):
        return sigmoid(np.dot(X, self.w_0)) >= 0.5
if __name__ == "__main__":
    N = 5000  # number of data points
    naive_bayes = True  # whether the naive Bayes assumption should hold
    true_rate = 0.5  # fraction of positive samples
    X, Y = Data(N, naive_bayes, true_rate)
    # add the bias term
    X = np.hstack((np.ones((N, 1)), X))  # prepend a column of ones
    # split the dataset
    X_train, X_val, X_test, Y_train, Y_val, Y_test = split_data(X, Y)
    # search for the best lambda
    # best_lambda, best_loss = find_best_lambda(X_train, Y_train, X_val, Y_val)
    # run gradient descent (lambd must be passed by keyword)
    # gradient_descent(X_train, Y_train, X_test, Y_test, lambd=np.exp(-30))
    # create a logistic regression model instance
    model = LogisticRegressionNewton(X_train, Y_train)
    # train the model
    W = model.fit()
    plot_data_boundary(X_test[:, 1:], Y_test, W)
    # accuracy on the test set
    predictions = model.predict(X_test)
    accuracy = np.mean(predictions == Y_test)
    print("Test Accuracy:", accuracy)
3. Application to the UCI Dataset
3.1 Dataset Introduction
The UCI Adult Income dataset is widely used in machine learning and data mining, particularly for classification and predictive-modeling tasks. The goal is to predict whether an adult's annual income exceeds $50,000, based on the 1994 census database. This is a classic binary classification problem: individuals with income above 50K are labeled as the positive class (1), and those with income at most 50K as the negative class (0).
Key characteristics of the dataset:
Source: the 1994 US census database.
Features: the dataset contains 14 features:
age
workclass
fnlwgt (census sampling weight)
education
education-num
marital-status
occupation
relationship
race
sex
capital-gain
capital-loss
hours-per-week
native-country
Target: the income variable, with two classes:
income above 50K (>50K)
income at most 50K (≤50K)
Size: roughly 32,000 samples (individuals).
The UCI Adult Income dataset is a standard benchmark in machine learning: a relatively simple yet realistic problem for evaluating and comparing algorithm performance.
3.2 UCI Code
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# sigmoid with clipping to prevent overflow
def sigmoid(x):
    x = np.clip(x, -500, 500)  # bound the input to avoid overflow in np.exp
    return 1 / (1 + np.exp(-x))
# loss function, with an epsilon to keep the logarithms finite
def compute_loss(X, Y, W, lambd=0, epsilon=1e-10):
    m = len(Y)
    predictions = sigmoid(np.dot(X, W))
    predictions = np.clip(predictions, epsilon, 1 - epsilon)  # keep predictions away from 0 and 1
    loss = -np.sum(Y * np.log(predictions) + (1 - Y) * np.log(1 - predictions)) / m
    loss += lambd * np.sum(W**2) / (2 * m)
    return loss
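An alternative sketch that needs no clipping at all, using the identity $-y\ln p - (1-y)\ln(1-p) = \ln(1+e^z) - yz$ from section 2 (compute_loss_stable is a hypothetical name):

def compute_loss_stable(X, Y, W, lambd=0):
    z = np.dot(X, W)
    m = len(Y)
    loss = np.mean(np.logaddexp(0, z) - Y * z)  # numerically stable ln(1 + e^z) - y*z
    loss += lambd * np.sum(W**2) / (2 * m)
    return loss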
# gradient descent
def gradient_descent(X_train, Y_train, learning_rate=0.01, iterations=10000, lambd=0):
    W = np.zeros(X_train.shape[1])
    m = len(Y_train)
    for i in range(iterations):
        predictions = sigmoid(np.dot(X_train, W))
        predictions = np.clip(predictions, 1e-10, 1 - 1e-10)  # same clipping as in compute_loss
        gradient = np.dot(X_train.T, (predictions - Y_train)) / m
        gradient += lambd * W / m
        W -= learning_rate * gradient
    return W
# prediction function
def predict(X, W):
    return sigmoid(np.dot(X, W)) >= 0.5

# accuracy helper
def accuracy(Y, predictions):
    return np.mean(predictions == Y)
# load the dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
column_names = [
"age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
"occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
"hours-per-week", "native-country", "income"
]
data = pd.read_csv(url, names=column_names, sep=',\s', na_values=["?"], engine='python')
# Preprocessing
# drop rows with missing values first (the '?' entries became NaN on load)
data = data.dropna()
# extract the label BEFORE any encoding; with sep=',\s' the income strings are
# '>50K' / '<=50K' (encoding income first would make this comparison always False)
Y = (data['income'] == '>50K').astype(int).values
# encode the categorical feature columns as integers
label_encoders = {}
for column in data.drop('income', axis=1).columns:
    if data[column].dtype == object:
        le = LabelEncoder()
        data[column] = le.fit_transform(data[column])
        label_encoders[column] = le
# define the features and add the bias term
X = data.drop('income', axis=1).values.astype(float)
X = np.hstack((np.ones((X.shape[0], 1)), X))
# split into training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
# run gradient descent
W = gradient_descent(X_train, Y_train, learning_rate=0.01, iterations=10000, lambd=0.01)
# predict on the test set
predictions = predict(X_test, W)
# compute accuracy (use a new name so the accuracy function is not shadowed)
acc = accuracy(Y_test, predictions)
print("Test Accuracy:", acc)
# loss on the test set
loss = compute_loss(X_test, Y_test, W, lambd=0.01)
print("Test Loss:", loss)
Running the original script printed:
Test Accuracy: 1.0
Test Loss: 0.0021162482132220067
This perfect score is an artifact rather than a real result: income had already been label-encoded when the string comparison ran, so Y was all zeros and the classifier trivially predicted the negative class for every sample. Extracting the label before encoding, as above, is required for a meaningful evaluation.
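A final practical note: the raw Adult features differ in scale by orders of magnitude (fnlwgt versus age, for example), which makes plain gradient descent converge slowly. A standardization sketch, applied after the train/test split above (an assumption, not part of the original script):

mean = X_train[:, 1:].mean(axis=0)
std = X_train[:, 1:].std(axis=0) + 1e-12  # guard against constant columns
X_train[:, 1:] = (X_train[:, 1:] - mean) / std
X_test[:, 1:] = (X_test[:, 1:] - mean) / std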