import torch

# Batch size, input dimension, hidden-layer width, output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)
# Two-layer fully connected network; assumed stand-in for the
# undefined `dense` helper in the original snippet.
model = torch.nn.Sequential(
    torch.nn.Linear(D_in, H),
    torch.nn.ReLU(),
    torch.nn.Linear(H, D_out),
)
learning_rate = 1e-4
optim = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Mean squared error, summed over all elements rather than averaged.
loss_fn = torch.nn.MSELoss(reduction='sum')
for t in range(500):
    # Forward pass: compute predicted y for the whole batch.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)  # MSELoss takes (input, target)

    # Print the loss every 100 iterations.
    if t % 100 == 99:
        print(t, loss.item())

    # Clear stale gradients, backpropagate, and update the weights.
    optim.zero_grad()
    loss.backward()
    optim.step()