import numpy as np
# Two-layer fully connected network (linear -> ReLU -> linear) trained with
# plain gradient descent on random data, using hand-derived gradients.
#
# N: batch size, D_in: input width, H: hidden width, D_out: output width.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and regression targets.
x = np.random.randn(N, D_in)   # (N, D_in)
y = np.random.randn(N, D_out)  # (N, D_out)

# Randomly initialised weights of the two linear layers (no bias terms).
w1 = np.random.randn(D_in, H)   # (D_in, H)
w2 = np.random.randn(H, D_out)  # (H, D_out)

lr = 1e-6  # gradient-descent step size

for i in range(500):
    # Forward pass.
    hidden_act = x.dot(w1)                   # (N, H) pre-activation
    hidden_relu = np.maximum(0, hidden_act)  # (N, H) ReLU output
    out_layer = hidden_relu.dot(w2)          # (N, D_out) prediction

    # Loss: sum of squared errors over every element of the batch.
    loss = np.square(y - out_layer).sum()
    print(i, loss)

    # Backward pass: apply the chain rule from the loss back to each weight.
    grad_out_layer = 2 * (out_layer - y)          # (N, D_out) dLoss/d out_layer
    grad_w2 = hidden_relu.T.dot(grad_out_layer)   # (H, D_out)
    grad_hidden_relu = grad_out_layer.dot(w2.T)   # (N, H)
    # ReLU passes the gradient only where its input was not negative; copy
    # first so grad_hidden_relu itself is left untouched by the masking.
    grad_h = grad_hidden_relu.copy()
    grad_h[hidden_act < 0] = 0
    grad_w1 = x.T.dot(grad_h)                     # (D_in, H)

    # Gradient-descent update. Both gradients were computed from the
    # pre-update weights above, so the update order does not matter.
    w1 -= lr * grad_w1
    w2 -= lr * grad_w2