hw0
Question 1: A basic add function, and testing/autograding basics
def add(x, y):
    return x + y
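Since this question is mainly about the testing/autograding workflow, a tiny pytest-style check is enough to see it in action (the test name and values below are illustrative, not the actual hw0 test file):

def test_add():
    assert add(1, 2) == 3
    assert add(-1.5, 2.5) == 1.0   # -1.5 + 2.5 is exactly representable, so == is safe here

If the repo is laid out like the handout, something along the lines of python -m pytest -k "add" should pick this up.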
Question 2: Loading MNIST data
import gzip
import struct
import numpy as np

def parse_mnist(image_filename, label_filename):
    with gzip.open(image_filename, 'rb') as f:
        # header: magic, number of images, rows, columns (all big-endian uint32)
        img_magic, img_num, img_h, img_w = struct.unpack('>IIII', f.read(16))
        imgs = np.frombuffer(f.read(img_num * img_h * img_w), dtype=np.uint8).reshape(img_num, img_h*img_w).astype(np.float32)/255
    with gzip.open(label_filename, 'rb') as f:
        labels_magic, labels_num = struct.unpack('>II', f.read(8))
        labels = np.frombuffer(f.read(labels_num), dtype=np.uint8)
    return imgs, labels
Note:
- gzip.open : opens a file with a .gz suffix and reads it as raw bytes.
- struct.unpack : each I is a 4-byte unsigned integer, so four I's read 16 bytes; > means big-endian.
- np.frombuffer : reads the requested number of bytes from a buffer into an array.
- mnist file format :
  - image : the first 16 bytes, taken 4 bytes at a time, are the magic number, the total number of images, and the number of rows and columns of each image; the remaining bytes are the pixel data.
  - label : likewise, the first 8 bytes are the magic number and the total number of labels; the remaining bytes are the label data.
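As a quick sanity check of the header layout just described, one can read only the first 16 bytes of the image file (the path below is a placeholder for wherever the .gz files live):

import gzip, struct

with gzip.open("data/train-images-idx3-ubyte.gz", "rb") as f:   # placeholder path
    magic, num, rows, cols = struct.unpack(">IIII", f.read(16))
print(magic, num, rows, cols)   # for the MNIST training images: 2051 60000 28 28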
Question 3: Softmax loss
First, define a softmax helper outside the loss function:

def softmax(x):
    return np.exp(x) / np.sum(np.exp(x), axis=1, keepdims=True)

def softmax_loss(Z, y):
    # np.indices(y.shape)[0] is simply np.arange(len(y)): one row index per sample
    return np.mean(-np.log(softmax(Z)[np.indices(y.shape)[0], y]))
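One caveat with this plain softmax: np.exp overflows for large logits. A common variant (a sketch, mathematically equivalent) subtracts the per-row maximum first:

def softmax_stable(x):
    x = x - np.max(x, axis=1, keepdims=True)   # shifting by a constant does not change the softmax value
    return np.exp(x) / np.sum(np.exp(x), axis=1, keepdims=True)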
Question 4: Stochastic gradient descent for softmax regression
def softmax_regression_epoch(X, y, theta, lr=0.1, batch=100):
    # X: m x n, theta: n x k, y: m (class labels)
    # x: batch x n, y_hat: batch
    # Z: batch x k
    for i in range(X.shape[0]//batch):      # split the data into m//batch minibatches
        x = X[i*batch : (i+1)*batch]        # take the next batch rows
        y_hat = y[i*batch : (i+1)*batch]
        Z = softmax(np.matmul(x, theta))
        Z[np.arange(batch), y_hat] -= 1     # subtract 1 at each row's true-class column, i.e. Z - I_y
        grad = np.matmul(x.transpose(), Z) / batch
        theta -= lr * grad
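A rough usage sketch, assuming the MNIST files from Question 2 sit under a data/ directory (the paths and hyperparameters below are just placeholders):

X_tr, y_tr = parse_mnist("data/train-images-idx3-ubyte.gz",
                         "data/train-labels-idx1-ubyte.gz")
theta = np.zeros((X_tr.shape[1], y_tr.max() + 1), dtype=np.float32)
for epoch in range(10):
    softmax_regression_epoch(X_tr, y_tr, theta, lr=0.1, batch=100)
    print(softmax_loss(X_tr @ theta, y_tr))   # training loss should keep dropping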
Question 5: SGD for a two-layer neural network
def nn_epoch(X, y, W1, W2, lr=0.1, batch=100):
    # X: m x n, W1: n x d, W2: d x k, y: m (class labels)
    for i in range(X.shape[0]//batch):
        x = X[i*batch : (i+1)*batch]
        y_hat = y[i*batch : (i+1)*batch]
        z = np.matmul(x, W1)
        np.maximum(0, z, z)                      # in-place ReLU: z = ReLU(x @ W1)
        G2 = softmax(np.matmul(z, W2))
        G2[np.arange(batch), y_hat] -= 1         # G2 = softmax(z @ W2) - I_y
        G1 = np.multiply(np.where(z > 0, 1, 0), G2 @ W2.transpose())   # ReLU mask * (G2 @ W2^T)
        W1 -= lr/batch * (x.transpose() @ G1)
        W2 -= lr/batch * (z.transpose() @ G2)
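To convince yourself that G1 and G2 above really are the gradients, a small finite-difference check on a toy problem works (a sketch; the sizes and seed are arbitrary, and it reuses softmax/softmax_loss from Question 3):

np.random.seed(0)
m, n, d, k = 6, 5, 4, 3                      # samples, input dim, hidden dim, classes
X = np.random.randn(m, n)
y = np.random.randint(0, k, size=m)
W1 = np.random.randn(n, d)
W2 = np.random.randn(d, k)

def loss(W1, W2):
    return softmax_loss(np.maximum(X @ W1, 0) @ W2, y)

# analytic gradient of the loss w.r.t. W1, using the same formulas as the loop above
z = np.maximum(X @ W1, 0)
G2 = softmax(z @ W2)
G2[np.arange(m), y] -= 1
G1 = np.where(z > 0, 1, 0) * (G2 @ W2.T)
grad_W1 = X.T @ G1 / m

# numerical gradient for a single entry of W1
eps = 1e-6
W1p = W1.copy(); W1p[0, 0] += eps
W1m = W1.copy(); W1m[0, 0] -= eps
print(grad_W1[0, 0], (loss(W1p, W2) - loss(W1m, W2)) / (2 * eps))   # should roughly agree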
Question 6: Softmax regression in C++
// Z (m x k) = X (m x n) * Y (n x k); all matrices stored row-major in flat arrays
void mat_mul(const float* X, const float* Y, float* Z, size_t m, size_t n, size_t k)
{
    for(size_t i=0; i<m; ++i)
    {
        for(size_t j=0; j<k; ++j)
        {
            Z[i*k+j] = 0;
            for(size_t l=0; l<n; ++l)
                Z[i*k+j] += X[i*n+l]*Y[l*k+j];
        }
    }
}
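The triple loop is just a row-major matrix multiply on flat arrays. The same indexing written out in Python/NumPy (a throwaway sketch, only for checking the index arithmetic) matches np.matmul:

import numpy as np

def mat_mul_flat(X, Y, m, n, k):
    Z = [0.0] * (m * k)
    for i in range(m):
        for j in range(k):
            for l in range(n):
                Z[i*k + j] += X[i*n + l] * Y[l*k + j]   # same indexing as the C++ version
    return np.array(Z).reshape(m, k)

A, B = np.random.randn(2, 3), np.random.randn(3, 4)
print(np.allclose(mat_mul_flat(A.ravel(), B.ravel(), 2, 3, 4), A @ B))   # True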
// Inside softmax_regression_epoch_cpp(X, y, theta, m, n, k, lr, batch):
// X: m x n, y: m, theta: n x k
for(size_t b=0; b<m/batch; ++b)   // m/batch full minibatches; any remainder is dropped, as in the NumPy version
{
    float* Z = new float[batch*k];
    const float* x = &X[b*batch*n];                      // current minibatch, batch x n
    mat_mul(x, theta, Z, batch, n, k);                   // Z = x * theta
    for(size_t i=0; i<batch*k; ++i) Z[i]=exp(Z[i]);      // exp(Z)
    for(size_t i=0; i<batch; ++i)
    {
        float sum = 0;
        for(size_t j=0; j<k; ++j)
            sum += Z[i*k+j];
        for(size_t j=0; j<k; ++j)
            Z[i*k+j] /= sum;                             // normalize each row: softmax
    }
    for(size_t i=0; i<batch; ++i)
        Z[i*k+y[b*batch+i]] -= 1;                        // Z - I_y
    float* grad = new float[n*k];
    float* x_T = new float[n*batch];
    for(size_t i=0; i<batch; ++i)                        // transpose the minibatch
        for(size_t j=0; j<n; ++j)
            x_T[j*batch+i] = x[i*n+j];
    mat_mul(x_T, Z, grad, n, batch, k);                  // grad = x^T * (Z - I_y)
    for(size_t i=0; i<n*k; ++i)
        theta[i] -= lr/batch * grad[i];
    delete[] Z;
    delete[] x_T;
    delete[] grad;
}
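If the C++ routine is compiled into a Python extension (hw0 wraps it with pybind11; the module name, function name, and exposed signature below are assumptions from the handout and may differ), a quick cross-check against the NumPy version from Question 4 looks like this:

import numpy as np
from simple_ml_ext import softmax_regression_epoch_cpp   # assumed module/function name

np.random.seed(0)
X = np.random.randn(500, 10).astype(np.float32)
y = np.random.randint(0, 3, size=500).astype(np.uint8)
theta_np = np.zeros((10, 3), dtype=np.float32)
theta_cpp = np.zeros((10, 3), dtype=np.float32)

softmax_regression_epoch(X, y, theta_np, lr=1.0, batch=100)
softmax_regression_epoch_cpp(X, y, theta_cpp, lr=1.0, batch=100)   # assumed Python-side signature
print(np.allclose(theta_np, theta_cpp, atol=1e-5))   # expect True if both epochs agree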
ps: finished.