Homework 2, NTU Hung-yi Lee: Classification (1)

First, here are the homework 2 instructions (PDF) and the TA's sample code that I found.
Link: https://pan.baidu.com/s/1VoWKz2cOTrVFnQkSQjIrKA
Extraction code: njz3
The .ipynb file is easiest to read when opened in Jupyter Notebook.
The following link contains the dataset used in this homework:
Link: https://pan.baidu.com/s/1HpXeECc1ay_a2FP76SaX4w
Extraction code: mneu
Finally, here is the TA's video explaining the requirements for this homework: https://www.bilibili.com/video/av837629579?p=2

Overview

First, the TA's sample code in full;
then my own program, explained section by section and summarized.

TA's sample code

import numpy as np

np.random.seed(0)
X_train_fpath = '../data/hw2/X_train'
Y_train_fpath = '../data/hw2/Y_train'
# X_test_fpath = './data/X_test'
# output_fpath = './output_{}.csv'

# Parse csv files to numpy array
with open(X_train_fpath) as f:
	next(f)
	X_train = np.array([line.strip('\n').split(',')[1:] for line in f], dtype=float)
with open(Y_train_fpath) as f:
	next(f)
	Y_train = np.array([line.strip('\n').split(',')[1] for line in f], dtype=float)
# with open(X_test_fpath) as f:
# 	next(f)
# 	X_test = np.array([line.strip('\n').split(',')[1:] for line in f], dtype=float)


def _normalize(X, train=True, specified_column=None, X_mean=None, X_std=None):
	# This function normalizes specific columns of X.
	# The mean and standard deviation of the training data will be reused when processing the testing data.
	#
	# Arguments:
	#     X: data to be processed
	#     train: 'True' when processing training data, 'False' for testing data
	#     specified_column: indexes of the columns that will be normalized. If 'None', all columns
	#         will be normalized.
	#     X_mean: mean value of training data, used when train = 'False'
	#     X_std: standard deviation of training data, used when train = 'False'
	# Outputs:
	#     X: normalized data
	#     X_mean: computed mean value of training data
	#     X_std: computed standard deviation of training data

	if specified_column is None:
		specified_column = np.arange(X.shape[1])
	if train:
		X_mean = np.mean(X[:, specified_column], 0).reshape(1, -1)
		X_std = np.std(X[:, specified_column], 0).reshape(1, -1)

	X[:, specified_column] = (X[:, specified_column] - X_mean) / (X_std + 1e-8)

	return X, X_mean, X_std


def _shuffle(X, Y):
	# This function shuffles two equal-length list/array, X and Y, together.
	randomize = np.arange(len(X))
	np.random.shuffle(randomize)
	return (X[randomize], Y[randomize])


def _train_dev_split(X, Y, dev_ratio=0.25):
	# This function splits data into a training set and a development set.
	X, Y = _shuffle(X, Y)
	train_size = int(len(X) * (1 - dev_ratio))
	return X[:train_size], Y[:train_size], X[train_size:], Y[train_size:]


# Normalize training and testing data
X_train, X_mean, X_std = _normalize(X_train, train=True)
# X_test, _, _ = _normalize(X_test, train=False, specified_column=None, X_mean=X_mean, X_std=X_std)

# Split data into training set and development set
dev_ratio = 0.1
X_train, Y_train, X_dev, Y_dev = _train_dev_split(X_train, Y_train, dev_ratio=dev_ratio)

train_size = X_train.shape[0]
dev_size = X_dev.shape[0]
# test_size = X_test.shape[0]
data_dim = X_train.shape[1]
print('Size of training set: {}'.format(train_size))
print('Size of development set: {}'.format(dev_size))
# print('Size of testing set: {}'.format(test_size))

def _sigmoid(z):
	# Sigmoid function can be used to calculate probability.
	# To avoid overflow, minimum/maximum output value is set.
	return np.clip(1 / (1.0 + np.exp(-z)), 1e-8, 1 - (1e-8))


print('Dimension of data: {}'.format(data_dim))


def _f(X, w, b):
	# This is the logistic regression function, parameterized by w and b
	#
	#     Arguments:
	#     X: input data, shape = [batch_size, data_dimension]
	#     w: weight vector, shape = [data_dimension, ]
	#     b: bias, scalar
	# Output:
	#     predicted probability of each row of X being positively labeled, shape = [batch_size, ]
	return _sigmoid(np.matmul(X, w) + b)


def _predict(X, w, b):
	# This function returns a truth value prediction for each row of X
	# by rounding the result of logistic regression function.
	return np.round(_f(X, w, b)).astype(int)


def _accuracy(Y_pred, Y_label):
	# This function calculates prediction accuracy
	acc = 1 - np.mean(np.abs(Y_pred - Y_label))
	return acc

def _cross_entropy_loss(y_pred, Y_label):
    # This function computes the cross entropy.
    #
    #     Arguments:
    #     y_pred: probabilistic predictions, float vector
    #     Y_label: ground truth labels, bool vector
    # Output:
    #     cross entropy, scalar
    cross_entropy = -np.dot(Y_label, np.log(y_pred)) - np.dot((1 - Y_label), np.log(1 - y_pred))
    return cross_entropy

def _gradient(X, Y_label, w, b):
    # This function computes the gradient of cross entropy loss with respect to weight w and bias b.
    y_pred = _f(X, w, b)
    pred_error = Y_label - y_pred
    w_grad = -np.sum(pred_error * X.T, 1)
    b_grad = -np.sum(pred_error)
    return w_grad, b_grad


# Zero initialization for weights and bias
w = np.zeros((data_dim,))
b = np.zeros((1,))

# Some parameters for training
max_iter = 10
batch_size = 8
learning_rate = 0.2

# Keep the loss and accuracy at every iteration for plotting
train_loss = []
dev_loss = []
train_acc = []
dev_acc = []

# Count the number of parameter updates
step = 1

# Iterative training
for epoch in range(max_iter):
	# Random shuffle at the beginning of each epoch
	X_train, Y_train = _shuffle(X_train, Y_train)

	# Mini-batch training
	for idx in range(int(np.floor(train_size / batch_size))):
		X = X_train[idx * batch_size:(idx + 1) * batch_size]
		Y = Y_train[idx * batch_size:(idx + 1) * batch_size]

		# Compute the gradient
		w_grad, b_grad = _gradient(X, Y, w, b)

		# gradient descent update
		# learning rate decay with time
		w = w - learning_rate / np.sqrt(step) * w_grad
		b = b - learning_rate / np.sqrt(step) * b_grad

		step = step + 1

	# Compute loss and accuracy of training set and development set
	y_train_pred = _f(X_train, w, b)
	Y_train_pred = np.round(y_train_pred)
	train_acc.append(_accuracy(Y_train_pred, Y_train))
	train_loss.append(_cross_entropy_loss(y_train_pred, Y_train) / train_size)

	y_dev_pred = _f(X_dev, w, b)
	Y_dev_pred = np.round(y_dev_pred)
	dev_acc.append(_accuracy(Y_dev_pred, Y_dev))
	dev_loss.append(_cross_entropy_loss(y_dev_pred, Y_dev) / dev_size)

print('Training loss: {}'.format(train_loss[-1]))
print('Development loss: {}'.format(dev_loss[-1]))
print('Training accuracy: {}'.format(train_acc[-1]))
print('Development accuracy: {}'.format(dev_acc[-1]))


import matplotlib.pyplot as plt

# Loss curve
plt.ion()
plt.figure()
plt.plot(train_loss)
plt.plot(dev_loss)
plt.title('Loss')
plt.legend(['train', 'dev'])
plt.savefig('loss.png')
plt.show()

# Accuracy curve
plt.figure()
plt.plot(train_acc)
plt.plot(dev_acc)
plt.title('Accuracy')
plt.legend(['train', 'dev'])
plt.savefig('acc.png')
plt.ioff()
plt.show()


# Size of training set: 48830
# Size of development set: 5426
# Dimension of data: 510
# Training loss: 0.27383248167901403
# Development loss: 0.2826270940454571
# Training accuracy: 0.8835142330534508
# Development accuracy: 0.8813122005160339

Finally, the training loss is 0.2738 and the development loss is 0.2826, which are quite close. The accuracies are also close: 88.35% and 88.13%, respectively.
However, reading the TA's code carefully, one thing stands out: the data is normalized as a whole and only afterwards split into training and development sets. This lets a little information about the development data leak into training. The split into training and test sets should be the very first step, before any other processing; otherwise the network's behavior on truly unseen data becomes harder to trust.
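A minimal sketch of the leak-free order, reusing the TA's _train_dev_split and _normalize defined above (the variable names are my own; this is essentially what my data_set function below does):

# Split first, so the development data never influences the normalization statistics
X_tr, Y_tr, X_dev, Y_dev = _train_dev_split(X_train, Y_train, dev_ratio=0.1)

# Normalize the training set, then reuse its mean and std for the development set
X_tr, X_mean, X_std = _normalize(X_tr, train=True)
X_dev, _, _ = _normalize(X_dev, train=False, X_mean=X_mean, X_std=X_std)
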
Below are the loss and accuracy curves for the two sets.
[Figure: loss curves, train vs. dev]
[Figure: accuracy curves, train vs. dev]

My own program

Loading the data

import numpy as np
import matplotlib.pyplot as plt
# import os

# np.random.seed(0)
X_train_fpath = '../data/hw2/X_train'
Y_train_fpath = '../data/hw2/Y_train'
# X_test_fpath = './data/X_test'
# output_fpath = './output_{}.csv'

with open(X_train_fpath) as f:
    next(f)
    X_train = np.array([line.strip('\n').split(',')[1:] for line in f], dtype=float)
with open(Y_train_fpath) as f:
    next(f)
    Y_train = np.array([line.strip('\n').split(',')[1] for line in f], dtype=float)

# print(X_train.shape)  # (54256, 510)
# print(Y_train.shape)  # (54256,); 11151 entries are 1, the rest are 0.
# print(X_train[10, 0], train_data[10, 0])
# print(sum(Y_train))

data_set

I wrote my own data_set function; it shuffles the original data before splitting it into training and test sets, so that the split is unbiased.

def _normalize(data_nl, train_nl=True, specified_column=None, data_mean_nl=None, data_std_nl=None):
    # This function normalizes specific columns of the data.
    # The mean and standard deviation of the training data will be reused when processing the testing data.
    #
    # Arguments:
    #     data_nl: data to be processed
    #     train_nl: 'True' when processing training data, 'False' for testing data
    #     specified_column: indexes of the columns that will be normalized. If 'None', all columns
    #         will be normalized.
    #     data_mean_nl: mean value of the training data, used when train_nl = 'False'
    #     data_std_nl: standard deviation of the training data, used when train_nl = 'False'
    # Outputs:
    #     data_nl: normalized data
    #     data_mean_nl: computed mean value of the training data
    #     data_std_nl: computed standard deviation of the training data

    if specified_column is None:
        specified_column = np.arange(data_nl.shape[1])
    if train_nl:
        data_mean_nl = np.mean(data_nl[:, specified_column], 0).reshape(1, -1)
        data_std_nl = np.std(data_nl[:, specified_column], 0).reshape(1, -1)

    data_nl[:, specified_column] = (data_nl[:, specified_column] - data_mean_nl) / (data_std_nl + 1e-8)

    return data_nl, data_mean_nl, data_std_nl


def train_dev_split(inputdata, inputlabels, dev_ratio_sp=0.2, permutation=True):
    # This function splits data into a training set and a development set.
    if permutation:
        per = np.random.permutation(inputdata.shape[0])
        data_new = inputdata[per, :]
        labels_new = inputlabels[per]
    else:
        data_new = inputdata
        labels_new = inputlabels
    train_size = int(len(data_new) * (1 - dev_ratio_sp))

    return data_new[:train_size], labels_new[:train_size], data_new[train_size:], labels_new[train_size:]


def data_set(data_ds, labels_ds, dev_ratio=0.1, permutation=True, is_print=True):
    # Split data into training set and public set,
    # this is the FIRST step. Ensure that the train data does not contain public data information
    data_train_ds, train_labels_ds, data_public_ds, public_labels_ds = \
        train_dev_split(data_ds, labels_ds, dev_ratio_sp=dev_ratio, permutation=permutation)

    # Normalize training and public data
    train_data_ds, train_mean_ds, train_std_ds = _normalize(data_train_ds, train_nl=True)
    public_data_ds, _, _ = \
        _normalize(data_public_ds, train_nl=False, data_mean_nl=train_mean_ds, data_std_nl=train_std_ds)

    # is_print
    if is_print:
        print('Size of training set: {}'.format(train_data_ds.shape[0]))
        print('Size of public set: {}'.format(public_data_ds.shape[0]))
        print('Dimension of data: {}'.format(train_data_ds.shape[1]))

    return train_data_ds, train_labels_ds, public_data_ds, public_labels_ds

train

Training uses mini-batch gradient descent. Because the gradient of the logistic-regression cross-entropy loss has the same (y - y') form as in linear regression, the training loop itself is fairly simple; the formulas are written out below.
The learning-rate schedule is where I differ from the TA: the TA shrinks the learning rate over time by dividing it by the square root of the update step, while I keep it fixed. With a well-chosen initial learning rate, the two schedules give very similar results.
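
For reference, these are the standard logistic-regression formulas that the gradient code implements (written in my own notation):

$$\hat{y} = \sigma(Xw + b), \qquad \nabla_w L = -X^{\top}(y - \hat{y}), \qquad \frac{\partial L}{\partial b} = -\sum_i (y_i - \hat{y}_i)$$

The TA's decayed update at step $t$ is $w \leftarrow w - \frac{\eta}{\sqrt{t}} \nabla_w L$; my version keeps $\eta$ constant.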

So far the combination learning_rate=0.0002, batch_size=10, epochs=10 works best. However, I also plotted the loss of every mini-batch, and no matter which parameters I choose this curve keeps oscillating; I do not fully understand why.
[Figure: per-mini-batch loss curves for two different batch sizes]
In the two figures above, increasing the batch size reduces the oscillation, which is understandable: the cross entropy is averaged over more samples at a time, so each value is naturally more stable. My question is: near the end of training, if the loss on small batches still oscillates this strongly, why does the model nevertheless perform well on the whole set, and can such a network be trusted?
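
One way to see the trend underneath this noise (not part of the TA's code; loss_all refers to the per-mini-batch loss array returned by my train function below) is to plot a moving average of the per-batch loss:

def moving_average(x, window=100):
    # Simple moving average, used only to visualize the noisy per-batch loss
    kernel = np.ones(window) / window
    return np.convolve(x, kernel, mode='valid')

# Example usage, assuming loss_all comes from train() as defined below:
# plt.plot(loss_all, alpha=0.3)
# plt.plot(moving_average(loss_all))
# plt.title('Per-batch loss and its moving average')
# plt.show()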

def _shuffle(inputdata, outputdata):
    # This function shuffles two equal-length list/array, X and Y, together.
    randomize = np.arange(len(inputdata))
    np.random.shuffle(randomize)
    return inputdata[randomize], outputdata[randomize]
    
def _sigmoid(z):
    # Sigmoid function can be used to calculate probability.
    # To avoid overflow, minimum/maximum output value is set.
    return np.clip(1 / (1.0 + np.exp(-z)), 1e-8, 1 - 1e-8)

def _accuracy(y_pred, y_label):
    # This function calculates prediction accuracy
    acc = 1 - np.mean(np.abs(y_pred - y_label))
    return acc


def _cross_entropy_loss(y_pred, y_label):
    # This function computes the cross entropy.
    #
    # Arguments:
    #     y_pred: probabilistic predictions, float vector
    #     y_label: ground truth labels, bool vector
    # Output:
    #     cross entropy, scalar
    cross_entropy = -np.dot(y_label, np.log(y_pred)) - np.dot((1 - y_label), np.log(1 - y_pred))
    return cross_entropy


def _gradient(data_grad, label_grad, y_grad):
    # This function computes the gradient of cross entropy loss with respect to weight w and bias b.
    pred_error = label_grad - y_grad
    w_gd = -np.sum(pred_error * data_grad.T, 1)
    b_gd = -np.sum(pred_error)
    return w_gd, b_gd


def train(train_data_tr, train_labels_tr, learning_rate=0.0002, batch_size=10, epochs=10):
    data_dim = train_data_tr.shape[1]
    w_tr = np.zeros([data_dim])
    b_tr = np.zeros([1])
    idxs = int(np.floor(train_data_tr.shape[0] / batch_size))
    print('idxs:{}'.format(idxs))

    train_loss_epidx_tr = np.empty([epochs*idxs])
    train_loss_ep_tr = np.empty([epochs])
    # train_y_tr = np.empty([len(train_labels_tr)])
    train_acc_tr = 0
    step = 1

    for epoch in range(epochs):
        train_data_tr, train_labels_tr = _shuffle(train_data_tr, train_labels_tr)

        for idx in range(idxs):
            train_data_bc = train_data_tr[idx * batch_size:(idx + 1) * batch_size]
            train_labels_bc = train_labels_tr[idx * batch_size:(idx + 1) * batch_size]
            # print('train_data_bc size{}'.format(train_data_bc.shape))
            train_y_bc = _sigmoid(np.dot(train_data_bc, w_tr) + b_tr)
            # print('train_y_bc size {}'.format(train_y_bc))
            # train_loss_tr = _cross_entropy_loss(train_y_bc, train_labels_bc)
            # print(train_loss_tr)
            w_grad, b_grad = _gradient(train_data_bc, train_labels_bc, train_y_bc)

            w_tr = w_tr - learning_rate * w_grad
            b_tr = b_tr - learning_rate * b_grad
            # w_tr = w_tr - learning_rate / np.sqrt(step) * w_grad
            # b_tr = b_tr - learning_rate / np.sqrt(step) * b_grad
            step += 1

            train_loss_epidx_tr[epoch * idxs + idx] = \
                _cross_entropy_loss(train_y_bc, train_labels_bc) / len(train_labels_bc)
        train_y_tr = _sigmoid(np.dot(train_data_tr, w_tr) + b_tr)
        # Note: train_y_tr holds probabilities, not rounded labels, so this "accuracy" is
        # 1 - mean|y_hat - y| rather than classification accuracy; apply np.round first for the latter.
        train_acc_tr = _accuracy(train_y_tr, train_labels_tr)
        train_loss_ep_tr[epoch] = _cross_entropy_loss(train_y_tr, train_labels_tr) / len(train_labels_tr)

    weights_tr = np.append(w_tr, b_tr)

    return weights_tr, train_loss_ep_tr, train_loss_epidx_tr, train_acc_tr

main

Finally, the learned weights are evaluated on the held-out set and the results are plotted. Nothing else needs explaining here.

def test(test_data, test_labels, test_weights):

    test_y_ts = _sigmoid(np.dot(test_data, test_weights[:-1]) + test_weights[-1])
    loss_ts = _cross_entropy_loss(test_y_ts, test_labels) / len(test_labels)
    return loss_ts, test_y_ts


def draw(datain, data_name='Loss', is_save=False):

    plt.ion()
    plt.figure()
    plt.plot(datain)
    # plt.plot(dev_loss)
    plt.title(data_name)
    # plt.legend(['train', 'dev'])
    if is_save:
        plt.savefig('{}.png'.format(data_name))
    return


def main():

    train_data, train_labels, public_data, public_labels = data_set(X_train, Y_train, permutation=True)
    # 10% of the data becomes the public (held-out) set, 90% the training data

    weights, loss_s, loss_all, train_acc = train(train_data, train_labels)
    print('Train loss {}'.format(loss_s[-1]))
    test_loss, test_y = test(public_data, public_labels, weights)
    print('Test loss {}'.format(test_loss))

    test_acc = _accuracy(test_y, public_labels)  # test_y holds probabilities here; see the note in train()
    print('train accuracy {}'.format(train_acc))
    print('test accuracy {}'.format(test_acc))

    draw(loss_s, 'loss_s')
    draw(loss_all, 'loss_all')
    plt.ioff()
    plt.show()

    return


main()

# Size of training set: 48830
# Size of public set: 5426
# Dimension of data: 510
# idxs:4883
# Train loss 0.2686176003487162
# Test loss 0.2644209674733678
# train accuracy 0.8301105719676407
# test accuracy 0.830216568393888
