Hung-yi Lee Machine Learning 2021 Series, Homework 2 (Income Prediction): Implementing logistic regression with gradient descent in 7 steps

Binary classification is one of the most fundamental problems in machine learning. In this tutorial you will implement a linear binary classifier that uses a person's profile attributes to predict whether their annual income exceeds 50,000 USD. The assignment offers two methods, logistic regression and a generative model; it is worth comparing the design philosophy of each and the differences between them. The binary classification task:

Does a person's annual income exceed 50,000 USD?
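
Before the code, a quick sketch of how the two approaches differ (generic notation, not tied to the variable names used below). Logistic regression models the posterior directly and learns w and b by gradient descent:

P(y = 1 \mid x) = \sigma(w^\top x + b), \qquad \sigma(z) = \frac{1}{1 + e^{-z}}

A generative model instead assumes each class generates x from a Gaussian with a shared covariance matrix; Bayes' rule then gives a posterior of the same sigmoid-of-linear form, but with w and b computed in closed form from the class means and covariance rather than learned iteratively. This post implements the first approach.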

Let's go straight to the code.

1. Load the data

import numpy as np
import matplotlib.pyplot as plt

np.random.seed(0)
X_train_fpath = 'work/data/X_train'
Y_train_fpath = 'work/data/Y_train'
X_test_fpath = 'work/data/X_test'
output_fpath = 'work/output_{}.csv'

# Parse the csv files into numpy arrays
with open(X_train_fpath) as f:
    next(f)
    X_train = np.array([line.strip('\n').split(',')[1:] for line in f], dtype = float)
with open(Y_train_fpath) as f:
    next(f)
    Y_train = np.array([line.strip('\n').split(',')[1] for line in f], dtype = float)
with open(X_test_fpath) as f:
    next(f)
    X_test = np.array([line.strip('\n').split(',')[1:] for line in f], dtype = float)

2. Standardization / normalization

def _normalize(X, train=True, specified_column=None, X_mean=None, X_std=None):
    # This function normalizes specific columns of X.
    # The mean and standard deviation of the training data are reused
    # when the test data is processed.
    # Arguments:
    #     X: data to be processed
    #     train: 'True' when processing the training set, 'False' for the test set
    #     specified_column: indices of the columns to be normalized;
    #                       if 'None', all columns are normalized
    #     X_mean: mean of the training data, used only when train = 'False'
    #     X_std: standard deviation of the training data, used only when train = 'False'
    # Returns:
    #     X: normalized data
    #     X_mean: mean of the training data
    #     X_std: standard deviation of the training data
    if specified_column is None:
        specified_column = np.arange(X.shape[1])
    if train:
        X_mean = np.mean(X[:, specified_column], 0).reshape(1, -1)
        X_std = np.std(X[:, specified_column], 0).reshape(1, -1)

    X[:, specified_column] = (X[:, specified_column] - X_mean) / (X_std + 1e-8)
    return X, X_mean, X_std

3. Split into training and development sets

def _train_dev_split(X, Y, dev_ratio = 0.25):
    # This function splits the data into a training set and a development set.
    train_size = int(len(X) * (1 - dev_ratio))
    return X[:train_size], Y[:train_size], X[train_size:], Y[train_size:]

# Normalize the training and test data first
X_train, X_mean, X_std = _normalize(X_train, train = True)
X_test, _, _ = _normalize(X_test, train = False, specified_column = None, X_mean = X_mean, X_std = X_std)
    
# Split the data into training and development sets
dev_ratio = 0.1
# 9:1 split
X_train, Y_train, X_eval, Y_eval = _train_dev_split(X_train, Y_train, dev_ratio = dev_ratio)

# Record sizes and dimension; these are reused by the training loop below
train_size = X_train.shape[0]
eval_size = X_eval.shape[0]
data_dim = X_train.shape[1]

print('Size of training set: {}'.format(train_size))
print('Size of eval set: {}'.format(eval_size))
print('Size of testing set: {}'.format(X_test.shape[0]))
print('Dimension of data: {}'.format(data_dim))

4. Utility functions

def _shuffle(X, Y):
    # This function shuffles two equal-length lists/arrays, X and Y, in unison.
    randomize = np.arange(len(X))
    np.random.shuffle(randomize)
    return (X[randomize], Y[randomize])

def _sigmoid(z):
    # The sigmoid function, used to compute probabilities.
    # The output is clamped to a min/max value to avoid overflow and log(0) downstream.
    return np.clip(1 / (1.0 + np.exp(-z)), 1e-8, 1 - (1e-8))
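
A quick illustration of why the clipping matters (a minimal sketch; the 1e-8 bound matches the function above). Without it, saturated inputs yield probabilities of exactly 0 or 1, and the log terms in the cross-entropy below become -inf:

import numpy as np

z = np.array([-1000.0, 1000.0])
raw = 1 / (1.0 + np.exp(-z))             # saturates to [0.0, 1.0] (with a RuntimeWarning)
safe = np.clip(raw, 1e-8, 1 - 1e-8)
print(np.log(raw))                       # [-inf, 0.0] -- breaks the loss
print(np.log(safe))                      # finite: [-18.42..., ~-1e-08]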

def _f(X, w, b):
    # The logistic regression function, parameterized by w and b.
    # Arguments:
    #     X: input data, shape = [batch_size, data_dimension]
    #     w: weight vector, shape = [data_dimension, ]
    #     b: bias, scalar
    # Returns:
    #     predicted probability of the positive label for each row of X, shape = [batch_size, ]
    return _sigmoid(np.matmul(X, w) + b)

def _predict(X, w, b):
    # This function returns a 0/1 prediction for each row of X
    # by rounding the output of the logistic regression function.
    return np.round(_f(X, w, b)).astype(int)
    
def _accuracy(Y_pred, Y_label):
    # This function computes the prediction accuracy. With 0/1 predictions
    # and labels, the mean absolute error is exactly the error rate.
    acc = 1 - np.mean(np.abs(Y_pred - Y_label))
    return acc

5. Loss function and gradient

# Cross-entropy loss; with 0/1 labels, one of the two terms vanishes for each sample
def _cross_entropy_loss(y_pred, Y_label):
    # This function computes the cross-entropy.
    #
    # Arguments:
    #     y_pred: probability predictions, float vector
    #     Y_label: ground-truth labels, boolean vector
    # Returns:
    #     cross entropy, scalar
    cross_entropy = -np.dot(Y_label, np.log(y_pred)) - np.dot((1 - Y_label), np.log(1 - y_pred))
    return cross_entropy
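
For reference, the quantity computed here is the unaveraged binary cross-entropy, where y^n is the label of sample n and f_{w,b}(x^n) the predicted probability:

L(w, b) = -\sum_{n} \left[ y^n \ln f_{w,b}(x^n) + (1 - y^n) \ln\left(1 - f_{w,b}(x^n)\right) \right]

With 0/1 labels, exactly one of the two terms survives for each sample, which is what the comment above alludes to.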

def _gradient(X, Y_label, w, b):
    # This function computes the gradient of the cross-entropy loss
    # with respect to the weights w and the bias b.
    y_pred = _f(X, w, b)
    pred_error = Y_label - y_pred
    w_grad = -np.sum(pred_error * X.T, 1)
    b_grad = -np.sum(pred_error)
    return w_grad, b_grad
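
These two lines implement the closed-form gradient of the loss above; for the sigmoid paired with cross-entropy, the derivative reduces to a sum of prediction errors:

\frac{\partial L}{\partial w} = -\sum_{n} \left( y^n - f_{w,b}(x^n) \right) x^n, \qquad
\frac{\partial L}{\partial b} = -\sum_{n} \left( y^n - f_{w,b}(x^n) \right)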

6. Training and visualization
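
The loop below runs mini-batch gradient descent with a step size that decays as 1/sqrt(t), where t counts parameter updates. Despite the adagrad_step variable name, this is plain time-based decay, not Adagrad (which would divide by the accumulated squared gradients):

w \leftarrow w - \frac{\eta}{\sqrt{t}} \nabla_w L, \qquad
b \leftarrow b - \frac{\eta}{\sqrt{t}} \frac{\partial L}{\partial b}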

# Initialize w and b
w = np.zeros((data_dim, )) 
b = np.zeros((1,))

# Training hyperparameters
EPOCH = 10
batch_size = 128
learning_rate = 0.2

# Record the loss and accuracy at every step for plotting
train_loss = []
eval_loss = []
train_acc = []
eval_acc = []

# Count the number of parameter updates (the t in the 1/sqrt(t) learning-rate decay)
adagrad_step = 1
# Iterative training
for epoch in range(EPOCH):
    # Shuffle the data at the start of each epoch
    X_train, Y_train = _shuffle(X_train, Y_train)
        
    # Mini-batch training
    step = 0
    steps = int(np.floor(train_size / batch_size))
    for idx in range(steps):  # e.g. floor(48830/128) = 381
        X = X_train[idx*batch_size:(idx+1)*batch_size]
        Y = Y_train[idx*batch_size:(idx+1)*batch_size]

        # Compute the gradient
        w_grad, b_grad = _gradient(X, Y, w, b)
            
        # gradient descent update
        # learning rate decay with time
        w = w - learning_rate/np.sqrt(adagrad_step) * w_grad
        b = b - learning_rate/np.sqrt(adagrad_step) * b_grad
        
        step += 1
        adagrad_step += 1

        # Compute loss and accuracy of training set and development set
        y_train_pred = _f(X_train, w, b)
        Y_train_pred = np.round(y_train_pred)
        y_eval_pred = _f(X_eval, w, b)
        Y_eval_pred = np.round(y_eval_pred)

        acc_train = _accuracy(Y_train_pred, Y_train)
        loss_train = _cross_entropy_loss(y_train_pred, Y_train) / train_size
        acc_eval = _accuracy(Y_eval_pred, Y_eval)
        loss_eval = _cross_entropy_loss(y_eval_pred, Y_eval) / eval_size

        if step % 50 == 0 or step == steps:
            print(f'Epoch {epoch}/{EPOCH}, step {step}/{steps} : train_loss = {loss_train}, train_acc = {acc_train}, eval_loss = {loss_eval}, eval_acc = {acc_eval}')

        train_acc.append(acc_train)
        train_loss.append(loss_train)
        eval_acc.append(acc_eval)
        eval_loss.append(loss_eval)

print('Training loss: {}'.format(train_loss[-1]))
print('Eval loss: {}'.format(eval_loss[-1]))
print('Training accuracy: {}'.format(train_acc[-1]))
print('Eval accuracy: {}'.format(eval_acc[-1]))

# Loss curve
# plt.figure(figsize=(16, 8))
plt.plot(train_loss)
plt.plot(eval_loss)
plt.title('Loss')
plt.legend(['train', 'eval'])
plt.savefig('loss.png')
plt.show()

# Accuracy curve
# plt.figure(figsize=(16, 8))
plt.plot(train_acc)
plt.plot(eval_acc)
plt.title('Accuracy')
plt.legend(['train', 'eval'])
plt.savefig('acc.png')
plt.show()

7. Prediction

# Write results to output_logistic.csv
# Predict testing labels
predictions = _predict(X_test, w, b)
with open(output_fpath.format('logistic'), 'w') as f:
    f.write('id,label\n')
    for i, label in  enumerate(predictions):
        f.write('{},{}\n'.format(i, label))

# Print out the most significant weights
ind = np.argsort(np.abs(w))[::-1]
# The header's first column is the id, which was dropped when parsing X,
# so drop it here too to keep feature indices aligned with w
with open(X_test_fpath) as f:
    content = f.readline().strip('\n').split(',')
features = np.array(content[1:])
for i in ind[0:10]:
    print(features[i], w[i])
