Dive into Deep Learning (PyTorch Edition) Code Practice - Deep Learning Basics - 13 Kaggle Competition: 2020 California House Price Prediction

13 Kaggle Competition: 2020 California House Price Prediction

# Import the required libraries
import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l

# Read the training and test data
train_data = pd.read_csv('../data/california-house-prices/train.csv')
test_data = pd.read_csv('../data/california-house-prices/test.csv')

# Print the data shapes
# print(train_data.shape)
# print(test_data.shape)
# (47439, 41)
# (31626, 40)

# Print selected columns of the first 4 rows
# print(train_data.iloc[0:4, [0, 1, 2, 3, 4, 5, 6, -3, -2, -1]])

# Concatenate training and test data for joint feature engineering
all_features = pd.concat((train_data.iloc[:, train_data.columns != 'Sold Price'], test_data.iloc[:, 1:]))
# all_features.info()
# print(all_features.shape)
# (79065, 40)

# Drop the Id column
all_features = all_features.iloc[:, 1:]

# Convert the string date columns to datetime
all_features['Listed On'] = pd.to_datetime(all_features['Listed On'], format="%Y-%m-%d")
all_features['Last Sold On'] = pd.to_datetime(all_features['Last Sold On'], format="%Y-%m-%d")
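
Note that after this conversion the two date columns have dtype datetime64, not object, so the numeric mask below picks them up and standardizes them along with the ordinary numeric columns. Recent pandas can take the mean/std of datetime columns directly; if your version cannot, a hedged fallback (a sketch, not part of the original post) is to convert them to integer nanosecond timestamps first:

# Fallback for older pandas, only needed if the standardization below
# raises a TypeError on the datetime columns:
# for col in ['Listed On', 'Last Sold On']:
#     all_features[col] = all_features[col].astype('int64')  # ns since epoch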

# Standardize the numeric features
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std()))
all_features[numeric_features] = all_features[numeric_features].fillna(0)
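
Each numeric column x is rescaled to (x - mean) / std, so filling the remaining NaNs with 0 afterwards amounts to imputing the column mean. A quick sanity check in the same commented-out style (a sketch, not from the original post):

# print(all_features[numeric_features].mean().abs().max())  # should be ~0
# print(all_features[numeric_features].std().round(2).unique())
# stds should be ~1; columns with many imputed zeros end up slightly below 1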

# Print the number of unique values in each object (string) column
# for in_object in all_features.dtypes[all_features.dtypes == 'object'].index:
#     print(in_object.ljust(20), len(all_features[in_object].unique()))
"""
in_object.ljust(20):将列名左对齐,并填充空格使其长度至少为20个字符,这样打印时更整齐。
len(all_features[in_object].unique()):计算该列中唯一值的数量。
便于后续的独热编码,防止内存爆炸
"""

# Select the features to keep
features = list(numeric_features)
features.extend(['Type'])
all_features = all_features[features]

# One-hot encoding
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)
# print(all_features.shape)
# (79065, 195)
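
Here dummy_na=True gives missing values their own indicator column, and dtype=float makes the indicators numeric rather than boolean. A minimal demo (toy frame, not the real data):

# import pandas as pd  # already imported above; repeat if running standalone
demo = pd.DataFrame({'Type': ['Condo', 'SingleFamily', None]})
print(pd.get_dummies(demo, dummy_na=True, dtype=float))
#    Type_Condo  Type_SingleFamily  Type_nan
# 0         1.0                0.0       0.0
# 1         0.0                1.0       0.0
# 2         0.0                0.0       1.0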

# Inspect the dtypes of all features
# print(all_features.dtypes.unique())

# Extract NumPy arrays from the pandas DataFrames and convert them to tensors for training
n_train = train_data.shape[0]
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float32)

train_labels = torch.tensor(
    train_data['Sold Price'].values.reshape(-1, 1),
    dtype=torch.float32
)

# Check whether a GPU is available for training
if not torch.cuda.is_available():
    print('CUDA is not available. Training on CPU ...')
else:
    print('CUDA is available. Training on GPU ...')
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Move the features and labels to the device
train_features = train_features.to(device)
test_features = test_features.to(device)
train_labels = train_labels.to(device)

# Mean squared error loss
loss = nn.MSELoss()

# Number of input features
in_features = train_features.shape[1]
# print(in_features)
# 195

# Define the neural network model
dropout1, dropout2, dropout3 = 0.2, 0.3, 0.5  # dropout rates (the Dropout layers below are commented out)

def get_net():
    net = nn.Sequential(
        nn.Flatten(),
        nn.Linear(in_features, 128), nn.ReLU(),
        # nn.Dropout(dropout1),

        nn.Linear(128, 64), nn.ReLU(),
        # nn.Dropout(dropout2),

        nn.Linear(64, 32), nn.ReLU(),
        # nn.Dropout(dropout3),

        nn.Linear(32, 1))
    return net.to(device)  # move the model to the GPU
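
For reference, this MLP (in_features = 195 → 128 → 64 → 32 → 1) has 195·128+128 + 128·64+64 + 64·32+32 + 32·1+1 = 35,457 trainable parameters, which you can confirm with:

# print(sum(p.numel() for p in get_net().parameters()))  # 35457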

# Compute the log RMSE
def log_rmse(net, features, labels):
    """
    torch.clamp bounds the predictions below by 1, so every prediction is at least 1.
    This avoids taking the log of zero or a negative value, where the logarithm is
    undefined or numerically problematic.
    """
    clipped_preds = torch.clamp(net(features), 1, float('inf'))
    rmse = torch.sqrt(loss(torch.log(clipped_preds), torch.log(labels)))

    # Convert the PyTorch tensor to a Python scalar
    return rmse.item()
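
For reference, the quantity computed here is the root of the mean squared error between log prices,

$$\sqrt{\frac{1}{n}\sum_{i=1}^{n}\bigl(\log \hat{y}_i - \log y_i\bigr)^2},$$

which penalizes relative rather than absolute price errors, the usual choice for house-price competitions.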

# Model training function
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):

    train_ls, test_ls = [], []  # per-epoch training and test losses
    train_iter = d2l.load_array((train_features, train_labels), batch_size)  # training data iterator
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate,
                                 weight_decay=weight_decay)  # Adam optimizer
    # weight_decay: weight decay coefficient, used for L2 regularization.

    for epoch in range(num_epochs):
        for X, y in train_iter:
            X, y = X.to(device), y.to(device)  # make sure the batch is on the device
            optimizer.zero_grad()  # clear the gradients
            l = loss(net(X), y)  # compute the loss
            l.backward()  # backpropagate
            optimizer.step()  # update the model parameters

        # Compute and record the log RMSE on the training set.
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            # Compute and record the log RMSE on the test set
            test_ls.append(log_rmse(net, test_features, test_labels))

    return train_ls, test_ls
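
d2l.load_array above is a thin wrapper over TensorDataset and DataLoader; a minimal equivalent in plain PyTorch, if you'd rather not depend on d2l, looks like:

from torch.utils.data import DataLoader, TensorDataset

def load_array(data_arrays, batch_size, is_train=True):
    # Wrap the tensors in a TensorDataset and iterate over (shuffled) minibatches.
    dataset = TensorDataset(*data_arrays)
    return DataLoader(dataset, batch_size, shuffle=is_train)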

# K-fold cross-validation
# Select the i-th slice as the validation data and the rest as the training data
def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = X.shape[0] // k
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = torch.cat([X_train, X_part], 0)
            y_train = torch.cat([y_train, y_part], 0)
    return X_train.to(device), y_train.to(device), X_valid.to(device), y_valid.to(device)  # keep everything on the GPU
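
One caveat: fold_size = X.shape[0] // k floors the division, so when k does not divide the sample count the last rows are silently dropped (47439 // 5 = 9487, so 4 of the 47439 training rows never appear in any fold). A hypothetical index helper (not from the original post) that covers every row would look like:

def get_k_fold_slices(n, k):
    # k contiguous index ranges that together cover all n samples.
    bounds = [n * j // k for j in range(k + 1)]
    return [slice(bounds[j], bounds[j + 1]) for j in range(k)]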

# Train K times in K-fold cross-validation and return the averaged training and validation errors.
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
           batch_size):
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        # Accumulate the final-epoch training loss of this fold into train_l_sum.
        valid_l_sum += valid_ls[-1]
        if i == 0:
            d2l.plot(list(range(1, num_epochs + 1)), [train_ls, valid_ls],
                     xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],
                     legend=['train', 'valid'], yscale='log')
        print(f'fold {i + 1}: train log rmse {float(train_ls[-1]):f}, '
              f'valid log rmse {float(valid_ls[-1]):f}')
    return train_l_sum / k, valid_l_sum / k

# Training hyperparameters
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 0.01, 0, 256

# Run K-fold cross-validation
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr,
                          weight_decay, batch_size)
print(f'{k}-fold validation: avg train log rmse: {float(train_l):f}, '
      f'avg valid log rmse: {float(valid_l):f}')

d2l.plt.show() 

# Train on all the data and generate the Kaggle submission
def train_and_pred(train_features, test_features, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    net = get_net()
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    d2l.plot(np.arange(1, num_epochs + 1), [train_ls], xlabel='epoch',
             ylabel='log rmse', xlim=[1, num_epochs], yscale='log')
    print(f'train log rmse: {float(train_ls[-1]):f}')
    # Apply the network to the test set; move the result from GPU to CPU and convert to NumPy
    preds = net(test_features).detach().cpu().numpy()
    # Reformat the predictions for export to Kaggle
    test_data['Sold Price'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test_data['Id'], test_data['Sold Price']], axis=1)
    submission.to_csv('../data/california-house-prices/submission.csv', index=False)
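
Before uploading, it's worth a quick check that the submission has the shape and columns Kaggle expects (a sketch in the same commented-out style; the expected row count comes from test_data above):

# sub = pd.read_csv('../data/california-house-prices/submission.csv')
# print(sub.shape)    # expect (31626, 2)
# print(sub.columns)  # expect ['Id', 'Sold Price']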

# Train the model and make predictions
train_and_pred(train_features, test_features, train_labels, test_data,
               num_epochs, lr, weight_decay, batch_size)

d2l.plt.show()

Output:

fold 1: train log rmse 0.356748, valid log rmse 0.331666
fold 2: train log rmse 0.337252, valid log rmse 0.341875
fold 3: train log rmse 0.317294, valid log rmse 0.324516
fold 4: train log rmse 0.337175, valid log rmse 0.360625
fold 5: train log rmse 0.356537, valid log rmse 0.379667
5-fold validation: avg train log rmse: 0.341001, avg valid log rmse: 0.347670
train log rmse: 0.307162

[Figures: log rmse curves for the first K-fold split and for the final training run]

Competition score:

[Figure: Kaggle leaderboard score]
