kaggle实战

对应沐神B站视频15_P1


import hashlib
import os
import tarfile
import zipfile
import requests

import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l


# 如果缓存目录中已经存在此数据集文件,并且其sha-1与存储在DATA_HUB中的相匹配, 我们将使用缓存的文件,以避免重复的下载
def download(name, cache_dir=os.path.join('..', 'data')):
    """下载一个DATA_HUB中的文件,返回本地文件名"""
    assert name in DATA_HUB, f"{name} 不存在于 {DATA_HUB}"
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            while True:
                data = f.read(1048576)
                if not data:
                    break
                sha1.update(data)
        if sha1.hexdigest() == sha1_hash:
            return fname  # 命中缓存
    print(f'正在从{url}下载{fname}...')
    r = requests.get(url, stream=True, verify=True)
    with open(fname, 'wb') as f:
        f.write(r.content)
    return fname

def get_net():
    net = nn.Sequential(nn.Linear(in_features,1))   # 单层线性回归自动初始化w,b 输入331 输出1
    return net

def log_rmse(net, features, labels):             # 相对误差
    # 为了在取对数时进一步稳定该值,将小于1的值设置为1
    clipped_preds = torch.clamp(net(features), 1, float('inf'))  # y_hat[1168,1]中小于1的值置为1
    rmse = torch.sqrt(loss(torch.log(clipped_preds),
                           torch.log(labels)))
    return rmse.item()   # .item()把数据从tensor取出来,变成python的数据类型

def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []   # 创建空列表
    train_iter = d2l.load_array((train_features, train_labels), batch_size)
    # 这里使用的是Adam优化算法,类似比较平滑的SGD
    optimizer = torch.optim.Adam(net.parameters(),        # 算梯度
                                 lr = learning_rate,
                                 weight_decay = weight_decay)
    for epoch in range(num_epochs):        # 扫100遍
        for X, y in train_iter:
            optimizer.zero_grad()
            l = loss(net(X), y)
            l.backward()
            optimizer.step()
        train_ls.append(log_rmse(net, train_features, train_labels))  # net中参数变了
        # print(train_ls)
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls

def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = X.shape[0] // k   # //向下取整 1460//5=292
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)  # slice对数组部分截取,每次截取一个fold_size
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:  # 第一次看到
            X_train, y_train = X_part, y_part
        else:
            X_train = torch.cat([X_train, X_part], 0)  # 0按行拼接
            y_train = torch.cat([y_train, y_part], 0)
    return X_train, y_train, X_valid, y_valid

def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
           batch_size):
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):  # i当前第几折
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        train_l_sum += train_ls[-1]   # train_ls中最后一个元素即第i折中100个epochs后的结果
        valid_l_sum += valid_ls[-1]
        if i == 0:
            d2l.plot(list(range(1, num_epochs + 1)), [train_ls, valid_ls],
                     xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],
                     legend=['train', 'valid'], yscale='log')
        print(f'折{i + 1},训练log rmse{float(train_ls[-1]):f}, '
              f'验证log rmse{float(valid_ls[-1]):f}')
    return train_l_sum / k, valid_l_sum / k

"""
def download_extract(name, folder=None):
    '''下载并解压zip/tar文件'''
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        fp = zipfile.ZipFile(fname, 'r')
    elif ext in ('.tar', '.gz'):
        fp = tarfile.open(fname, 'r')
    else:
        assert False, '只有zip/tar文件可以被解压缩'
    fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir
def download_all():
    '''下载DATA_HUB中的所有文件'''
    for name in DATA_HUB:
        download(name)
"""
if __name__ == '__main__':
    """数据加载"""
    DATA_HUB = dict()
    DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'
    DATA_HUB['kaggle_house_train'] = (
        DATA_URL + 'kaggle_house_pred_train.csv',
        '585e9cc93e70b39160e7921475f9bcd7d31219ce')
    DATA_HUB['kaggle_house_test'] = (
        DATA_URL + 'kaggle_house_pred_test.csv',
        'fa19780a7b011d9b009e8bff8e99922a8ee2eb90')
    train_data = pd.read_csv(download('kaggle_house_train'))
    test_data = pd.read_csv(download('kaggle_house_test'))
    # print(train_data.shape)  # (1460, 81)
    # print(test_data.shape)   # (1459, 80)
    # print(train_data.iloc[0:4, :])  # .iloc取特定的行、列
    """数据处理"""
    # pd.concat图像的重塑 存在all_features里
    all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))
    # print(all_features.shape)  # (2919, 79)
    # 若无法获得测试数据,则可根据训练数据计算均值和标准差
    numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index  # 返回数值特征的索引(列名)
    # print(numeric_features)
    # lambda x: 把x传入:后表达式并返回x的值。某列中每个x做 -均值再/方差 操作 (归一化,均值为0,方差为1)
    all_features[numeric_features] = all_features[numeric_features].apply(
        lambda x: (x - x.mean()) / (x.std()))
    # 在标准化数据之后,所有均值消失,因此我们可以将缺失值(消失的均值和没采样到的值)设置为0
    all_features[numeric_features] = all_features[numeric_features].fillna(0)
    # “Dummy_na=True”将“na”(缺失值)视为有效的特征值,并为其创建指示符特征
    all_features = pd.get_dummies(all_features, dummy_na=True)  # one-hot编码
    # print(all_features.shape) #(2919, 331)
    """从pandas格式中提取NumPy格式,并将其转换为张量表示"""
    n_train = train_data.shape[0]  # 1460
    train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)  # [1460,331] 行列
    test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float32)   # [1459,331]
    train_labels = torch.tensor(                                                       # [1460,1]
        train_data.SalePrice.values.reshape(-1, 1), dtype=torch.float32)
    """训练"""
    loss = nn.MSELoss()   # 使用均方误差,也称为平方 L2 范数,默认情况下,它返回所有样本损失的平均值
    in_features = train_features.shape[1]  # 331
    k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
    train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr,
                              weight_decay, batch_size)
    print(f'{k}-折验证: 平均训练log rmse: {float(train_l):f}, '
          f'平均验证log rmse: {float(valid_l):f}')
    d2l.plt.show()

笔记

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

CPU疼

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值