# Corresponds to Mu Li's Bilibili video 15 (P1): Kaggle house price prediction
import hashlib
import os
import tarfile
import zipfile
import requests
import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l
# If the dataset file already exists in the cache directory and its SHA-1 matches
# the one stored in DATA_HUB, use the cached file to avoid a repeated download.
def download(name, cache_dir=os.path.join('..', 'data')):
"""下载一个DATA_HUB中的文件,返回本地文件名"""
assert name in DATA_HUB, f"{name} 不存在于 {DATA_HUB}"
url, sha1_hash = DATA_HUB[name]
os.makedirs(cache_dir, exist_ok=True)
fname = os.path.join(cache_dir, url.split('/')[-1])
if os.path.exists(fname):
sha1 = hashlib.sha1()
with open(fname, 'rb') as f:
while True:
                data = f.read(1048576)  # hash the file in 1 MiB chunks
if not data:
break
sha1.update(data)
if sha1.hexdigest() == sha1_hash:
            return fname  # cache hit
    print(f'Downloading {fname} from {url}...')
r = requests.get(url, stream=True, verify=True)
with open(fname, 'wb') as f:
f.write(r.content)
return fname
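# A minimal usage sketch (the names are registered in DATA_HUB inside the
# main block below):
#   fname = download('kaggle_house_train')  # e.g. '../data/kaggle_house_pred_train.csv'
#   df = pd.read_csv(fname)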
def get_net():
    net = nn.Sequential(nn.Linear(in_features, 1))  # single linear layer (linear regression); weights and bias auto-initialized; 331 inputs -> 1 output
return net
def log_rmse(net, features, labels):  # measures relative error
    # To stabilize the value when taking logs, clamp predictions below 1 to 1
    clipped_preds = torch.clamp(net(features), 1, float('inf'))  # entries of y_hat ([1168, 1] during k-fold training) below 1 are set to 1
rmse = torch.sqrt(loss(torch.log(clipped_preds),
torch.log(labels)))
    return rmse.item()  # .item() extracts the scalar from the tensor as a plain Python float
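# The quantity above is sqrt(mean((log(y_hat) - log(y))**2)), i.e. the RMSE
# between log-prices: Kaggle scores house prices by relative error, and
# log-space errors weight cheap and expensive houses symmetrically.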
def train(net, train_features, train_labels, test_features, test_labels,
num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []  # per-epoch log-rmse records
train_iter = d2l.load_array((train_features, train_labels), batch_size)
    # Adam optimizer: roughly a smoother variant of minibatch SGD, less sensitive to the learning rate
    optimizer = torch.optim.Adam(net.parameters(),
lr = learning_rate,
weight_decay = weight_decay)
    for epoch in range(num_epochs):  # num_epochs full passes over the training data
for X, y in train_iter:
optimizer.zero_grad()
l = loss(net(X), y)
l.backward()
optimizer.step()
        train_ls.append(log_rmse(net, train_features, train_labels))  # evaluated with the epoch's updated parameters
if test_labels is not None:
test_ls.append(log_rmse(net, test_features, test_labels))
return train_ls, test_ls
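# A standalone-training sketch (illustrative hyperparameters; in this script
# train() is always driven by k_fold below):
#   net = get_net()
#   train_ls, _ = train(net, train_features, train_labels, None, None,
#                       num_epochs=100, learning_rate=5, weight_decay=0,
#                       batch_size=64)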
def get_k_fold_data(k, i, X, y):
assert k > 1
    fold_size = X.shape[0] // k  # // is floor division: 1460 // 5 = 292
X_train, y_train = None, None
for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)  # slice selecting the j-th block of fold_size rows
X_part, y_part = X[idx, :], y[idx]
if j == i:
X_valid, y_valid = X_part, y_part
        elif X_train is None:  # first training fold encountered
X_train, y_train = X_part, y_part
else:
            X_train = torch.cat([X_train, X_part], 0)  # dim 0: concatenate along rows
y_train = torch.cat([y_train, y_part], 0)
return X_train, y_train, X_valid, y_valid
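# Worked example with k=5 on 1460 rows: fold_size = 292, fold j covers rows
# [292*j, 292*(j+1)); fold i becomes the validation set and the remaining
# four folds are concatenated row-wise into a [1168, 331] training set. If
# X.shape[0] were not divisible by k, the trailing rows would be dropped.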
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
batch_size):
train_l_sum, valid_l_sum = 0, 0
    for i in range(k):  # i indexes the current validation fold
data = get_k_fold_data(k, i, X_train, y_train)
net = get_net()
train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
weight_decay, batch_size)
        train_l_sum += train_ls[-1]  # last element = log rmse after the final epoch of fold i
valid_l_sum += valid_ls[-1]
if i == 0:
d2l.plot(list(range(1, num_epochs + 1)), [train_ls, valid_ls],
xlabel='epoch', ylabel='rmse', xlim=[1, num_epochs],
legend=['train', 'valid'], yscale='log')
        print(f'fold {i + 1}, train log rmse {float(train_ls[-1]):f}, '
              f'valid log rmse {float(valid_ls[-1]):f}')
return train_l_sum / k, valid_l_sum / k
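# A possible tuning loop built on k_fold (hypothetical grid, not part of the
# original run):
#   for lr_cand in (1, 5, 10):
#       for wd_cand in (0, 1e-3, 1e-2):
#           _, vl = k_fold(5, train_features, train_labels, 100, lr_cand,
#                          wd_cand, 64)
#           print(f'lr={lr_cand}, weight_decay={wd_cand}, valid log rmse={vl:f}')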
"""
def download_extract(name, folder=None):
    '''Download and extract a zip/tar file.'''
fname = download(name)
base_dir = os.path.dirname(fname)
data_dir, ext = os.path.splitext(fname)
if ext == '.zip':
fp = zipfile.ZipFile(fname, 'r')
elif ext in ('.tar', '.gz'):
fp = tarfile.open(fname, 'r')
else:
        assert False, 'Only zip/tar files can be extracted'
fp.extractall(base_dir)
return os.path.join(base_dir, folder) if folder else data_dir
def download_all():
    '''Download all files in DATA_HUB.'''
for name in DATA_HUB:
download(name)
"""
if __name__ == '__main__':
"""数据加载"""
DATA_HUB = dict()
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'
DATA_HUB['kaggle_house_train'] = (
DATA_URL + 'kaggle_house_pred_train.csv',
'585e9cc93e70b39160e7921475f9bcd7d31219ce')
DATA_HUB['kaggle_house_test'] = (
DATA_URL + 'kaggle_house_pred_test.csv',
'fa19780a7b011d9b009e8bff8e99922a8ee2eb90')
train_data = pd.read_csv(download('kaggle_house_train'))
test_data = pd.read_csv(download('kaggle_house_test'))
# print(train_data.shape) # (1460, 81)
# print(test_data.shape) # (1459, 80)
    # print(train_data.iloc[0:4, :])  # .iloc selects rows/columns by position
"""数据处理"""
# pd.concat图像的重塑 存在all_features里
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))
# print(all_features.shape) # (2919, 79)
    # If test data were unavailable, mean and std could be computed from the training data alone
    numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index  # column names of the numeric features
# print(numeric_features)
    # Standardize each numeric column: x <- (x - mean) / std, giving mean 0 and std 1
all_features[numeric_features] = all_features[numeric_features].apply(
lambda x: (x - x.mean()) / (x.std()))
    # After standardization every column mean is 0, so missing values can be filled with 0 (i.e., the mean)
all_features[numeric_features] = all_features[numeric_features].fillna(0)
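    # Optional sanity check: after standardizing and filling NaNs with the
    # mean (0), each numeric column should have mean ~0 up to float error:
    #   print(all_features[numeric_features].mean().abs().max())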
    # dummy_na=True treats NaN (missing) as a valid category and creates an indicator column for it
    all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)  # one-hot encoding; dtype=float since newer pandas would emit bool dummies and break torch.tensor below
# print(all_features.shape) #(2919, 331)
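    # Example of the expansion: a column such as MSZoning with values
    # 'RL', 'RM', NaN becomes indicator columns MSZoning_RL, MSZoning_RM and
    # MSZoning_nan, which is how 79 raw features grow to 331 columns.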
"""从pandas格式中提取NumPy格式,并将其转换为张量表示"""
n_train = train_data.shape[0] # 1460
    train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)  # shape [1460, 331]
    test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float32)  # shape [1459, 331]
    train_labels = torch.tensor(  # shape [1460, 1]
train_data.SalePrice.values.reshape(-1, 1), dtype=torch.float32)
"""训练"""
loss = nn.MSELoss() # 使用均方误差,也称为平方 L2 范数,默认情况下,它返回所有样本损失的平均值
in_features = train_features.shape[1] # 331
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr,
weight_decay, batch_size)
    print(f'{k}-fold validation: avg train log rmse: {float(train_l):f}, '
          f'avg valid log rmse: {float(valid_l):f}')
d2l.plt.show()
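    # Sketch of how the so-far-unused test_features could produce a Kaggle
    # submission, mirroring the d2l chapter's train_and_pred (not executed):
    #   net = get_net()
    #   train(net, train_features, train_labels, None, None,
    #         num_epochs, lr, weight_decay, batch_size)
    #   preds = net(test_features).detach().numpy()
    #   test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    #   submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
    #   submission.to_csv('submission.csv', index=False)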