House Price Prediction
import hashlib
import os
import tarfile
import zipfile
import requests
DATA_HUB = dict()
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'
def download(name, cache_dir=os.path.join('.', 'data')):
    """Download a file registered in DATA_HUB and return the local file name."""
    assert name in DATA_HUB, f"{name} does not exist in {DATA_HUB}"
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):
        # If a cached copy exists, verify its SHA-1 hash to avoid re-downloading
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            while True:
                data = f.read(1048576)
                if not data:
                    break
                sha1.update(data)
        if sha1.hexdigest() == sha1_hash:
            return fname
    print(f'Downloading {fname} from {url}...')
    r = requests.get(url, stream=True, verify=True)
    with open(fname, 'wb') as f:
        f.write(r.content)
    return fname
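Before the real Kaggle entries are registered further below, here is a minimal usage sketch of the helper (it registers the same kaggle_house_train entry that also appears later, so running both cells is harmless):

# Usage sketch: register an entry, then download; repeated calls hit the
# SHA-1-verified cache under ./data instead of re-downloading.
DATA_HUB['kaggle_house_train'] = (
    DATA_URL + 'kaggle_house_pred_train.csv',
    '585e9cc93e70b39160e7921475f9bcd7d31219ce')
fname = download('kaggle_house_train')   # first call downloads the file
fname = download('kaggle_house_train')   # second call returns the cached copy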
def download_extract(name, folder=None):
    """Download and extract a zip/tar file, returning the data directory."""
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        fp = zipfile.ZipFile(fname, 'r')
    elif ext in ('.tar', '.gz'):
        fp = tarfile.open(fname, 'r')
    else:
        assert False, 'Only zip/tar files can be extracted'
    fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir
def download_all():
    """Download all files in DATA_HUB."""
    for name in DATA_HUB:
        download(name)
%matplotlib inline
import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l
DATA_HUB['kaggle_house_train'] = (
    DATA_URL + 'kaggle_house_pred_train.csv',
    '585e9cc93e70b39160e7921475f9bcd7d31219ce')
DATA_HUB['kaggle_house_test'] = (
    DATA_URL + 'kaggle_house_pred_test.csv',
    'fa19780a7b011d9b009e8bff8e99922a8ee2eb90')
train_data = pd.read_csv(download('kaggle_house_train'))
test_data = pd.read_csv(download('kaggle_house_test'))
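For the standard Kaggle house-price CSVs, the training set has 1460 rows and 81 columns (Id, 79 features, SalePrice) and the test set has 1459 rows and 80 columns (no SalePrice). A quick check:

print(train_data.shape, test_data.shape)   # expected: (1460, 81) (1459, 80)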
# Concatenate train and test features (drop the Id column and the training label)
all_feature = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))
numeric_features = all_feature.dtypes[all_feature.dtypes != 'object'].index
# Standardize numeric columns to zero mean and unit variance
all_feature[numeric_features] = all_feature[numeric_features].apply(
    lambda x: (x - x.mean()) / x.std()
)
# After standardization the mean is 0, so missing values can be set to 0
all_feature[numeric_features] = all_feature[numeric_features].fillna(0)
# One-hot encode categorical columns; dummy_na=True treats NaN as its own category,
# dtype=float keeps the dummy columns numeric (newer pandas defaults to bool)
all_feature = pd.get_dummies(all_feature, dummy_na=True, dtype=float)
train_data_split = train_data.shape[0]
train_dataset = torch.tensor(all_feature[:train_data_split].values, dtype=torch.float32)
test_dataset = torch.tensor(all_feature[train_data_split:].values, dtype=torch.float32)
train_labels = torch.tensor(train_data.SalePrice.values.reshape(-1, 1), dtype=torch.float32)
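After one-hot encoding the feature count grows substantially; with the d2l version of this preprocessing it comes out to 331 columns, though the exact number can vary slightly across pandas versions. A quick shape check:

# Roughly expected: torch.Size([1460, 331]) torch.Size([1459, 331]) torch.Size([1460, 1])
print(train_dataset.shape, test_dataset.shape, train_labels.shape)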
DataLoader
import torch.utils.data as data
import torch
# Concatenate features and labels column-wise so each DataLoader batch carries
# both; the label is split back off inside the training loop.
train_d = torch.cat([train_dataset, train_labels], dim=1)
train_dataloader = data.DataLoader(train_d, batch_size=126, shuffle=True)
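As an aside, the same loader can be built with TensorDataset, which yields (features, label) pairs per batch so the training loop would not need to slice the label column back out; a minimal sketch with the same batch size:

from torch.utils.data import TensorDataset, DataLoader

# Alternative sketch: keep features and labels as separate tensors per batch
train_iter = DataLoader(TensorDataset(train_dataset, train_labels),
                        batch_size=126, shuffle=True)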
Define the loss and the net
loss = nn.MSELoss()
in_features = train_dataset.shape[1]
def get_net():
    # A single linear layer: a plain linear-regression baseline
    net = nn.Sequential(nn.Linear(in_features, 1))
    return net
net = get_net()
optimizer = torch.optim.Adam(net.parameters(), lr=0.01)
Define the error metric
# Relative error (y - ŷ) / y is what we care about, which motivates comparing log prices
def log_rmse(net, features, labels):
    # Clamp predictions to at least 1 so that taking the log is stable
    clipped_preds = torch.clamp(net(features), 1, float('inf'))
    rmse = torch.sqrt(loss(torch.log(clipped_preds), torch.log(labels)))
    return rmse.item()
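For reference, the quantity returned by log_rmse is

\sqrt{\frac{1}{n}\sum_{i=1}^{n}\bigl(\log \hat{y}_i - \log y_i\bigr)^2}

i.e. the root-mean-squared error between log prices, which is the metric the Kaggle competition scores on; an error of 0.1 corresponds roughly to a 10% relative error in the predicted price.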
Model training
epochs = 100
train_ls, log_mse = [], []
for epoch in range(epochs):
    net.train()  # put the model in training mode
    ls_train = 0
    for X in train_dataloader:
        # Each batch row is [features..., label]; split the label back off
        data = X[:, :-1]
        label = X[:, -1].reshape(-1, 1)
        optimizer.zero_grad()
        pred = net(data)
        l = loss(pred, label)
        l.backward()
        optimizer.step()
        ls_train += l.item()
    train_ls.append(ls_train)
    # Evaluate log RMSE on the full training set after each epoch
    preb_mse = log_rmse(net, train_dataset, train_labels)
    log_mse.append(preb_mse)
    print("epoch is ", epoch, " Mse loss is ", ls_train, " log_mse is ", preb_mse)
Training results
epoch is 0 Mse loss is 462676588544.0 log_mse is 4.277717590332031
epoch is 1 Mse loss is 450499411968.0 log_mse is 3.5420501232147217
epoch is 2 Mse loss is 440024502272.0 log_mse is 3.127312183380127
epoch is 3 Mse loss is 425981331456.0 log_mse is 2.835536241531372
epoch is 4 Mse loss is 414730248192.0 log_mse is 2.611187696456909
epoch is 5 Mse loss is 403009087488.0 log_mse is 2.431793689727783
epoch is 6 Mse loss is 394215604224.0 log_mse is 2.2810003757476807
epoch is 7 Mse loss is 380868847616.0 log_mse is 2.1509878635406494
epoch is 8 Mse loss is 374684563456.0 log_mse is 2.0380849838256836
epoch is 9 Mse loss is 364873803776.0 log_mse is 1.937126874923706
...........................
Plot the results
%matplotlib inline
import matplotlib.pyplot as plt
# Per-epoch summed MSE loss over the training batches
plt.figure()
plt.plot(train_ls)
plt.xlabel('epoch')
plt.ylabel('summed MSE loss')
plt.show()
%matplotlib inline
import matplotlib.pyplot as plt
# Per-epoch log RMSE on the full training set
plt.figure()
plt.plot(log_mse)
plt.xlabel('epoch')
plt.ylabel('log rmse')
plt.show()
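test_dataset is built above but never used; to round off the Kaggle workflow, here is a sketch of producing a submission file from the trained net (the Id/SalePrice column names follow the competition's sample submission format):

# Sketch only: predict on the test set and write a Kaggle submission CSV
net.eval()
with torch.no_grad():
    preds = net(test_dataset).numpy()
submission = pd.DataFrame({'Id': test_data['Id'],
                           'SalePrice': preds.reshape(-1)})
submission.to_csv('submission.csv', index=False)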