import hashlib
import os
import tarfile
import zipfile
import requests
import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l
# Registry of downloadable datasets: name -> (url, expected SHA-1 hex digest).
DATA_HUB = {}
# Base URL that the dataset file names are appended to.
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'
def download(name, cache_dir=os.path.join('.', 'data')):
    """Fetch a dataset registered in DATA_HUB and return its local path.

    The file is stored in ``cache_dir`` under the last path component of its
    URL. If a file with that name already exists and its SHA-1 digest equals
    the registered one, it is reused and nothing is downloaded.

    Args:
        name: Key of the dataset in DATA_HUB.
        cache_dir: Directory that holds the downloaded files; created if
            missing. Defaults to ``./data``.

    Returns:
        Path of the local file.

    Raises:
        AssertionError: If ``name`` is not registered in DATA_HUB.
        requests.HTTPError: If the server answers with an error status.
    """
    # Raised explicitly (instead of via `assert`) so the check survives `python -O`.
    if name not in DATA_HUB:
        raise AssertionError(f"{name}不存在于{DATA_HUB}")
    url, sha1_hash = DATA_HUB[name]
    file_name = url.split('/')[-1]
    print(file_name)
    os.makedirs(cache_dir, exist_ok=True)  # creates intermediate directories too
    fname = os.path.join(cache_dir, file_name)
    print(fname)
    if os.path.exists(fname):
        # Hash the cached file in 1 MiB chunks to decide whether it can be reused.
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            for chunk in iter(lambda: f.read(1048576), b''):
                sha1.update(chunk)
        if sha1.hexdigest() == sha1_hash:
            return fname
    print(f'正在从{url}下载{fname}...')
    with requests.get(url, stream=True, verify=True) as r:
        # Fail before touching the cache file so an HTTP error page is never
        # stored as if it were the dataset.
        r.raise_for_status()
        with open(fname, 'wb') as f:
            # Stream the body to disk instead of buffering it all in memory.
            for chunk in r.iter_content(chunk_size=1048576):
                f.write(chunk)
    return fname
# 没有用到过
def download_extract(name, folder=None):
    """Download an archive registered in DATA_HUB and unpack it next to itself.

    Args:
        name: Key of the archive in DATA_HUB.
        folder: Optional sub-directory name (relative to the archive's
            directory) to return instead of the default path.

    Returns:
        ``<archive dir>/<folder>`` when ``folder`` is given, otherwise the
        archive path with its last extension removed.

    Raises:
        AssertionError: If the archive extension is not .zip, .tar or .gz.
    """
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        open_archive = zipfile.ZipFile
    elif ext in ('.tar', '.gz'):
        open_archive = tarfile.open
    else:
        # Raised explicitly so the check is not stripped under `python -O`.
        raise AssertionError(f'unsupported archive extension: {ext!r}')
    # NOTE(review): extractall() trusts the member paths stored in the archive;
    # only use this with archives from a trusted source.
    with open_archive(fname, 'r') as fp:  # closes the archive handle afterwards
        fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir
# 没有用到过
def download_all():
    """Download every dataset currently registered in DATA_HUB."""
    for dataset_name in DATA_HUB:
        download(dataset_name)
# Register the two Kaggle house-price CSV files with their expected SHA-1 digests.
DATA_HUB['kaggle_house_train'] = (
    DATA_URL + 'kaggle_house_pred_train.csv',
    '585e9cc93e70b39160e7921475f9bcd7d31219ce',
)
DATA_HUB['kaggle_house_test'] = (
    DATA_URL + 'kaggle_house_pred_test.csv',
    'fa19780a7b011d9b009e8bff8e99922a8ee2eb90',
)
train_data = pd.read_csv(download('kaggle_house_train'))
test_data = pd.read_csv(download('kaggle_house_test'))
print("train_data.shape", train_data.shape)
print("test_data.shape", test_data.shape)
# Stack train and test features so both get identical preprocessing. The first
# column (the id) is dropped from both tables, and the last column of the
# training table (presumably the SalePrice label) is dropped as well.
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))
# Select numeric columns by kind rather than by "not object", so that any other
# non-numeric dtype is never standardized by mistake.
numeric_features = all_features.select_dtypes(include='number').columns
print("numeric_features", numeric_features)
# Standardize each numeric column to zero mean and unit standard deviation.
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std()))
# After standardization the column mean is 0, so missing values become 0.
all_features[numeric_features] = all_features[numeric_features].fillna(0)
# One-hot encode the remaining columns, treating NA as its own category.
# dtype=float keeps the frame purely numeric: recent pandas versions otherwise
# emit bool dummies, `.values` then becomes an object array and torch.tensor()
# below raises a TypeError.
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)
print(all_features.shape)
n_train = train_data.shape[0]
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float32)
train_labels = torch.tensor(train_data.SalePrice.values.reshape(-1, 1), dtype=torch.float32)
# Mean squared error is the training loss.
loss = nn.MSELoss()
# Number of columns in the feature matrix, i.e. the model input width.
in_features = train_features.shape[1]
def get_net():
    """Return a new model: one linear layer mapping in_features inputs to 1 output."""
    linear_layer = nn.Linear(in_features, 1)
    return nn.Sequential(linear_layer)
def log_rmse(net, features, labels):
    """Return the root-mean-squared error between log-predictions and log-labels.

    Args:
        net: Model called as ``net(features)``.
        features: Input tensor passed to the model.
        labels: Target tensor; its logarithm is taken, so values must be positive.

    Returns:
        The metric as a Python float.
    """
    # Pure evaluation: no gradients are needed, so skip building the autograd graph.
    with torch.no_grad():
        # Clamp predictions to [1, inf) so the logarithm is defined and non-negative.
        clipped_preds = torch.clamp(net(features), 1, float('inf'))
        rmse = torch.sqrt(loss(torch.log(clipped_preds), torch.log(labels)))
    return rmse.item()
def train(net, train_features, train_labels, test_features, test_labels, num_epochs, learning_rate, weight_decay, batch_size):
    """Train ``net`` with Adam and record log_rmse after every epoch.

    Args:
        net: Model to optimize in place.
        train_features, train_labels: Training tensors.
        test_features, test_labels: Evaluation tensors; evaluation is skipped
            when ``test_labels`` is None.
        num_epochs: Number of passes over the training iterator.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight decay.
        batch_size: Batch size handed to ``d2l.load_array``.

    Returns:
        ``(train_ls, test_ls)``: per-epoch log_rmse lists; ``test_ls`` stays
        empty when ``test_labels`` is None.
    """
    train_history, test_history = [], []
    # presumably yields (features, labels) mini-batches; see d2l.load_array
    batches = d2l.load_array((train_features, train_labels), batch_size)
    optimizer = torch.optim.Adam(
        net.parameters(),
        lr=learning_rate,
        weight_decay=weight_decay,
    )
    evaluate_test = test_labels is not None
    for _ in range(num_epochs):
        for batch_x, batch_y in batches:
            optimizer.zero_grad()
            batch_loss = loss(net(batch_x), batch_y)
            batch_loss.backward()
            optimizer.step()
        # Record the metric once per epoch, after all batches were processed.
        train_history.append(log_rmse(net, train_features, train_labels))
        if evaluate_test:
            test_history.append(log_rmse(net, test_features, test_labels))
    return train_history, test_history
def get_k_fold_data(k, i, x, y):
    """Split ``x``/``y`` into ``k`` contiguous folds and hold out fold ``i``.

    Every fold has ``x.shape[0] // k`` rows, except the last one, which also
    takes the remaining ``x.shape[0] % k`` rows so that no sample is silently
    discarded.

    Args:
        k: Number of folds; must be greater than 1.
        i: Index of the validation fold, ``0 <= i < k``.
        x: Feature tensor, indexed as ``x[rows, :]``.
        y: Label tensor, indexed along its first dimension.

    Returns:
        ``(x_train, y_train, x_valid, y_valid)`` where the training tensors
        are the concatenation (along dim 0) of all folds except fold ``i``.

    Raises:
        AssertionError: If ``k <= 1`` or ``i`` is outside ``range(k)``.
    """
    # Raised explicitly so the checks survive `python -O`.
    if not k > 1:
        raise AssertionError('k must be greater than 1')
    if not 0 <= i < k:
        raise AssertionError(f'fold index {i} is outside range({k})')
    n_samples = x.shape[0]
    fold_size = n_samples // k
    x_valid, y_valid = None, None
    x_parts, y_parts = [], []
    for j in range(k):
        start = j * fold_size
        # The last fold absorbs the remainder rows.
        stop = n_samples if j == k - 1 else start + fold_size
        idx = slice(start, stop)
        x_part, y_part = x[idx, :], y[idx]
        if j == i:
            # Hold this fold out for validation.
            x_valid, y_valid = x_part, y_part
        else:
            x_parts.append(x_part)
            y_parts.append(y_part)
    # Concatenate once instead of re-copying the growing tensor for every fold.
    x_train = torch.cat(x_parts, 0)
    y_train = torch.cat(y_parts, 0)
    return x_train, y_train, x_valid, y_valid
def k_fold(k, x_train, y_train, num_epochs, learning_rate, weight_decay, batch_size):
    """Run k-fold cross-validation with a freshly built model per fold.

    For each fold the data is split with ``get_k_fold_data``, a new model from
    ``get_net`` is trained with ``train``, and the last-epoch train/validation
    metrics are collected. The learning curves of the first fold are plotted.

    Returns:
        ``(mean_train, mean_valid)``: last-epoch metrics averaged over the folds.
    """
    last_train_scores, last_valid_scores = [], []
    for fold in range(k):
        fold_x, fold_y, valid_x, valid_y = get_k_fold_data(k, fold, x_train, y_train)
        model = get_net()
        train_curve, valid_curve = train(
            model, fold_x, fold_y, valid_x, valid_y,
            num_epochs, learning_rate, weight_decay, batch_size,
        )
        # The last list entry is the metric after the final epoch.
        last_train_scores.append(train_curve[-1])
        last_valid_scores.append(valid_curve[-1])
        if fold == 0:
            epochs = list(range(1, num_epochs + 1))
            d2l.plot(
                epochs,
                [train_curve, valid_curve],
                xlabel='epoch',
                ylabel='rmse',
                xlim=[1, num_epochs],
                legend=['train', 'valid'],
                yscale='log',
            )
            # Explicit show() so the d2l.plot figure is actually displayed.
            d2l.plt.show()
        print(f'折{fold + 1},训练log rmse{float(train_curve[-1])} 验证log rmse{float(valid_curve[-1])}')
    return sum(last_train_scores) / k, sum(last_valid_scores) / k
# Cross-validation settings: folds, epochs, learning rate, weight decay, batch size.
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)
# The ':f' format spec must sit inside the braces; outside them it is printed
# literally after the number.
print(f'{k}折交叉验证:平均训练log rmse{float(train_l):f},' f'平均验证log rmse{float(valid_l):f}')
def train_and_pred(train_features, test_features, train_labels, test_data, num_epochs, lr, weight_decay, batch_size):
    """Train on the full training set, predict the test set and write submission.csv.

    Args:
        train_features, train_labels: Training tensors.
        test_features: Test tensor fed to the trained model.
        test_data: DataFrame with an ``Id`` column; a ``SalePrice`` column
            holding the predictions is added to it in place.
        num_epochs, lr, weight_decay, batch_size: Forwarded to ``train``.

    Side effects:
        Plots the training curve, mutates ``test_data`` and writes
        ``submission.csv`` (columns ``Id`` and ``SalePrice``) to the current
        working directory.
    """
    net = get_net()
    # No held-out data here, so the test arguments of train() are None.
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    d2l.plot(np.arange(1, num_epochs + 1), [train_ls], xlabel='epoch',
             ylabel='log rmse', xlim=[1, num_epochs], yscale='log')
    d2l.plt.show()
    preds = net(test_features).detach().numpy()
    # Assign the flat array positionally. Wrapping it in a pd.Series would
    # align on the index and yield NaN whenever test_data is not indexed 0..n-1.
    test_data['SalePrice'] = preds.reshape(-1)
    submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
    submission.to_csv('submission.csv', index=False)
# Train on all training rows with the settings above and write the predictions file.
train_and_pred(
    train_features,
    test_features,
    train_labels,
    test_data,
    num_epochs,
    lr,
    weight_decay,
    batch_size,
)