# Kaggle 房屋价格预测 - CSDN博客
#
# 本文链接：https://blog.csdn.net/weixin_44563460/article/details/132100504
import hashlib
import os
import tarfile
import zipfile
import requests
import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l

# Registry of downloadable datasets; entries are added further down the file.
DATA_HUB = {}
# Common URL prefix that the registered dataset file names are appended to.
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'

def download(name,cache_dir=os.path.join('.','data')):
    """Download the file registered as ``name`` in DATA_HUB and return its path.

    A copy already present in ``cache_dir`` is reused when its SHA-1 digest
    matches the registered one; otherwise the file is (re)downloaded.

    Args:
        name: Key into DATA_HUB; the value is a ``(url, sha1_hash)`` pair.
        cache_dir: Directory the file is stored in; created when missing.

    Returns:
        Local path: ``cache_dir`` joined with the last path segment of the URL.

    Raises:
        AssertionError: If ``name`` is not registered in DATA_HUB.
        requests.HTTPError: If the server answers with an error status.
    """
    assert name in DATA_HUB, f"{name}不存在于{DATA_HUB}"
    url,sha1_hash = DATA_HUB[name]
    basename = url.split('/')[-1]
    print(basename)
    os.makedirs(cache_dir,exist_ok=True)  # creates intermediate directories too
    fname = os.path.join(cache_dir,basename)
    print(fname)
    if os.path.exists(fname):
        # Hash the cached file in 1 MiB chunks so it is never read in one piece.
        sha1 = hashlib.sha1()
        with open(fname,'rb') as f:
            for data in iter(lambda: f.read(1048576),b''):
                sha1.update(data)
        if sha1.hexdigest()==sha1_hash:
            return fname
    print(f'正在从{url}下载{fname}...')
    # The context manager releases the connection even if writing fails.
    with requests.get(url,stream=True,verify=True) as r:
        # Fail before touching the file so an HTTP error page is never saved
        # under the dataset's name.
        r.raise_for_status()
        with open(fname,'wb') as f:
            # Actually stream the body instead of materialising r.content.
            for chunk in r.iter_content(chunk_size=1048576):
                f.write(chunk)
    return fname

# Never called in this script.
def download_extract(name,folder=None):
    """Download a zip/tar archive registered in DATA_HUB and unpack it in place.

    Args:
        name: Dataset key, passed straight to ``download``.
        folder: Optional sub-directory name to return instead of the default.

    Returns:
        ``<archive dir>/<folder>`` when ``folder`` is given, otherwise the
        archive path with its last extension removed.

    Raises:
        ValueError: If the file extension is not ``.zip``, ``.tar`` or ``.gz``.
    """
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir,ext = os.path.splitext(fname)
    if ext=='.zip':
        opener = zipfile.ZipFile
    elif ext in ('.tar','.gz'):
        opener = tarfile.open
    else:
        # An explicit raise survives ``python -O``; a stripped ``assert False``
        # would leave the archive handle undefined further down.
        raise ValueError(f'Only zip/tar archives can be extracted, got {fname!r}')
    # NOTE(review): extractall trusts the member paths stored in the archive;
    # only use this with archives from a trusted source.
    with opener(fname,'r') as fp:  # close the archive handle when done
        fp.extractall(base_dir)
    return os.path.join(base_dir,folder) if folder else data_dir

# Never called in this script.
def download_all():
    """Download every dataset currently registered in DATA_HUB."""
    for dataset_key in DATA_HUB:
        download(dataset_key)

# Register the two house-price CSV files as (url, sha1 hex digest) pairs.
DATA_HUB.update({
    'kaggle_house_train': (
        DATA_URL + 'kaggle_house_pred_train.csv',
        '585e9cc93e70b39160e7921475f9bcd7d31219ce',
    ),
    'kaggle_house_test': (
        DATA_URL + 'kaggle_house_pred_test.csv',
        'fa19780a7b011d9b009e8bff8e99922a8ee2eb90',
    ),
})


# Download (or reuse the cached copy of) each CSV, training set first, and
# parse it into a DataFrame.
train_data,test_data = (
    pd.read_csv(download(hub_key))
    for hub_key in ('kaggle_house_train','kaggle_house_test')
)

# Report the (rows, columns) of both frames.
print("train_data.shape",train_data.shape)
print("test_data.shape",test_data.shape)

# print(train_data.iloc[0:4,[0,1,2,3,-3,-2,-1]])  # 提取0-3行中0,1,2,3,-3,-2,-1列的元素

# Drop the first column (the id) from both frames, and additionally the last
# column of the training frame (presumably the SalePrice label - confirm), then
# stack them so the same preprocessing is applied to train and test rows.
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

# Select columns by numeric/bool kind instead of ``dtype != 'object'`` so that
# string-like columns not stored as ``object`` are never standardised.
numeric_features = all_features.select_dtypes(include=['number','bool']).columns
print("numeric_features",numeric_features)
# Standardise every numeric column to zero mean and unit standard deviation.
all_features[numeric_features] = all_features[numeric_features].apply(lambda x:(x-x.mean())/(x.std()))
# After standardising, each column mean is 0, so filling missing values with 0
# replaces them by the column mean.
all_features[numeric_features] = all_features[numeric_features].fillna(0)

# One-hot encode the remaining columns, treating NA as its own category.
# dtype=float keeps the indicator columns numeric: with bool indicators the
# mixed frame turns into an object array that torch.tensor cannot convert.
all_features = pd.get_dummies(all_features,dummy_na=True,dtype=float)
print(all_features.shape)

# The first n_train rows of the stacked frame are the training rows, the rest
# are the test rows; both become float32 tensors.
n_train = len(train_data)
train_features = torch.tensor(all_features.iloc[:n_train].to_numpy(),dtype=torch.float32)
test_features = torch.tensor(all_features.iloc[n_train:].to_numpy(),dtype=torch.float32)
# Labels are reshaped into a single column: one row per training example.
train_labels = torch.tensor(train_data['SalePrice'].to_numpy().reshape(-1,1),dtype=torch.float32)

# Loss function: mean squared error.
loss = nn.MSELoss()

# Number of input features = number of columns of the training matrix.
in_features = train_features.size(1)
def get_net():
    """Return a new model: one linear layer from ``in_features`` inputs to 1 output."""
    return nn.Sequential(nn.Linear(in_features,1))

def log_rmse(net,features,labels):
    """Return the root of ``loss`` between log-predictions and log-labels.

    Predictions are clamped to at least 1 before the logarithm is taken, so
    the log never sees zero or negative values. The result is a Python float.
    """
    bounded = net(features).clamp(min=1,max=float('inf'))
    squared_log_error = loss(bounded.log(),labels.log())
    return squared_log_error.sqrt().item()

def train(net,train_features,train_labels,test_features,test_labels,num_epochs,learning_rate,weight_decay,batch_size):
    """Fit ``net`` with Adam on mini-batches and record log-RMSE per epoch.

    The parameters are updated using ``loss`` on the raw predictions; log-RMSE
    is only computed after each epoch for reporting.

    Returns:
        ``(train_ls, test_ls)``: per-epoch log-RMSE on the training data and on
        the test data. ``test_ls`` stays empty when ``test_labels`` is None.
    """
    batches = d2l.load_array((train_features,train_labels),batch_size)
    adam = torch.optim.Adam(net.parameters(),lr=learning_rate,weight_decay=weight_decay)
    has_test_labels = test_labels is not None
    train_ls,test_ls = [],[]
    for _ in range(num_epochs):
        for inputs,targets in batches:
            adam.zero_grad()
            loss(net(inputs),targets).backward()
            adam.step()
        # Evaluate on the full sets once the epoch's updates are done.
        train_ls.append(log_rmse(net,train_features,train_labels))
        if has_test_labels:
            test_ls.append(log_rmse(net,test_features,test_labels))
    return train_ls,test_ls


def get_k_fold_data(k,i,x,y):
    """Split ``(x, y)`` into ``k`` contiguous folds and hold out fold ``i``.

    Every fold has ``x.shape[0] // k`` rows, except the last one, which also
    takes the remaining ``x.shape[0] % k`` rows so that no sample is dropped.

    Args:
        k: Number of folds; must be greater than 1.
        i: Index of the validation fold, ``0 <= i < k``.
        x: Feature tensor, indexed as ``x[rows, :]``.
        y: Label tensor with the same number of rows as ``x``.

    Returns:
        ``(x_train, y_train, x_valid, y_valid)`` where the training tensors are
        all folds except fold ``i``, concatenated in their original order.

    Raises:
        AssertionError: If ``k <= 1``.
        ValueError: If ``i`` is not a valid fold index.
    """
    assert k > 1
    if not 0 <= i < k:
        raise ValueError(f'fold index i={i} must satisfy 0 <= i < k={k}')
    n_rows = x.shape[0]
    fold_size = n_rows // k
    x_parts,y_parts = [],[]
    x_valid,y_valid = None,None
    for j in range(k):
        start = j*fold_size
        # The last fold runs to the end so the n_rows % k leftover rows are used.
        stop = n_rows if j==k-1 else start+fold_size
        idx = slice(start,stop)
        x_part,y_part = x[idx,:],y[idx]
        if j==i:
            # Hold this fold out for validation.
            x_valid,y_valid = x_part,y_part
        else:
            x_parts.append(x_part)
            y_parts.append(y_part)
    # Concatenate once instead of growing the training tensors fold by fold.
    x_train = torch.cat(x_parts,0)
    y_train = torch.cat(y_parts,0)
    return x_train,y_train,x_valid,y_valid

def k_fold(k,x_train,y_train,num_epochs,learning_rate,weight_decay,batch_size):
    """Run k-fold cross-validation and return the mean final log-RMSE values.

    A fresh network is trained for every fold. The learning curves of the
    first fold are plotted, and each fold's final training and validation
    log-RMSE are printed.

    Returns:
        ``(mean_train, mean_valid)``: last-epoch log-RMSE averaged over ``k`` folds.
    """
    train_total = valid_total = 0
    epoch_axis = list(range(1,num_epochs+1))
    for fold in range(k):
        fold_data = get_k_fold_data(k,fold,x_train,y_train)
        train_ls,valid_ls = train(get_net(),*fold_data,num_epochs,learning_rate,weight_decay,batch_size)
        # The last list entry is the value after the final epoch.
        train_total += train_ls[-1]
        valid_total += valid_ls[-1]
        if fold==0:
            d2l.plot(epoch_axis,[train_ls,valid_ls],xlabel='epoch',ylabel='rmse',xlim=[1,num_epochs],legend=['train','valid'],yscale='log')
            # Explicit show() so the figure appears when run as a plain script.
            d2l.plt.show()
        print(f'折{fold+1},训练log rmse{float(train_ls[-1])} 验证log rmse{float(valid_ls[-1])}')
    return train_total/k,valid_total/k

# Hyper-parameters: folds, epochs, learning rate, weight decay, batch size.
k,num_epochs,lr,weight_decay,batch_size = 5,100,5,0,64
train_l,valid_l = k_fold(k,train_features,train_labels,num_epochs,lr,weight_decay,batch_size)
# The ``:f`` format spec has to sit inside the braces; outside of them it is
# printed literally after the unformatted number.
print(f'{k}折交叉验证:平均训练log rmse{float(train_l):f},' f'平均验证log rmse{float(valid_l):f}')

def train_and_pred(train_features,test_features,train_labels,test_data,num_epochs,lr,weight_decay,batch_size):
    """Train on the full training set, predict the test set, write submission.csv.

    Plots the per-epoch training log-RMSE, adds the predictions to
    ``test_data`` as a ``SalePrice`` column (the frame is modified in place)
    and writes the ``Id`` and ``SalePrice`` columns to ``submission.csv``.

    Args:
        train_features, train_labels: Tensors used for training.
        test_features: Tensor the trained network is applied to.
        test_data: DataFrame with an ``Id`` column, one row per test example.
        num_epochs, lr, weight_decay, batch_size: Forwarded to ``train``.
    """
    net = get_net()
    # No held-out data here, so the validation arguments are None.
    train_ls,_ = train(net,train_features,train_labels,None,None,num_epochs,lr,weight_decay,batch_size)
    d2l.plot(np.arange(1,num_epochs+1),[train_ls],xlabel='epoch',ylabel='log rmse',xlim=[1,num_epochs],yscale='log')
    d2l.plt.show()
    preds = net(test_features).detach().numpy()
    # Assign the flat array positionally. Wrapping it in a pd.Series would
    # align on index labels and produce NaN for any test_data whose index is
    # not the default 0..n-1 range.
    test_data['SalePrice'] = preds.reshape(-1)
    submission = pd.concat([test_data['Id'],test_data['SalePrice']],axis=1)
    submission.to_csv('submission.csv',index=False)

# Final run: train on all training data and write the submission file.
train_and_pred(
    train_features=train_features,
    test_features=test_features,
    train_labels=train_labels,
    test_data=test_data,
    num_epochs=num_epochs,
    lr=lr,
    weight_decay=weight_decay,
    batch_size=batch_size,
)
# 在这里插入图片描述