# PyTorch: Kaggle房价预测 (Kaggle house price prediction)

#-*- coding:utf-8 -*-

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from IPython import display
import sys
import matplotlib.pyplot as plt

# Data preparation
# Load the training and test CSV files with pandas.
train_data = pd.read_csv('../data/kaggle_house_pred_train.csv')
test_data = pd.read_csv('../data/kaggle_house_pred_test.csv')

# Stack the feature columns of both sets so they receive identical
# preprocessing. The first column of each frame is skipped (presumably an
# Id column - verify against the CSV) and the last training column, which is
# read below as SalePrice, is excluded because it is the label.
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))

# Standardize every non-object column to zero mean and unit standard
# deviation, then replace missing values with 0 (the mean after scaling).
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std()))
all_features[numeric_features] = all_features[numeric_features].fillna(0)

# One-hot encode the remaining columns, treating NaN as its own category.
# dtype=float keeps the whole frame numeric: recent pandas versions emit bool
# dummy columns by default, which turns `.values` of the mixed frame into an
# object array that torch.tensor cannot convert.
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=float)
n_train = train_data.shape[0]
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float)
# view((-1, 1)) reshapes the labels into a single column.
train_labels = torch.tensor(train_data.SalePrice.values).view((-1, 1))
dataset = torch.utils.data.TensorDataset(train_features, train_labels)

# Loss function: mean squared error averaged over all elements.
loss = nn.MSELoss()


def get_net(feature_num):
    """Build a one-layer linear regression model.

    Args:
        feature_num: number of input features per sample.

    Returns:
        An ``nn.Linear`` mapping ``feature_num`` inputs to one output, with
        every parameter (weight and bias) re-initialized from a normal
        distribution with mean 0 and standard deviation 0.01.
    """
    model = nn.Linear(in_features=feature_num, out_features=1)
    # parameters() yields all trainable tensors of the layer.
    for tensor in model.parameters():
        nn.init.normal_(tensor, mean=0, std=0.01)
    return model

# Log root-mean-squared error, used to evaluate the model.
def log_rmse(net, features, labels):
    """Return the RMSE between log-predictions and log-labels.

    Args:
        net: callable model; ``net(features)`` must return a tensor of
            predictions.
        features: input tensor passed to ``net``.
        labels: target tensor; it is cast to float before the logarithm, so
            integer labels are accepted.

    Returns:
        ``sqrt(mean((log(max(pred, 1)) - log(label)) ** 2))`` as a Python
        float.
    """
    # Evaluation only: no_grad avoids recording the autograd graph, which
    # saves memory because no backward pass follows.
    with torch.no_grad():
        # Clamp predictions to at least 1 so the logarithm is finite and
        # non-negative.
        clipped_preds = torch.clamp(net(features), min=1.0)
        # mse_loss already averages the squared errors, so the square root is
        # taken directly. A factor of 2 would only be correct for a
        # half-squared-error loss and would scale the result by sqrt(2).
        mse = F.mse_loss(clipped_preds.float().log(), labels.float().log())
        rmse = torch.sqrt(mse)
    return rmse.item()  # convert the 0-dim tensor to a Python scalar

# Train the model with the Adam optimizer.
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    """Fit ``net`` on the training data and record log-RMSE after each epoch.

    Args:
        net: model to optimize; it is updated in place.
        train_features, train_labels: training tensors, paired along their
            first dimension.
        test_features, test_labels: evaluation tensors; evaluation is skipped
            when ``test_labels`` is None.
        num_epochs: number of passes over the training data.
        learning_rate: Adam learning rate.
        weight_decay: Adam weight-decay coefficient.
        batch_size: mini-batch size for the data loader.

    Returns:
        ``(train_ls, test_ls)``: lists with one ``log_rmse`` value per epoch.
        The second list is empty when ``test_labels`` is None.
    """
    train_history = []
    test_history = []
    # TensorDataset pairs samples by indexing both tensors along dim 0;
    # DataLoader turns it into a shuffled mini-batch iterator.
    loader = torch.utils.data.DataLoader(
        torch.utils.data.TensorDataset(train_features, train_labels),
        batch_size,
        shuffle=True,
    )
    optimizer = torch.optim.Adam(
        params=net.parameters(),
        lr=learning_rate,
        weight_decay=weight_decay,
    )
    net = net.float()
    evaluate_on_test = test_labels is not None
    for _ in range(num_epochs):
        for batch_X, batch_y in loader:
            optimizer.zero_grad()  # clear gradients left from the previous step
            batch_loss = loss(net(batch_X.float()), batch_y.float())
            batch_loss.backward()  # back-propagate to compute gradients
            optimizer.step()  # apply the Adam parameter update
        train_history.append(log_rmse(net, train_features, train_labels))
        if evaluate_on_test:
            test_history.append(log_rmse(net, test_features, test_labels))
    return train_history, test_history

# K-fold cross-validation
def get_K_fold_data(k, i, X, y):
    """Return the training and validation split for fold ``i`` of ``k``.

    The first ``k * (len(X) // k)`` samples are divided into ``k`` contiguous
    folds of equal size; any remaining trailing samples are not used.

    Args:
        k: number of folds; must be greater than 1.
        i: zero-based index of the fold used for validation.
        X: feature tensor, indexed along its first dimension.
        y: label tensor, indexed along its first dimension.

    Returns:
        ``(x_train, y_train, X_valid, y_valid)`` where the validation tensors
        are fold ``i`` and the training tensors are the other folds
        concatenated in their original order.

    Raises:
        AssertionError: if ``k`` is not greater than 1.
        ValueError: if ``i`` is outside ``[0, k)``.
    """
    assert k > 1
    if not 0 <= i < k:
        # Without this check no fold would match and the validation tensors
        # would never be assigned.
        raise ValueError('fold index i must satisfy 0 <= i < k, got i=%r, k=%r' % (i, k))
    fold_size = X.shape[0] // k
    start, stop = i * fold_size, (i + 1) * fold_size
    used = k * fold_size  # samples beyond this index belong to no fold
    X_valid, y_valid = X[start:stop], y[start:stop]
    # The training data is everything before and after the validation fold.
    x_train = torch.cat((X[:start], X[stop:used]), dim=0)
    y_train = torch.cat((y[:start], y[stop:used]), dim=0)
    return x_train, y_train, X_valid, y_valid

def use_svg_diplay():
    """Select SVG as the matplotlib output format via IPython's display module."""
    image_format = 'svg'
    display.set_matplotlib_formats(image_format)
def set_figsize(figsize=(3.5, 2.5)):
    """Switch to SVG output and set matplotlib's default figure size.

    Args:
        figsize: value stored in ``rcParams['figure.figsize']``.
    """
    use_svg_diplay()
    rc_params = plt.rcParams
    rc_params['figure.figsize'] = figsize

def semilogy(x_vals, y_vals, x_label, y_label, x2_vals=None, y2_vals=None,
             legend=None, figsize=(3.5, 2.5)):
    """Plot one or two curves on a logarithmic y-axis and show the figure.

    Args:
        x_vals, y_vals: coordinates of the first curve.
        x_label, y_label: axis labels.
        x2_vals, y2_vals: optional coordinates of a second curve, drawn with
            a dotted line. Both must be given for the curve to be drawn.
        legend: legend entries; only applied when the second curve is drawn.
        figsize: figure size passed to ``set_figsize``.
    """
    set_figsize(figsize)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.semilogy(x_vals, y_vals)
    # Compare against None rather than relying on truthiness: the truth value
    # of a multi-element numpy array or tensor is ambiguous and raises.
    if x2_vals is not None and y2_vals is not None:
        plt.semilogy(x2_vals, y2_vals, linestyle=':')
        plt.legend(legend)
    plt.show()

def k_fold(k, X_train, y_train, num_epochs, learing_rate, weight_decay, batch_size):
    """Run k-fold cross-validation and return the average final errors.

    A fresh model from ``get_net`` is trained for every fold. The learning
    curves of the first fold are plotted, and the final-epoch errors of each
    fold are printed.

    Args:
        k: number of folds.
        X_train, y_train: full training features and labels to be split.
        num_epochs: epochs per fold.
        learing_rate: learning rate forwarded to ``train`` (the parameter
            name keeps its original spelling for caller compatibility).
        weight_decay: weight-decay coefficient forwarded to ``train``.
        batch_size: mini-batch size forwarded to ``train``.

    Returns:
        ``(avg_train, avg_valid)``: the final-epoch training and validation
        errors averaged over the ``k`` folds.
    """
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        # data is (x_train, y_train, X_valid, y_valid) for fold i.
        data = get_K_fold_data(k, i, X_train, y_train)
        net = get_net(X_train.shape[1])
        train_ls, valid_ls = train(net, *data, num_epochs, learing_rate, weight_decay, batch_size)
        # Only the error after the last epoch counts toward the average.
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        if i == 0:
            semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse',
                     range(1, num_epochs + 1), valid_ls, ['train', 'valid'])
        print('fold %d, train rmse %f, valid rmse %f' % (i, train_ls[-1], valid_ls[-1]))
    return train_l_sum / k, valid_l_sum / k
# Hyper-parameters for the cross-validation run.
k = 5
num_epochs = 100
lr = 5
weight_decay = 0
batch_size = 64

# Run the k-fold validation and report the averaged final errors.
train_l, valid_l = k_fold(
    k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)
summary = '%d-fold validation: avg train rmse %f, avg valid rmse %f' % (k, train_l, valid_l)
print(summary)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值