Deep Learning Notes -- Kaggle Competition: House Price Prediction

# Obtain and read the data
import torch
import torch.nn as nn
import pandas as pd            # data handling
import All_function as func    # custom helper module (listed at the end of this note)
torch.set_default_tensor_type(torch.FloatTensor)

train_data=pd.read_csv('Kaggle_house/train.csv')  # samples, features, label: 1460, 80, 1 (79 raw features)
test_data=pd.read_csv('Kaggle_house/test.csv')    # (1459, 80), 79 raw features

# Concatenate the training and test features
all_features=pd.concat((train_data.iloc[:,1:-1],test_data.iloc[:,1:]))

# Data preprocessing
# 1. Continuous features: standardize each numeric column, then fill missing values with 0 (the post-standardization mean)
numeric_features=all_features.dtypes[all_features.dtypes!='object'].index
all_features[numeric_features]=all_features[numeric_features].apply(lambda x:(x-x.mean())/(x.std()))
all_features[numeric_features]=all_features[numeric_features].fillna(0)

# 2. Discrete features: one-hot encode, with NaN treated as a category of its own
all_features=pd.get_dummies(all_features,dummy_na=True)   # all_features now has 331 columns
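# To make the dummy_na=True behaviour concrete, here is a tiny standalone
# illustration on a toy frame (not the competition data): each category gets
# its own indicator column, and missing values get a *_nan column.
toy = pd.DataFrame({'Heating': ['GasA', 'Wall', None]})
print(pd.get_dummies(toy, dummy_na=True))   # columns: Heating_GasA, Heating_Wall, Heating_nan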


# Training and test data (as tensors)
num_train=train_data.shape[0]
train_features=torch.tensor(all_features[:num_train].values,dtype=torch.float)
test_features=torch.tensor(all_features[num_train:].values,dtype=torch.float)
train_labels=torch.tensor(train_data.SalePrice.values,dtype=torch.float).view(-1,1) #(1460,1)
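# Quick shape check (assuming the standard Kaggle files described above):
# 331 one-hot columns, 1460 training rows and 1459 test rows.
print(train_features.shape)   # expected: torch.Size([1460, 331])
print(test_features.shape)    # expected: torch.Size([1459, 331])
print(train_labels.shape)     # expected: torch.Size([1460, 1])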


# Training the model
loss=torch.nn.MSELoss()
def get_net(feature_num):
    net=nn.Linear(feature_num,1)
    for param in net.parameters():
        nn.init.normal_(param,mean=0,std=0.01)
    return net
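# For reference, the model is a single linear layer. demo_net below is just a
# throwaway instance for inspecting the parameter shapes; k_fold and
# train_and_pred further down create their own nets.
demo_net = get_net(train_features.shape[1])
print(demo_net)                               # Linear(in_features=331, out_features=1, bias=True)
for name, param in demo_net.named_parameters():
    print(name, tuple(param.shape))           # weight (1, 331), bias (1,)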

# Log RMSE: the metric the competition uses to evaluate models
def log_rmse(net,features,labels):
    with torch.no_grad():     # no computation graph is built here
        # clamp values below 1 up to 1 so that taking the logarithm is numerically stable
        clipped_pred=torch.max(net(features),torch.tensor(1.0))
        rmse=torch.sqrt(2*loss(clipped_pred.log(),labels.log()).mean())
    return rmse.item()
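# A small standalone check (toy tensor, unrelated to the competition data) of
# why the clamp is needed: an untrained linear net can output values <= 0, and
# the logarithm of a non-positive number is nan or -inf.
bad = torch.tensor([[-3.0], [0.5], [2.0]])
print(bad.log())                                 # first entry is nan
print(torch.max(bad, torch.tensor(1.0)).log())   # all entries are finite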

# Training function (uses the Adam optimizer)
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []
    dataset = torch.utils.data.TensorDataset(train_features, train_labels)
    train_iter = torch.utils.data.DataLoader(dataset, batch_size, shuffle=True)
    # the Adam optimization algorithm is used here
    optimizer = torch.optim.Adam(params=net.parameters(), lr=learning_rate, weight_decay=weight_decay)
    net = net.float()
    for epoch in range(num_epochs):
        for X, y in train_iter:
            l = loss(net(X.float()), y.float())
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls

# K-fold cross-validation: choose the model design and tune hyperparameters
def get_k_fold_data(k, i, X, y):
    # Return the training and validation data needed for the i-th fold of cross-validation
    assert k > 1
    fold_size = X.shape[0] // k   # split the original dataset into k parts
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part          # the i-th part is the validation set; the other k-1 parts form the training set
        elif X_train is None:                          # first training part
            X_train, y_train = X_part, y_part
        else:
            X_train = torch.cat((X_train, X_part), dim=0)
            y_train = torch.cat((y_train, y_part), dim=0)
    return X_train, y_train, X_valid, y_valid          # return the training and validation sets
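# Sanity check of the splitting logic on toy data (12 samples, k=3): each fold
# should produce 8 training rows and 4 validation rows.
X_toy = torch.arange(12, dtype=torch.float).view(-1, 1)
y_toy = torch.arange(12, dtype=torch.float).view(-1, 1)
for i in range(3):
    X_tr, y_tr, X_va, y_va = get_k_fold_data(3, i, X_toy, y_toy)
    print(i, X_tr.shape, X_va.shape)   # torch.Size([8, 1]) torch.Size([4, 1])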

# Run training k times with k-fold cross-validation and return the average training and validation errors
def k_fold(k, X_train, y_train, num_epochs,
           learning_rate, weight_decay, batch_size):
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net(X_train.shape[1])
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        if i == 0:
            func.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse',
                          range(1, num_epochs + 1), valid_ls,
                         ['train', 'valid'])
        print('fold %d, train rmse %f, valid rmse %f' % (i, train_ls[-1], valid_ls[-1]))
    return train_l_sum / k, valid_l_sum / k

# Model selection
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)
print('%d-fold validation: avg train rmse %f, avg valid rmse %f' % (k, train_l, valid_l))
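# The lr=5, weight_decay=0 setting above was picked by hand. One way to explore
# alternatives is a small grid search that reuses k_fold; the candidate values
# below are illustrative only, not tuned results from this note, and each call
# runs a full 5-fold cross-validation, so this is slow.
best = None
for lr_cand in [1, 5, 10]:
    for wd_cand in [0, 1e-4, 1e-2]:
        _, valid_cand = k_fold(k, train_features, train_labels,
                               num_epochs, lr_cand, wd_cand, batch_size)
        if best is None or valid_cand < best[0]:
            best = (valid_cand, lr_cand, wd_cand)
print('best avg valid rmse %f with lr=%g, weight_decay=%g' % best)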



# Predict and package the results
# Retrain on the full training set and predict the house prices; if the training error is close to the one
# seen in k-fold cross-validation, the result is likely to be reliable
def train_and_pred(train_features, test_features, train_labels, test_data,
                   num_epochs, lr, weight_decay, batch_size):
    net = get_net(train_features.shape[1])
    train_ls, _ = train(net, train_features, train_labels, None, None,
                        num_epochs, lr, weight_decay, batch_size)
    func.semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse')
    print('train rmse %f' % train_ls[-1])
    preds = net(test_features).detach().numpy()            # detach() cuts the predictions off from the computation graph
    test_data['SalePrice'] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test_data['Id'], test_data['SalePrice']], axis=1)
    submission.to_csv('./submission.csv', index=False)

train_and_pred(train_features, test_features, train_labels, test_data, num_epochs, lr, weight_decay, batch_size)
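# Quick check of the generated submission file (path as written above): it
# should contain one Id/SalePrice row per test sample.
submission_check = pd.read_csv('./submission.csv')
print(submission_check.shape)             # expected: (1459, 2)
print(submission_check.columns.tolist())  # expected: ['Id', 'SalePrice']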

The All_function helper module:

import matplotlib.pyplot as plt
from IPython import display


def set_figsize(figsize=(3.5, 2.5)):
    use_svg_display()
    # set the figure size
    plt.rcParams['figure.figsize'] = figsize

def use_svg_display():
    # SVG: Scalable Vector Graphics
    """Use svg format to display plot in jupyter"""
    display.set_matplotlib_formats('svg')

def semilogy(x_vals, y_vals, x_label, y_label, x2_vals=None, y2_vals=None,
             legend=None, figsize=(3.5, 2.5)):
    set_figsize(figsize)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.semilogy(x_vals, y_vals)
    if x2_vals and y2_vals:
        plt.semilogy(x2_vals, y2_vals, linestyle=':')
        plt.legend(legend)
    plt.show()

Results:

[semilogy plot from fold 0: training and validation rmse vs. epochs]

fold 0, train rmse 0.239903, valid rmse 0.221246
fold 1, train rmse 0.229439, valid rmse 0.269979
fold 2, train rmse 0.231695, valid rmse 0.238354
fold 3, train rmse 0.237770, valid rmse 0.218455
fold 4, train rmse 0.230555, valid rmse 0.258525
5-fold validation: avg train rmse 0.233873, avg valid rmse 0.241312

train rmse 0.229990

[semilogy plot: training rmse vs. epochs on the full training set]
