# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from IPython import display
import sys
import matplotlib.pyplot as plt
# Data preprocessing
# Read the CSV files with pandas
train_data = pd.read_csv('../data/kaggle_house_pred_train.csv')
test_data = pd.read_csv('../data/kaggle_house_pred_test.csv')
# Concatenate the training and test features (dropping Id and the SalePrice label)
# so that standardization and one-hot encoding are applied consistently to both sets
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
# Standardize each numeric column to zero mean and unit variance
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / x.std())
# After standardization the column means are 0, so missing values can be filled with 0
all_features[numeric_features] = all_features[numeric_features].fillna(0)
# One-hot encode categorical features; dummy_na=True also creates an indicator column for NaN
all_features = pd.get_dummies(all_features, dummy_na=True)
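# Illustrative sketch (not part of the pipeline): how the standardization and
# get_dummies(dummy_na=True) steps above act on a tiny made-up DataFrame with
# hypothetical columns 'Area' (numeric) and 'Type' (categorical).
_demo = pd.DataFrame({'Area': [50.0, 100.0, 150.0], 'Type': ['A', 'B', None]})
_demo['Area'] = (_demo['Area'] - _demo['Area'].mean()) / _demo['Area'].std()
_demo = pd.get_dummies(_demo, dummy_na=True)  # adds Type_A, Type_B, Type_nan columns
# print(_demo)  # uncomment to inspect the encoded toy frame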
n_train = train_data.shape[0]
# Newer pandas returns bool columns from get_dummies, so cast explicitly to float32
train_features = torch.tensor(all_features[:n_train].values.astype(np.float32))
test_features = torch.tensor(all_features[n_train:].values.astype(np.float32))
train_labels = torch.tensor(train_data.SalePrice.values, dtype=torch.float).view(-1, 1)  # view is the tensor counterpart of reshape
# Define the loss function
loss = nn.MSELoss()  # mean squared error loss (already averaged over the batch)

def get_net(feature_num):
    net = nn.Linear(feature_num, 1)  # a single fully connected layer, i.e. linear regression
    for param in net.parameters():  # the trainable parameters live in parameters()
        nn.init.normal_(param, mean=0, std=0.01)  # initialize from a small Gaussian
    return net
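# Quick sanity check (illustrative only, with made-up shapes): a net built for
# 3 features maps a batch of shape (n, 3) to predictions of shape (n, 1).
_toy_net = get_net(3)
assert _toy_net(torch.randn(4, 3)).shape == (4, 1)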
# Log RMSE, used to evaluate the model
def log_rmse(net, features, labels):
    # torch.no_grad() disables autograd bookkeeping; no backpropagation happens
    # during evaluation, so this saves memory
    with torch.no_grad():
        # Clamp predictions to at least 1 so taking the logarithm is stable
        clipped_preds = torch.max(net(features), torch.tensor(1.0))
        # nn.MSELoss already averages, so no extra factor or .mean() is needed
        rmse = torch.sqrt(loss(clipped_preds.log(), labels.log()))
    return rmse.item()  # convert the 0-dim tensor to a Python scalar
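# The metric above is sqrt(mean((log(y_hat) - log(y))**2)), the RMSE between log
# prices used by the Kaggle competition. A small illustrative check with made-up
# labels: a "net" that returns its input exactly should score (close to) 0.
_y = torch.tensor([[100.0], [200.0], [400.0]])
assert abs(log_rmse(lambda x: x, _y, _y)) < 1e-6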
# Train the model with the Adam optimizer
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []
    # TensorDataset pairs the two tensors by indexing them along the first dimension
    # (see the sketch after this function)
    # https://blog.csdn.net/qq_24503095/article/details/103616129
    dataset = torch.utils.data.TensorDataset(train_features, train_labels)
    # DataLoader turns the dataset into an iterable of shuffled mini-batches
    # https://blog.csdn.net/qq_24503095/article/details/103616484
    train_iter = torch.utils.data.DataLoader(dataset, batch_size, shuffle=True)
    # Adam optimizer; weight_decay adds L2 regularization
    optimizer = torch.optim.Adam(params=net.parameters(), lr=learning_rate,
                                 weight_decay=weight_decay)
    net = net.float()
    for epoch in range(num_epochs):
        for X, y in train_iter:
            l = loss(net(X.float()), y.float())
            optimizer.zero_grad()  # clear the gradients accumulated on the parameters
            l.backward()           # backpropagate to compute fresh gradients
            optimizer.step()       # update the parameters
        train_ls.append(log_rmse(net, train_features, train_labels))
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls
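# Sketch of the TensorDataset/DataLoader behaviour referenced above (illustrative
# only, with made-up tensors): samples are recovered by indexing both tensors
# along dim 0, and DataLoader yields them in mini-batches.
_xs = torch.arange(6.0).view(3, 2)         # 3 samples with 2 features each
_ys = torch.tensor([[0.0], [1.0], [2.0]])  # 3 matching labels
_ds = torch.utils.data.TensorDataset(_xs, _ys)
assert torch.equal(_ds[1][0], _xs[1]) and torch.equal(_ds[1][1], _ys[1])
for _xb, _yb in torch.utils.data.DataLoader(_ds, batch_size=2, shuffle=False):
    pass  # _xb has shape (2, 2) then (1, 2); _yb has shape (2, 1) then (1, 1)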
# K-fold cross-validation
def get_K_fold_data(k, i, X, y):
    # Return the training and validation data needed for the i-th fold
    assert k > 1
    fold_size = X.shape[0] // k
    x_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)  # slice covering fold j
        x_part, y_part = X[idx, :], y[idx]
        if j == i:
            # fold i is held out as the validation set
            X_valid, y_valid = x_part, y_part
        elif x_train is None:
            x_train, y_train = x_part, y_part
        else:
            # all remaining folds are concatenated into the training set
            x_train = torch.cat((x_train, x_part), dim=0)
            y_train = torch.cat((y_train, y_part), dim=0)
    return x_train, y_train, X_valid, y_valid
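# Illustrative check of the fold bookkeeping (made-up tensors): with 10 samples
# and k=5, fold i=1 keeps samples 2..3 for validation and the other 8 for training.
_X = torch.arange(20.0).view(10, 2)
_yf = torch.arange(10.0).view(10, 1)
_xt, _yt, _xv, _yv = get_K_fold_data(5, 1, _X, _yf)
assert _xv.shape == (2, 2) and _xt.shape == (8, 2)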
def use_svg_display():
    # Render matplotlib figures as SVG
    display.set_matplotlib_formats('svg')

def set_figsize(figsize=(3.5, 2.5)):
    use_svg_display()
    plt.rcParams['figure.figsize'] = figsize
def semilogy(x_vals, y_vals, x_label, y_label, x2_vals=None, y2_vals=None,
             legend=None, figsize=(3.5, 2.5)):
    set_figsize(figsize)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.semilogy(x_vals, y_vals)
    if x2_vals and y2_vals:
        plt.semilogy(x2_vals, y2_vals, linestyle=':')
        plt.legend(legend)
    plt.show()
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay, batch_size):
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        data = get_K_fold_data(k, i, X_train, y_train)
        net = get_net(X_train.shape[1])
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        if i == 0:
            # Plot the learning curves for the first fold only
            semilogy(range(1, num_epochs + 1), train_ls, 'epochs', 'rmse',
                     range(1, num_epochs + 1), valid_ls, ['train', 'valid'])
        print('fold %d, train rmse %f, valid rmse %f' % (i, train_ls[-1], valid_ls[-1]))
    return train_l_sum / k, valid_l_sum / k
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)
print('%d-fold validation: avg train rmse %f, avg valid rmse %f' % (k, train_l, valid_l))
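# test_features is prepared above but never used. What follows is a minimal,
# illustrative sketch (not part of the original script) of one way to train a
# final model on the full training set and write test-set predictions in the
# usual Kaggle (Id, SalePrice) submission format, assuming the test CSV has an
# 'Id' column as in the standard competition data.
final_net = get_net(train_features.shape[1])
train(final_net, train_features, train_labels, None, None,
      num_epochs, lr, weight_decay, batch_size)
preds = final_net(test_features).detach().numpy().reshape(-1)
pd.DataFrame({'Id': test_data.Id, 'SalePrice': preds}).to_csv('submission.csv', index=False)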