# coding=utf-8
import numpy as np
import pandas as pd
import torch
from d2l import torch as d2l
from torch import nn
# Load the Kaggle "House Prices" training and test sets.
train_data = pd.read_csv("G:\\nlp\\limu-practice\\house-prices-advanced-regression-techniques\\train.csv")
test_data = pd.read_csv("G:\\nlp\\limu-practice\\house-prices-advanced-regression-techniques\\test.csv")
# train_data.shape == (1460, 81); test_data.shape == (1459, 80)
# Drop the Id column (first column) and, for the training set, the SalePrice
# label (last column); stack train+test features so that the same
# preprocessing is applied consistently to both sets.
all_feature = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))
# Numeric columns (dtype != object): standardize to zero mean / unit variance,
# then replace NaN with 0 (0 is the column mean after standardization).
numeric_features = all_feature.dtypes[all_feature.dtypes != "object"].index
all_feature[numeric_features] = all_feature[numeric_features].apply(lambda x: (x - x.mean()) / (x.std()))
all_feature[numeric_features] = all_feature[numeric_features].fillna(0)
# One-hot encode the categorical columns; dummy_na=True treats a missing value
# as a legal category and creates an indicator column for it.
all_feature = pd.get_dummies(all_feature, dummy_na=True)
# Convert back to tensors. astype(float) is required because the underlying
# numpy array may have object/bool dtype, which torch.tensor cannot ingest.
n_train = train_data.shape[0]
train_features = torch.tensor(all_feature[:n_train].values.astype(float), dtype=torch.float32)
# BUG FIX: test features are the rows AFTER the first n_train. The original
# used all_feature[:n_train] here, which duplicated the training rows.
test_features = torch.tensor(all_feature[n_train:].values.astype(float), dtype=torch.float32)
# SalePrice labels reshaped to (n_train, 1); -1 lets numpy infer the row count.
train_labels = torch.tensor(train_data.SalePrice.values.astype(float).reshape(-1, 1), dtype=torch.float32)
# Squared-error loss used during training; log_rmse below converts it into
# the competition's log-RMSE metric. (A dead, commented-out single-layer
# get_net variant that used to live here has been removed.)
loss = torch.nn.MSELoss()
def get_net(feature_num):
    """Build a single-hidden-layer MLP (feature_num -> 128 -> 1) for regression."""
    hidden_units = 128
    return nn.Sequential(
        nn.Linear(feature_num, hidden_units),
        nn.ReLU(),
        nn.Linear(hidden_units, 1),
    )
# For prices spanning orders of magnitude we care about relative error, so the
# RMSE is computed on the logarithms of predictions and labels.
def log_rmse(net, features, labels):
    """Return the RMSE between log-predictions and log-labels as a float."""
    with torch.no_grad():
        # Clip predictions below 1 so taking the logarithm stays stable.
        preds = net(features).clamp(min=1.0)
        err = torch.sqrt(loss(preds.log(), labels.log()))
        return err.item()
# Training loop built on the Adam optimizer.
def train(net, train_features, train_labels, test_features, test_labels, num_epochs, learing_rate, weight_decay,
          batch_size):
    """Train `net` on (train_features, train_labels) with Adam.

    Returns two lists of per-epoch log-RMSE values: one for the training set
    and one for the test set (empty when test_labels is None).
    """
    train_ls, test_ls = [], []
    dataset = torch.utils.data.TensorDataset(train_features, train_labels)
    data_iter = torch.utils.data.DataLoader(dataset, batch_size, shuffle=True)
    # Adam is fairly insensitive to the learning rate, which suits this setup.
    optimizer = torch.optim.Adam(params=net.parameters(), lr=learing_rate, weight_decay=weight_decay)
    net = net.float()
    for _ in range(num_epochs):
        for batch_x, batch_y in data_iter:
            optimizer.zero_grad()
            batch_loss = loss(net(batch_x.float()), batch_y.float())
            batch_loss.backward()
            optimizer.step()
        train_ls.append(log_rmse(net, train_features, train_labels))
        # test_labels is None when retraining on the full set for submission.
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls
# K-fold cross validation helper: partition the dataset (x, y) into k folds
# and return the training portion plus the i-th fold as validation data.
def get_k_fold_data(k, i, x, y):
    """Return (x_train, y_train, x_valid, y_valid) for fold i of k."""
    assert k > 1
    # Size of each fold; any remainder rows (x.shape[0] % k) are left unused.
    fold_size = x.shape[0] // k
    train_xs, train_ys = [], []
    x_valid = y_valid = None
    for j in range(k):
        part = slice(j * fold_size, (j + 1) * fold_size)
        if j == i:
            x_valid, y_valid = x[part, :], y[part]
        else:
            train_xs.append(x[part, :])
            train_ys.append(y[part])
    # Concatenate all non-validation folds along the sample dimension.
    return torch.cat(train_xs, dim=0), torch.cat(train_ys, dim=0), x_valid, y_valid
# Run k-fold cross validation and average the final train/validation errors.
def k_fold(k, x_train, y_train, num_epochs, learning_rate, weight_decay, batch_size):
    """Return the mean final-epoch log-RMSE over k folds: (train_avg, valid_avg)."""
    train_scores, valid_scores = [], []
    for i in range(k):
        fold_data = get_k_fold_data(k, i, x_train, y_train)
        net = get_net(x_train.shape[1])
        # fold_data unpacks to (x_train, y_train, x_valid, y_valid).
        train_ls, valid_ls = train(net, *fold_data, num_epochs, learning_rate, weight_decay, batch_size)
        train_scores.append(train_ls[-1])
        valid_scores.append(valid_ls[-1])
        if i == 0:
            # Plot the learning curves of the first fold only (log-scale y axis).
            d2l.plot(list(range(1, num_epochs + 1)), [train_ls, valid_ls], xlabel="epoch", ylabel="rmse",
                     xlim=[1, num_epochs],
                     legend=["train", "valid"], yscale="log")
        print("fold %d,train rmse %f,valid rmse %f" % (i, train_ls[-1], valid_ls[-1]))
    return sum(train_scores) / k, sum(valid_scores) / k
# Hyper-parameters: 5 folds, 100 epochs, lr=5 (a large lr is workable with
# Adam), no weight decay, batch size 64.
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
# Run k-fold cross validation and report the average train/valid log-RMSE.
train_l, valid_l = k_fold(k, train_features, train_labels, num_epochs, lr, weight_decay, batch_size)
print('%d-fold validation: avg train rmse %f, avg valid rmse %f' % (k, train_l, valid_l))
# Retrain on the full training set and predict the test-set prices.
def train_and_pred(train_features, test_features, train_labels, test_data, num_epochs, lr, weight_decay, batch_size):
    """Train a fresh net on all training data, plot the curve, and write submission.csv."""
    net = get_net(train_features.shape[1])
    # No held-out split here, so pass None for the test features/labels.
    train_ls, _ = train(net, train_features, train_labels, None, None, num_epochs, lr, weight_decay, batch_size)
    d2l.plot(np.arange(1, num_epochs + 1), [train_ls], xlabel="epoch", ylabel="log-rmse", xlim=[1, num_epochs],
             yscale="log")
    print("train rmse %f " % train_ls[-1])
    # BUG FIX: predict on the *test* features. The original called
    # net(train_features), producing 1460 predictions for 1459 test rows.
    preds = net(test_features).detach().numpy()
    # Flatten the (n, 1) predictions into a 1-D Series for the submission frame.
    test_data["SalePrice"] = pd.Series(preds.reshape(1, -1)[0])
    submission = pd.concat([test_data["Id"], test_data["SalePrice"]], axis=1)
    # BUG FIX: removed the accidental leading space from the output filename.
    submission.to_csv("submission.csv", index=False)
# Retrain on the full training set and write the Kaggle submission file.
train_and_pred(train_features, test_features, train_labels, test_data, num_epochs, lr, weight_decay, batch_size)
# Display the plots produced above.
d2l.plt.show()
# Kaggle competition: a simple implementation of house-price prediction.
# (Original article last published 2024-08-14 22:29:49.)