PyTorch Example: Dataset and DataLoader

Version Info

  • PyTorch: 1.12.1
  • Python: 3.7.13
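
A quick way to double-check the environment (torch.__version__ and sys.version are standard attributes):

import sys
import torch

print(torch.__version__)   # should report 1.12.1 here
print(sys.version)         # should start with 3.7.13 here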

Imports

import torch
from torch.utils.data import TensorDataset
from torch.utils.data import DataLoader
import pandas as pd

Raw Data

# First, take a quick look with pandas
train_path = "./data/kaggle_house_pred_train.csv"
train_data = pd.read_csv(train_path)

train_data[:10]  # preview the first 10 rows
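
Since the preprocessing below drops columns with many missing values, it helps to glance at the per-column NaN counts first. A small sketch using the same train_data DataFrame:

# Overall shape and a count of missing values per column
print(train_data.shape)
na_counts = train_data.isnull().sum().sort_values(ascending=False)
print(na_counts[na_counts > 0].head(20))   # columns with the most NaNs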

Building the Dataset

class HouseDataset(TensorDataset):
    
    def __init__(self, file_path):
        data = pd.read_csv(file_path)
        self.len = data.shape[0]
        features = self.__prehandle_X(data.iloc[:, 1:-1])
        labels = data.iloc[:, [-1]]
        self.X_data = torch.tensor(features.values, dtype=torch.float32)
        self.y_data = torch.tensor(labels.values, dtype=torch.float32)
        
    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
    
    def __len__(self):
        return self.len
    
    def __prehandle_X(self, origin_features):
        """
        Preprocess the feature data
        """
        # Drop columns with a large number of missing values
        features = origin_features.drop(labels=[
            'BsmtFinSF2', 'MasVnrArea', '2ndFlrSF', 'LowQualFinSF', 'BsmtFullBath', 
            'BsmtHalfBath', 'HalfBath', 'WoodDeckSF', 'EnclosedPorch', '3SsnPorch', 
            'ScreenPorch', 'PoolArea', 'MiscVal'
        ], axis=1)

        # Numeric columns: standardize
        numeric_features = features.dtypes[features.dtypes != 'object'].index
        features[numeric_features] = features[numeric_features].apply(
            lambda x: (x - x.mean()) / (x.std())
        ).fillna(0)

        # Categorical columns: one-hot encode
        features = pd.get_dummies(features, dummy_na=True)
        return features
    
    def get_X_shape1(self):
        return self.X_data.shape[1]
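
A quick sanity check of the class (assuming the same train_path as above):

dataset = HouseDataset(train_path)
print(len(dataset))               # number of samples, via __len__
X0, y0 = dataset[0]               # a single sample, via __getitem__
print(X0.shape, y0.shape)         # feature vector and label shapes
print(dataset.get_X_shape1())     # input dimension used by the model later
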
  • Writing your own TensorDataset subclass means that, for large datasets, you do not have to load everything at once: each call to __getitem__ can load a single sample on demand (a sketch of this idea appears after the code block below)
  • The example above still loads all the data up front (which keeps only one pattern to remember)
  • If all the data is loaded first anyway, there is an even simpler way, shown below
# The feature tensor and label tensor can be passed to TensorDataset directly
features = prehandle_X(train_data.iloc[:, 1:-1])  # prehandle_X written as a standalone function
train_features = torch.tensor(features.values, dtype=torch.float32)
train_labels = torch.tensor(train_data.iloc[:, [-1]].values, dtype=torch.float32)

dataset = torch.utils.data.TensorDataset(train_features, train_labels)

train_dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)  # batch_size is defined in the training section below
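
As mentioned in the first bullet above, a subclass can also postpone the real loading until __getitem__ is called. The following is only a minimal sketch of that idea: the LazyHouseDataset name and the per-row read_csv call are illustrative, every feature is coerced to numeric for simplicity, and real code would reuse the same preprocessing as __prehandle_X (and likely a faster storage format, since re-reading the CSV per row is slow):

import pandas as pd
import torch
from torch.utils.data import Dataset

class LazyHouseDataset(Dataset):
    """Sketch only: load one row per __getitem__ instead of keeping everything in memory."""

    def __init__(self, file_path):
        self.file_path = file_path
        with open(file_path) as f:
            self.len = sum(1 for _ in f) - 1   # subtract the header line

    def __getitem__(self, index):
        # skiprows keeps the header and skips the data rows before `index`
        row = pd.read_csv(self.file_path, skiprows=range(1, index + 1), nrows=1)
        # For the sketch, coerce every feature to numeric; non-numeric values become 0
        values = pd.to_numeric(row.iloc[0, 1:-1], errors='coerce').fillna(0.0)
        label = float(row.iloc[0, -1])
        return (torch.tensor(values.values, dtype=torch.float32),
                torch.tensor([label], dtype=torch.float32))

    def __len__(self):
        return self.len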

Building the Model

class MyModel(torch.nn.Module):
    
    def __init__(self, in_features):
        super(MyModel, self).__init__()
        # A simple three-layer MLP: in_features -> 128 -> 64 -> 1
        self.linear1 = torch.nn.Linear(in_features, 128)
        self.linear2 = torch.nn.Linear(128, 64)
        self.linear3 = torch.nn.Linear(64, 1)
        self.relu = torch.nn.ReLU()
        
    def forward(self, X):
        output = self.linear1(X)
        output = self.relu(output)
        output = self.linear2(output)
        output = self.relu(output)
        output = self.linear3(output)
        return output
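
A quick shape check with a random batch (the 330 here is just a placeholder feature count; the real number comes from dataset.get_X_shape1()):

model = MyModel(in_features=330)     # 330 is an illustrative feature count
dummy_batch = torch.randn(4, 330)    # fake batch of 4 samples
print(model(dummy_batch).shape)      # expected: torch.Size([4, 1])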

Training

# Hyperparameters
learning_rate = 0.005
epoch_num = 100
batch_size = 64

# Data
dataset = HouseDataset(train_path)
train_dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

# Model
mymodel = MyModel(in_features = dataset.get_X_shape1())
# Optimizer and loss function
optimizer = torch.optim.Adam(mymodel.parameters(), lr = learning_rate)
loss = torch.nn.MSELoss()

# Training loop
train_ls = []
for epoch in range(epoch_num):
    for X_train, y_train in train_dataloader:
        y_pred = mymodel(X_train)
        l = loss(y_pred, y_train) 
        
        optimizer.zero_grad()
        l.backward()
        optimizer.step()
    
    train_ls.append(l.item())  # only records the last batch's loss of this epoch
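
As noted in the comment above, train_ls only stores the loss of the last batch in each epoch. If a smoother curve is wanted, a variant that averages over all batches of an epoch could look like this (a sketch reusing the same names):

train_ls = []
for epoch in range(epoch_num):
    epoch_loss, batch_count = 0.0, 0
    for X_train, y_train in train_dataloader:
        y_pred = mymodel(X_train)
        l = loss(y_pred, y_train)

        optimizer.zero_grad()
        l.backward()
        optimizer.step()

        epoch_loss += l.item()
        batch_count += 1

    train_ls.append(epoch_loss / batch_count)   # mean loss over the epoch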

Plotting the Curve: Epoch vs. Loss

import matplotlib.pyplot as plt

plt.figure(figsize=(12.8, 7.2))
plt.plot(range(0, epoch_num), train_ls, label="train")
plt.legend()
plt.show()

Loss curve
