HW1 回归预测

该文介绍了一个使用PyTorch构建的神经网络模型,用于预测COVID-19的患病情况。模型基于多个病例特征进行训练,通过数据分片、特征选择和训练验证集划分来防止过拟合。模型结构包含多层线性层和ReLU激活函数,使用随机梯度下降优化器并设置了早停策略。训练过程中监控损失函数,保存最佳模型,并最终对测试数据进行预测,结果存储在anser.csv中。
摘要由CSDN通过智能技术生成

HW1 回归预测

本题来源于台大,本人写了个废物模型(均方误差 MSE 收敛到41),然后看了看标准答案(均方误差 MSE 收敛到2)。
数据样本链接:
https://www.kaggle.com/competitions/ml2022spring-hw1/code

问题描述,根据2000个病例特征属性值,预测covid19的患病情况。

数据处理

  1. 导入数据
    使用numpy的loadtxt方法导入存在csv中的数据,且跳过第一行(属性名称)。
# Read both CSV files into float32 matrices; skiprows=1 drops the header row
# holding the column names.
train_data, test_data = (
    np.loadtxt(csv_path, delimiter=',', dtype=np.float32, skiprows=1)
    for csv_path in ('covid.train.csv', 'covid.test.csv')
)
  2. 分片
    通过 random_split 将原始训练数据随机划分为训练集和验证集:训练集用来训练模型,验证集用来监控是否出现过拟合现象,以便及时止损(早停)。
    这里给出的验证集比例为0.2,随机数种子设置为2023
def train_valid_split(data_set, valid_ratio, seed):
    '''Split provided training data into a training set and a validation set.

    Args:
        data_set: NumPy array of samples, one sample per row.
        valid_ratio: Fraction of the rows assigned to the validation set
            (the resulting size is truncated with ``int``).
        seed: Seed for the ``torch.Generator`` that drives the random split.

    Returns:
        ``(train_set, valid_set)`` as NumPy arrays with the same number of
        columns and dtype as ``data_set``.
    '''
    print(seed)
    valid_set_size = int(valid_ratio * len(data_set))
    train_set_size = len(data_set) - valid_set_size
    train_subset, valid_subset = random_split(
        data_set,
        [train_set_size, valid_set_size],
        generator=torch.Generator().manual_seed(seed),
    )
    # Index the source array with the shuffled row indices instead of calling
    # np.array() on the Subset objects: this avoids a Python-level walk over
    # every row and keeps an empty split two-dimensional, (0, n_columns),
    # so later column slicing such as ``valid_data[:, -1]`` still works.
    data_set = np.asarray(data_set)
    train_idx = np.asarray(train_subset.indices, dtype=np.int64)
    valid_idx = np.asarray(valid_subset.indices, dtype=np.int64)
    return data_set[train_idx], data_set[valid_idx]
  3. 选择特征
    选择合适的属性进行预测,这里选择的是全部属性。
def select_feat(train_data, valid_data, test_data, select_all=True):
    """Separate targets from features and keep the chosen feature columns.

    The last column of ``train_data`` and ``valid_data`` is taken as the
    target; ``test_data`` is treated as features only.

    Args:
        train_data: 2-D array whose last column is the target.
        valid_data: 2-D array whose last column is the target.
        test_data: 2-D array of features.
        select_all: When truthy every feature column is kept, otherwise only
            columns 0-4.

    Returns:
        ``(x_train, x_valid, x_test, y_train, y_valid)``.
    """
    # Targets sit in the final column of the labelled arrays.
    y_train = train_data[:, -1]
    y_valid = valid_data[:, -1]

    features_train = train_data[:, :-1]
    features_valid = valid_data[:, :-1]
    features_test = test_data

    n_features = features_train.shape[1]
    columns = list(range(n_features)) if select_all else [0, 1, 2, 3, 4]

    x_train, x_valid, x_test = (
        features[:, columns]
        for features in (features_train, features_valid, features_test)
    )
    return x_train, x_valid, x_test, y_train, y_valid

4.将numpy转化为torch.FloatTensor类型

class CovidDataSet(Dataset):
    """Dataset holding feature rows and, optionally, their targets.

    ``x`` is always converted to a ``torch.FloatTensor``.  ``y`` is converted
    the same way unless it is ``None``, in which case indexing yields the
    features alone (used for unlabelled data).
    """

    def __init__(self, x, y=None):
        self.y = None if y is None else torch.FloatTensor(y)
        self.x = torch.FloatTensor(x)

    def __getitem__(self, idx):
        # Without targets, return just the feature row.
        if self.y is None:
            return self.x[idx]
        return self.x[idx], self.y[idx]

    def __len__(self):
        # Number of samples equals the number of feature rows.
        return len(self.x)
  5. 神经网络
    模型输入层神经元个数是covid19的属性个数,使用的激活函数为ReLU函数,中间经过三次线性变换(input_dim→16→8→1),输出层为一个单元。
class Model(torch.nn.Module):
    """Fully connected regressor: input_dim -> 16 -> 8 -> 1 with ReLU activations.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super(Model, self).__init__()
        self.layers = torch.nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 1),
            # The class is spelled ``nn.ReLU``; ``nn.ReLu`` does not exist and
            # raised AttributeError as soon as the model was constructed.
            # NOTE(review): this trailing ReLU clamps predictions to >= 0; it
            # is kept so the snippet matches the complete listing below.
            nn.ReLU()
        )

    def forward(self, x):
        """Map a (batch, input_dim) tensor to a (batch,) tensor of predictions."""
        x = self.layers(x)
        x = x.squeeze(1)  # (batch, 1) -> (batch,)
        return x
  6. 训练池
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Hyper-parameters and paths shared by the rest of the script.
config = {
    'seed': 2023,                  # seed used for the train/validation split
    # Must be a real bool: any non-empty string (even 'False') is truthy, so
    # the previous value 'True' could never be switched off.
    'select_all': True,
    'test_ratio': 0.2,             # fraction of the training data held out for validation
    'n_epochs': 3000,              # maximum number of training epochs
    'batch_size': 256,
    'learn_rate': 1e-5,
    'early_stop': 400,             # epochs without validation improvement before stopping
    'save_path': './model.ckpt'    # where the best model's state_dict is written
}

输入训练集,验证集,模型,参数,以及设备类型
最多训练3000轮(epoch),损失函数使用均方误差(MSE),优化器采用带动量(momentum=0.9)的随机梯度下降,学习率=0.00001,验证集损失连续400轮没有改善就终止训练。

def trainer(train_loader, valid_loader, model, config, device):
    """Train ``model`` with SGD on an MSE loss and checkpoint the best epoch.

    Each epoch runs one pass over ``train_loader`` followed by one pass over
    ``valid_loader``.  Whenever the mean validation loss improves, the model's
    ``state_dict`` is written to ``config['save_path']``.  Training ends after
    ``config['n_epochs']`` epochs, or earlier once the validation loss has not
    improved for ``config['early_stop']`` consecutive epochs.

    Args:
        train_loader: Iterable yielding ``(x, y)`` training batches.
        valid_loader: Iterable yielding ``(x, y)`` validation batches.
        model: The ``torch.nn.Module`` to optimise (updated in place).
        config: Dict providing ``'learn_rate'``, ``'n_epochs'``,
            ``'early_stop'`` and ``'save_path'``.
        device: Device every batch is moved to before the forward pass.

    Returns:
        None.
    """
    criterion = torch.nn.MSELoss(reduction='mean')
    optimizer = torch.optim.SGD(model.parameters(), lr=config['learn_rate'], momentum=0.9)
    writer = SummaryWriter()  # Writer of tensorboard.

    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs('./models', exist_ok=True)  # Create directory of saving models.
    # Make sure the checkpoint's own directory exists before torch.save runs.
    save_dir = os.path.dirname(config['save_path'])
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0

    try:
        for epoch in range(n_epochs):
            model.train()  # Set your model to train mode.
            loss_record = []

            # tqdm is a package to visualize your training progress.
            train_pbar = tqdm(train_loader, position=0, leave=True)

            for x, y in train_pbar:
                optimizer.zero_grad()  # Set gradient to zero.
                x, y = x.to(device), y.to(device)  # Move your data to device.
                pred = model(x)
                loss = criterion(pred, y)
                loss.backward()  # Compute gradient(backpropagation).
                optimizer.step()  # Update parameters.
                step += 1
                # Read the scalar once; .item() forces a device-to-host sync.
                batch_loss = loss.detach().item()
                loss_record.append(batch_loss)

                # Display current epoch number and loss on tqdm progress bar.
                train_pbar.set_description(f'Epoch [{epoch + 1}/{n_epochs}]')
                train_pbar.set_postfix({'loss': batch_loss})

            mean_train_loss = sum(loss_record) / len(loss_record)
            writer.add_scalar('Loss/train', mean_train_loss, step)

            model.eval()  # Set your model to evaluation mode.
            loss_record = []
            for x, y in valid_loader:
                x, y = x.to(device), y.to(device)
                with torch.no_grad():
                    pred = model(x)
                    loss = criterion(pred, y)

                loss_record.append(loss.item())

            mean_valid_loss = sum(loss_record) / len(loss_record)
            print(
                f'Epoch [{epoch + 1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')
            writer.add_scalar('Loss/valid', mean_valid_loss, step)

            if mean_valid_loss < best_loss:
                best_loss = mean_valid_loss
                torch.save(model.state_dict(), config['save_path'])  # Save your best model
                print('Saving model with loss {:.3f}...'.format(best_loss))
                early_stop_count = 0
            else:
                early_stop_count += 1

            if early_stop_count >= config['early_stop']:
                print('\nModel is not improving, so we halt the training session.')
                return
    finally:
        # Flush and release the TensorBoard event file on every exit path
        # (normal completion, early stop, or an exception).
        writer.close()

总代码

# Numerical Operations
import math
import numpy as np

# Reading/Writing Data
import pandas as pd
import os
import csv
# For Progress Bar
from tqdm import tqdm

# Pytorch
import torch 
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
# In[1]:Configurations
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Hyper-parameters and paths shared by the rest of the script.
config = {
    'seed': 2023,                  # seed used for the train/validation split
    # Must be a real bool: any non-empty string (even 'False') is truthy, so
    # the previous value 'True' could never be switched off.
    'select_all': True,
    'test_ratio': 0.2,             # fraction of the training data held out for validation
    'n_epochs': 3000,              # maximum number of training epochs
    'batch_size': 256,
    'learn_rate': 1e-5,
    'early_stop': 400,             # epochs without validation improvement before stopping
    'save_path': './model.ckpt'    # where the best model's state_dict is written
}

# In[2]:DataSet
class CovidDataSet(Dataset):
    """Dataset holding feature rows and, optionally, their targets.

    ``x`` is always converted to a ``torch.FloatTensor``.  ``y`` is converted
    the same way unless it is ``None``, in which case indexing yields the
    features alone (used for unlabelled data).
    """

    def __init__(self, x, y=None):
        self.y = None if y is None else torch.FloatTensor(y)
        self.x = torch.FloatTensor(x)

    def __getitem__(self, idx):
        # Without targets, return just the feature row.
        if self.y is None:
            return self.x[idx]
        return self.x[idx], self.y[idx]

    def __len__(self):
        # Number of samples equals the number of feature rows.
        return len(self.x)

def select_feat(train_data, valid_data, test_data, select_all=True):
    """Separate targets from features and keep the chosen feature columns.

    The last column of ``train_data`` and ``valid_data`` is taken as the
    target; ``test_data`` is treated as features only.

    Args:
        train_data: 2-D array whose last column is the target.
        valid_data: 2-D array whose last column is the target.
        test_data: 2-D array of features.
        select_all: When truthy every feature column is kept, otherwise only
            columns 0-4.

    Returns:
        ``(x_train, x_valid, x_test, y_train, y_valid)``.
    """
    # Targets sit in the final column of the labelled arrays.
    y_train = train_data[:, -1]
    y_valid = valid_data[:, -1]

    features_train = train_data[:, :-1]
    features_valid = valid_data[:, :-1]
    features_test = test_data

    n_features = features_train.shape[1]
    columns = list(range(n_features)) if select_all else [0, 1, 2, 3, 4]

    x_train, x_valid, x_test = (
        features[:, columns]
        for features in (features_train, features_valid, features_test)
    )
    return x_train, x_valid, x_test, y_train, y_valid
# In[3]: model
class Model(torch.nn.Module):
    """Fully connected regressor: input_dim -> 16 -> 8 -> 1.

    Every linear layer, including the last one, is followed by a ReLU.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Layer widths from the input to the single output unit.
        widths = (input_dim, 16, 8, 1)
        stack = []
        for in_features, out_features in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(in_features, out_features))
            stack.append(nn.ReLU())
        self.layers = torch.nn.Sequential(*stack)

    def forward(self, x):
        """Run the network and drop the trailing size-1 dimension of the output."""
        return self.layers(x).squeeze(1)


# In[4]: train pool
def trainer(train_loader, valid_loader, model, config, device):
    """Train ``model`` with SGD on an MSE loss and checkpoint the best epoch.

    Each epoch runs one pass over ``train_loader`` followed by one pass over
    ``valid_loader``.  Whenever the mean validation loss improves, the model's
    ``state_dict`` is written to ``config['save_path']``.  Training ends after
    ``config['n_epochs']`` epochs, or earlier once the validation loss has not
    improved for ``config['early_stop']`` consecutive epochs.

    Args:
        train_loader: Iterable yielding ``(x, y)`` training batches.
        valid_loader: Iterable yielding ``(x, y)`` validation batches.
        model: The ``torch.nn.Module`` to optimise (updated in place).
        config: Dict providing ``'learn_rate'``, ``'n_epochs'``,
            ``'early_stop'`` and ``'save_path'``.
        device: Device every batch is moved to before the forward pass.

    Returns:
        None.
    """
    criterion = torch.nn.MSELoss(reduction='mean')
    optimizer = torch.optim.SGD(model.parameters(), lr=config['learn_rate'], momentum=0.9)
    writer = SummaryWriter()  # Writer of tensorboard.

    # exist_ok avoids the check-then-create race of isdir() + mkdir().
    os.makedirs('./models', exist_ok=True)  # Create directory of saving models.
    # Make sure the checkpoint's own directory exists before torch.save runs.
    save_dir = os.path.dirname(config['save_path'])
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0

    try:
        for epoch in range(n_epochs):
            model.train()  # Set your model to train mode.
            loss_record = []

            # tqdm is a package to visualize your training progress.
            train_pbar = tqdm(train_loader, position=0, leave=True)

            for x, y in train_pbar:
                optimizer.zero_grad()  # Set gradient to zero.
                x, y = x.to(device), y.to(device)  # Move your data to device.
                pred = model(x)
                loss = criterion(pred, y)
                loss.backward()  # Compute gradient(backpropagation).
                optimizer.step()  # Update parameters.
                step += 1
                # Read the scalar once; .item() forces a device-to-host sync.
                batch_loss = loss.detach().item()
                loss_record.append(batch_loss)

                # Display current epoch number and loss on tqdm progress bar.
                train_pbar.set_description(f'Epoch [{epoch + 1}/{n_epochs}]')
                train_pbar.set_postfix({'loss': batch_loss})

            mean_train_loss = sum(loss_record) / len(loss_record)
            writer.add_scalar('Loss/train', mean_train_loss, step)

            model.eval()  # Set your model to evaluation mode.
            loss_record = []
            for x, y in valid_loader:
                x, y = x.to(device), y.to(device)
                with torch.no_grad():
                    pred = model(x)
                    loss = criterion(pred, y)

                loss_record.append(loss.item())

            mean_valid_loss = sum(loss_record) / len(loss_record)
            print(
                f'Epoch [{epoch + 1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')
            writer.add_scalar('Loss/valid', mean_valid_loss, step)

            if mean_valid_loss < best_loss:
                best_loss = mean_valid_loss
                torch.save(model.state_dict(), config['save_path'])  # Save your best model
                print('Saving model with loss {:.3f}...'.format(best_loss))
                early_stop_count = 0
            else:
                early_stop_count += 1

            if early_stop_count >= config['early_stop']:
                print('\nModel is not improving, so we halt the training session.')
                return
    finally:
        # Flush and release the TensorBoard event file on every exit path
        # (normal completion, early stop, or an exception).
        writer.close()
# In[6]: split
def train_valid_split(data_set, valid_ratio, seed):
    '''Split provided training data into a training set and a validation set.

    Args:
        data_set: NumPy array of samples, one sample per row.
        valid_ratio: Fraction of the rows assigned to the validation set
            (the resulting size is truncated with ``int``).
        seed: Seed for the ``torch.Generator`` that drives the random split.

    Returns:
        ``(train_set, valid_set)`` as NumPy arrays with the same number of
        columns and dtype as ``data_set``.
    '''
    print(seed)
    valid_set_size = int(valid_ratio * len(data_set))
    train_set_size = len(data_set) - valid_set_size
    train_subset, valid_subset = random_split(
        data_set,
        [train_set_size, valid_set_size],
        generator=torch.Generator().manual_seed(seed),
    )
    # Index the source array with the shuffled row indices instead of calling
    # np.array() on the Subset objects: this avoids a Python-level walk over
    # every row and keeps an empty split two-dimensional, (0, n_columns),
    # so later column slicing such as ``valid_data[:, -1]`` still works.
    data_set = np.asarray(data_set)
    train_idx = np.asarray(train_subset.indices, dtype=np.int64)
    valid_idx = np.asarray(valid_subset.indices, dtype=np.int64)
    return data_set[train_idx], data_set[valid_idx]

# In[7]: go
# Seed every random number generator in play so that the weight
# initialisation and the batch order can be reproduced, not only the split.
same_seed = config['seed']
np.random.seed(same_seed)
torch.manual_seed(same_seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(same_seed)

# Load the CSV files (skiprows=1 drops the header) and hold out a validation set.
train_data = np.loadtxt('covid.train.csv', delimiter=',', dtype=np.float32, skiprows=1)
test_data = np.loadtxt('covid.test.csv', delimiter=',', dtype=np.float32, skiprows=1)
train_data, valid_data = train_valid_split(train_data, config['test_ratio'], config['seed'])
print(f"""train_data size: {train_data.shape} 
valid_data size: {valid_data.shape} 
test_data size: {test_data.shape}""")

x_train, x_valid, x_test, y_train, y_valid = select_feat(train_data, valid_data, test_data, config['select_all'])
train_dataset = CovidDataSet(x_train, y_train)
valid_dataset = CovidDataSet(x_valid, y_valid)
test_dataset = CovidDataSet(x_test)

train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=True)
# Evaluation loaders are not shuffled: the order is irrelevant for validation
# and must be preserved for the test predictions.
valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=True)

model = Model(input_dim=x_train.shape[1]).to(device)
trainer(train_loader, valid_loader, model, config, device)

# Predict with the best checkpoint rather than the last-epoch weights.
if os.path.isfile(config['save_path']):
    model.load_state_dict(torch.load(config['save_path'], map_location=device))
model.eval()

# Feed the selected test features (x_test via test_loader) on the model's
# device and without gradient tracking; the raw test_data tensor lived on the
# CPU and ignored feature selection.
batch_preds = []
with torch.no_grad():
    for x in test_loader:
        batch_preds.append(model(x.to(device)).cpu())
y_test = torch.cat(batch_preds)

y_data = pd.DataFrame({'anser': y_test.tolist()})
# mode='w' overwrites the file; mode='a' stacked a new header and a new set of
# rows onto the old results on every run.
y_data.to_csv('anser.csv', mode='w', header=True, index=None)

结果存入anser.csv
在这里插入图片描述
在这里插入图片描述

总结

  1. 随机数种子可以用来复现训练
  2. DataLoader可以分批处理数据
  3. 将数据分为训练集和验证集可以及时发现过拟合(配合早停策略减轻过拟合),使模型在未见过的数据上预测更加准确
  4. 使用GPU来训练模型可以提高处理速度,有效减少内存的消耗,使用cpu此模型只能训练1000轮。
  5. 学习率要合适才能更好的收敛,过高会导致参数发散,损失值不下降。过低会导致学习效率缓慢,轮数过多。
  6. 在本实验中,ReLU的学习效果要优于sigmoid
  7. 神经网络的设计是个玄学问题,层数多了有时候效果也不太好。
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值