HW1 回归预测
本题来源于台大,本人写了个废物模型(均方误差收敛到41),然后看了看标准答案(均方误差收敛到2)。
数据样本链接:
https://www.kaggle.com/competitions/ml2022spring-hw1/code
问题描述,根据2000个病例特征属性值,预测covid19的患病情况。
数据处理
- 导入数据
使用numpy的loadtxt方法导入存在csv中的数据,且跳过第一行(属性名称)。
# Both CSV files share the same parsing options: comma-separated float32
# values, with the first row (the column names) skipped.
_csv_options = dict(delimiter=',', dtype=np.float32, skiprows=1)
train_data = np.loadtxt('covid.train.csv', **_csv_options)
test_data = np.loadtxt('covid.test.csv', **_csv_options)
- 分片
通过 torch 的 random_split 将训练数据随机划分为训练集和验证集:训练集用来训练模型,验证集用来查看是否出现过拟合现象,以便及时止损。
这里给出的验证集比例为0.2(对应配置中的 test_ratio),随机数种子设置为2023。
def train_valid_split(data_set, valid_ratio, seed):
    """Randomly split the provided training data into a training and a validation set.

    Args:
        data_set: Array-like of samples, one sample per row.
        valid_ratio: Fraction of the samples assigned to the validation set.
        seed: Seed for the torch generator, so the split is reproducible.

    Returns:
        A ``(train_set, valid_set)`` tuple of numpy arrays.
    """
    data = np.asarray(data_set)
    valid_set_size = int(valid_ratio * len(data))
    train_set_size = len(data) - valid_set_size
    generator = torch.Generator().manual_seed(seed)
    train_subset, valid_subset = random_split(
        data, [train_set_size, valid_set_size], generator=generator
    )
    # Index the array once with each subset's indices instead of letting
    # np.array() walk the Subset object element by element.
    return data[train_subset.indices], data[valid_subset.indices]
- 选择特征
选择合适的属性进行预测,这里选择的是全部属性。
def select_feat(train_data, valid_data, test_data, select_all=True, feat_idx=None):
    """Split off the target column and pick the feature columns to train on.

    The last column of ``train_data`` and ``valid_data`` is taken as the
    target; ``test_data`` is used as features only.

    Args:
        train_data: 2-D array whose last column is the target.
        valid_data: 2-D array with the same layout as ``train_data``.
        test_data: 2-D array of features.
        select_all: When true, keep every feature column.
        feat_idx: Column indices to keep when ``select_all`` is false.
            Defaults to the first five columns.

    Returns:
        ``(x_train, x_valid, x_test, y_train, y_valid)``.
    """
    y_train, y_valid = train_data[:, -1], valid_data[:, -1]
    raw_x_train, raw_x_valid, raw_x_test = train_data[:, :-1], valid_data[:, :-1], test_data

    if select_all:
        feat_idx = list(range(raw_x_train.shape[1]))
    elif feat_idx is None:
        feat_idx = [0, 1, 2, 3, 4]
    else:
        feat_idx = list(feat_idx)

    return raw_x_train[:, feat_idx], raw_x_valid[:, feat_idx], raw_x_test[:, feat_idx], y_train, y_valid
- 将numpy数组转化为torch.FloatTensor类型
class CovidDataSet(Dataset):
    """Dataset of feature rows with optional targets.

    When ``y`` is omitted, indexing yields the features only; otherwise it
    yields ``(features, target)`` pairs.
    """

    def __init__(self, x, y=None):
        # Inputs are stored as torch.FloatTensor; a missing target stays None.
        self.y = None if y is None else torch.FloatTensor(y)
        self.x = torch.FloatTensor(x)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        features = self.x[idx]
        if self.y is None:
            return features
        return features, self.y[idx]
- 神经网络
模型输入层神经元个数是covid19的属性个数,使用的激活函数为ReLU函数,中间经过三次线性变换(input_dim→16→8→1),输出层为一个单元。
class Model(torch.nn.Module):
    """Fully connected regressor: input_dim -> 16 -> 8 -> 1, ReLU after every layer."""

    def __init__(self, input_dim):
        """Build the layer stack.

        Args:
            input_dim: Number of input features per sample.
        """
        super(Model, self).__init__()
        self.layers = torch.nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 1),
            # The class is spelled nn.ReLU; nn.ReLu does not exist and
            # raises AttributeError when the model is constructed.
            nn.ReLU(),
        )

    def forward(self, x):
        """Return one prediction per input row."""
        x = self.layers(x)
        x = x.squeeze(1)  # drop the size-1 output dimension: (batch, 1) -> (batch,)
        return x
- 训练池
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
config = {
    'seed': 2023,
    # A real boolean: the string 'False' would still be truthy in `if select_all:`.
    'select_all': True,
    'test_ratio': 0.2,
    'n_epochs': 3000,
    'batch_size': 256,
    'learn_rate': 1e-5,
    'early_stop': 400,
    'save_path': './model.ckpt',
}
输入训练集、验证集、模型、参数以及设备类型。
最多训练3000轮,损失函数使用均方误差(MSE),优化器采用带动量(momentum=0.9)的随机梯度下降,学习率=0.00001;验证集损失连续400轮没有下降就终止训练。
def trainer(train_loader, valid_loader, model, config, device):
    """Train ``model`` with SGD on an MSE loss, checkpointing the best epoch.

    After every epoch the mean validation loss is computed. Whenever it
    improves, the model's state dict is written to ``config['save_path']``.
    Training ends after ``config['n_epochs']`` epochs, or earlier once the
    validation loss has not improved for ``config['early_stop']`` consecutive
    epochs.

    Args:
        train_loader: Iterable yielding ``(x, y)`` training batches.
        valid_loader: Iterable yielding ``(x, y)`` validation batches.
        model: Module to optimise. It is not moved here, so it must already
            be on ``device``.
        config: Dict providing 'learn_rate', 'n_epochs', 'early_stop' and
            'save_path'.
        device: Device every batch is moved to.
    """
    criterion = torch.nn.MSELoss(reduction='mean')
    optimizer = torch.optim.SGD(model.parameters(), lr=config['learn_rate'], momentum=0.9)

    # Create the directory the checkpoint is actually written to.
    save_dir = os.path.dirname(config['save_path'])
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0
    writer = SummaryWriter()  # Writer of TensorBoard.
    try:
        for epoch in range(n_epochs):
            model.train()  # Set the model to train mode.
            loss_record = []

            # tqdm visualises the training progress.
            train_pbar = tqdm(train_loader, position=0, leave=True)
            for x, y in train_pbar:
                optimizer.zero_grad()  # Reset the gradients.
                x, y = x.to(device), y.to(device)  # Move the batch to the device.
                pred = model(x)
                loss = criterion(pred, y)
                loss.backward()  # Compute gradients (backpropagation).
                optimizer.step()  # Update the parameters.
                step += 1
                batch_loss = loss.item()
                loss_record.append(batch_loss)

                # Show the current epoch number and loss on the progress bar.
                train_pbar.set_description(f'Epoch [{epoch + 1}/{n_epochs}]')
                train_pbar.set_postfix({'loss': batch_loss})

            mean_train_loss = sum(loss_record) / len(loss_record)
            writer.add_scalar('Loss/train', mean_train_loss, step)

            model.eval()  # Set the model to evaluation mode.
            loss_record = []
            with torch.no_grad():
                for x, y in valid_loader:
                    x, y = x.to(device), y.to(device)
                    loss_record.append(criterion(model(x), y).item())

            mean_valid_loss = sum(loss_record) / len(loss_record)
            print(
                f'Epoch [{epoch + 1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')
            writer.add_scalar('Loss/valid', mean_valid_loss, step)

            if mean_valid_loss < best_loss:
                best_loss = mean_valid_loss
                torch.save(model.state_dict(), config['save_path'])  # Save the best model so far.
                print('Saving model with loss {:.3f}...'.format(best_loss))
                early_stop_count = 0
            else:
                early_stop_count += 1

            if early_stop_count >= config['early_stop']:
                print('\nModel is not improving, so we halt the training session.')
                return
    finally:
        # Close the writer on every exit path, including the early-stop return.
        writer.close()
总代码
# Numerical Operations
import math
import numpy as np
# Reading/Writing Data
import pandas as pd
import os
import csv
# For Progress Bar
from tqdm import tqdm
# Pytorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
import matplotlib.pyplot as plt
from torch.utils.tensorboard import SummaryWriter
# In[1]:Configurations
# Use the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
config = {
    'seed': 2023,
    # A real boolean: the string 'False' would still be truthy in `if select_all:`.
    'select_all': True,
    'test_ratio': 0.2,
    'n_epochs': 3000,
    'batch_size': 256,
    'learn_rate': 1e-5,
    'early_stop': 400,
    'save_path': './model.ckpt',
}
# In[2]:DataSet
class CovidDataSet(Dataset):
    """Dataset of feature rows with optional targets.

    When ``y`` is omitted, indexing yields the features only; otherwise it
    yields ``(features, target)`` pairs.
    """

    def __init__(self, x, y=None):
        # Inputs are stored as torch.FloatTensor; a missing target stays None.
        self.y = None if y is None else torch.FloatTensor(y)
        self.x = torch.FloatTensor(x)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, idx):
        features = self.x[idx]
        if self.y is None:
            return features
        return features, self.y[idx]
def select_feat(train_data, valid_data, test_data, select_all=True, feat_idx=None):
    """Split off the target column and pick the feature columns to train on.

    The last column of ``train_data`` and ``valid_data`` is taken as the
    target; ``test_data`` is used as features only.

    Args:
        train_data: 2-D array whose last column is the target.
        valid_data: 2-D array with the same layout as ``train_data``.
        test_data: 2-D array of features.
        select_all: When true, keep every feature column.
        feat_idx: Column indices to keep when ``select_all`` is false.
            Defaults to the first five columns.

    Returns:
        ``(x_train, x_valid, x_test, y_train, y_valid)``.
    """
    y_train, y_valid = train_data[:, -1], valid_data[:, -1]
    raw_x_train, raw_x_valid, raw_x_test = train_data[:, :-1], valid_data[:, :-1], test_data

    if select_all:
        feat_idx = list(range(raw_x_train.shape[1]))
    elif feat_idx is None:
        feat_idx = [0, 1, 2, 3, 4]
    else:
        feat_idx = list(feat_idx)

    return raw_x_train[:, feat_idx], raw_x_valid[:, feat_idx], raw_x_test[:, feat_idx], y_train, y_valid
# In[3]: model
class Model(torch.nn.Module):
    """Fully connected regressor: input_dim -> 16 -> 8 -> 1, ReLU after every layer."""

    def __init__(self, input_dim):
        super().__init__()
        # Layers are created in the same order as they are applied.
        stack = [
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 8),
            nn.ReLU(),
            nn.Linear(8, 1),
            nn.ReLU(),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        # Drop the size-1 output dimension: (batch, 1) -> (batch,).
        return self.layers(x).squeeze(1)
# In[4]: train pool
def trainer(train_loader, valid_loader, model, config, device):
    """Train ``model`` with SGD on an MSE loss, checkpointing the best epoch.

    After every epoch the mean validation loss is computed. Whenever it
    improves, the model's state dict is written to ``config['save_path']``.
    Training ends after ``config['n_epochs']`` epochs, or earlier once the
    validation loss has not improved for ``config['early_stop']`` consecutive
    epochs.

    Args:
        train_loader: Iterable yielding ``(x, y)`` training batches.
        valid_loader: Iterable yielding ``(x, y)`` validation batches.
        model: Module to optimise. It is not moved here, so it must already
            be on ``device``.
        config: Dict providing 'learn_rate', 'n_epochs', 'early_stop' and
            'save_path'.
        device: Device every batch is moved to.
    """
    criterion = torch.nn.MSELoss(reduction='mean')
    optimizer = torch.optim.SGD(model.parameters(), lr=config['learn_rate'], momentum=0.9)

    # Create the directory the checkpoint is actually written to.
    save_dir = os.path.dirname(config['save_path'])
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0
    writer = SummaryWriter()  # Writer of TensorBoard.
    try:
        for epoch in range(n_epochs):
            model.train()  # Set the model to train mode.
            loss_record = []

            # tqdm visualises the training progress.
            train_pbar = tqdm(train_loader, position=0, leave=True)
            for x, y in train_pbar:
                optimizer.zero_grad()  # Reset the gradients.
                x, y = x.to(device), y.to(device)  # Move the batch to the device.
                pred = model(x)
                loss = criterion(pred, y)
                loss.backward()  # Compute gradients (backpropagation).
                optimizer.step()  # Update the parameters.
                step += 1
                batch_loss = loss.item()
                loss_record.append(batch_loss)

                # Show the current epoch number and loss on the progress bar.
                train_pbar.set_description(f'Epoch [{epoch + 1}/{n_epochs}]')
                train_pbar.set_postfix({'loss': batch_loss})

            mean_train_loss = sum(loss_record) / len(loss_record)
            writer.add_scalar('Loss/train', mean_train_loss, step)

            model.eval()  # Set the model to evaluation mode.
            loss_record = []
            with torch.no_grad():
                for x, y in valid_loader:
                    x, y = x.to(device), y.to(device)
                    loss_record.append(criterion(model(x), y).item())

            mean_valid_loss = sum(loss_record) / len(loss_record)
            print(
                f'Epoch [{epoch + 1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')
            writer.add_scalar('Loss/valid', mean_valid_loss, step)

            if mean_valid_loss < best_loss:
                best_loss = mean_valid_loss
                torch.save(model.state_dict(), config['save_path'])  # Save the best model so far.
                print('Saving model with loss {:.3f}...'.format(best_loss))
                early_stop_count = 0
            else:
                early_stop_count += 1

            if early_stop_count >= config['early_stop']:
                print('\nModel is not improving, so we halt the training session.')
                return
    finally:
        # Close the writer on every exit path, including the early-stop return.
        writer.close()
# In[6]: split
def train_valid_split(data_set, valid_ratio, seed):
    """Randomly split the provided training data into a training and a validation set.

    Args:
        data_set: Array-like of samples, one sample per row.
        valid_ratio: Fraction of the samples assigned to the validation set.
        seed: Seed for the torch generator, so the split is reproducible.

    Returns:
        A ``(train_set, valid_set)`` tuple of numpy arrays.
    """
    data = np.asarray(data_set)
    valid_set_size = int(valid_ratio * len(data))
    train_set_size = len(data) - valid_set_size
    generator = torch.Generator().manual_seed(seed)
    train_subset, valid_subset = random_split(
        data, [train_set_size, valid_set_size], generator=generator
    )
    # Index the array once with each subset's indices instead of letting
    # np.array() walk the Subset object element by element.
    return data[train_subset.indices], data[valid_subset.indices]
# In[7]: go
# Seed torch's global RNG so weight initialisation and batch shuffling are
# reproducible too, not only the train/validation split.
same_seed = config['seed']
torch.manual_seed(same_seed)

# Load the CSV files as float32 matrices, skipping the header row.
train_data = np.loadtxt('covid.train.csv', delimiter=',', dtype=np.float32, skiprows=1)
test_data = np.loadtxt('covid.test.csv', delimiter=',', dtype=np.float32, skiprows=1)
train_data, valid_data = train_valid_split(train_data, config['test_ratio'], config['seed'])
print(f"""train_data size: {train_data.shape}
valid_data size: {valid_data.shape}
test_data size: {test_data.shape}""")

x_train, x_valid, x_test, y_train, y_valid = select_feat(train_data, valid_data, test_data, config['select_all'])
train_dataset = CovidDataSet(x_train, y_train)
valid_dataset = CovidDataSet(x_valid, y_valid)
test_dataset = CovidDataSet(x_test)

# Pinned host memory is only requested when batches go to a GPU.
pin_memory = device == 'cuda'
train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True, pin_memory=pin_memory)
# Validation and test data keep their order: shuffling is only needed for training.
valid_loader = DataLoader(valid_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=pin_memory)
test_loader = DataLoader(test_dataset, batch_size=config['batch_size'], shuffle=False, pin_memory=pin_memory)

model = Model(input_dim=x_train.shape[1]).to(device)
trainer(train_loader, valid_loader, model, config, device)

# Predict with the best checkpoint saved by trainer(), not the last epoch's weights.
model.load_state_dict(torch.load(config['save_path'], map_location=device))
model.eval()
batch_preds = []
with torch.no_grad():
    for x in test_loader:
        # Use the selected features and move each batch to the model's device.
        batch_preds.append(model(x.to(device)).cpu())
y_test = torch.cat(batch_preds)

y_data = pd.DataFrame({'anser': y_test.tolist()})
# mode='w' overwrites the file; appending would stack a new header and rows on every rerun.
y_data.to_csv('anser.csv', mode='w', header=True, index=None)
结果存入anser.csv
总结
- 随机数种子可以用来复现训练
- DataLoader可以分批处理数据
- 将数据分为训练集和验证集可以及时发现并防止过拟合,将来的预测会更加准确
- 使用GPU来训练模型可以提高处理速度,有效减少内存的消耗,使用cpu此模型只能训练1000轮。
- 学习率要合适才能更好的收敛,过高会导致参数发散,损失值不下降。过低会导致学习效率缓慢,轮数过多。
- ReLU的学习效果要优于sigmoid
- 神经网络的设计是个玄学问题,层数多了有时候效果也不太好。