李宏毅机器学习2022
用神经网络做回归预测新冠病例数
参考链接
# Numerical Operations
import math
import numpy as np
# Reading/Writing Data
import pandas as pd
import os
import csv
# For Progress Bar
from tqdm import tqdm
# Pytorch
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
# For plotting learning curve
from torch.utils.tensorboard import SummaryWriter
device = "cuda" if torch.cuda.is_available() else "cpu"
数据集
covid.test.csv 没有y(covid.train.csv 的最后一列是y)
random_split划分训练集和验证集
对标注数据做x,y
covid19dataset(继承并重写 Dataset)封装
DataLoader迭代
# Load both CSV files as raw NumPy arrays (``.values`` drops the DataFrame wrapper).
import pandas as pd
# Training table; its last column is later used as the regression target.
train=pd.read_csv("covid.train.csv").values
# Notebook-style inspection: a bare expression, it only displays in a notebook cell.
train.shape
# Test table; every column is later treated as a feature (no target column is split off).
test_data=pd.read_csv("covid.test.csv").values
# Notebook-style inspection of the test array's shape.
test_data.shape
def train_valid_split(data,valid_ratio):
    """Randomly split ``data`` into a training part and a validation part.

    The split is reproducible: the permutation is drawn from a generator
    seeded with 0 on every call.

    Args:
        data: Indexable collection of samples (rows), e.g. a 2-D NumPy array.
        valid_ratio: Fraction of rows assigned to the validation part. The
            validation size is ``int(valid_ratio * len(data))`` (rounded down).

    Returns:
        A ``(train_data, valid_data)`` tuple of NumPy arrays.
    """
    valid_len = int(valid_ratio * len(data))
    train_len = len(data) - valid_len
    train_subset, valid_subset = random_split(
        data,
        [train_len, valid_len],
        generator=torch.Generator().manual_seed(0),
    )
    # Index the array once with each subset's indices instead of relying on
    # np.array(Subset), which walks the subset element by element.
    rows = np.asarray(data)
    train_idx = np.asarray(train_subset.indices, dtype=np.intp)
    valid_idx = np.asarray(valid_subset.indices, dtype=np.intp)
    return rows[train_idx], rows[valid_idx]
def select_feat(train_data,valid_data,test_data,select_all=True):
    """Split targets from features and pick the feature columns to use.

    Args:
        train_data: 2-D array whose last column is the target.
        valid_data: 2-D array whose last column is the target.
        test_data: 2-D array of features only (no target column is removed).
        select_all: When truthy, keep every feature column; otherwise keep
            only the first five columns (indices 0-4).

    Returns:
        ``(x_train, x_valid, x_test, y_train, y_valid)``.
    """
    y_train, y_valid = train_data[:, -1], valid_data[:, -1]
    raw_train_data = train_data[:, :-1]
    raw_valid_data = valid_data[:, :-1]
    raw_test_data = test_data[:, :]

    if select_all:
        fea_idx = list(range(raw_train_data.shape[1]))
    else:
        fea_idx = [0, 1, 2, 3, 4]

    return (
        raw_train_data[:, fea_idx],
        raw_valid_data[:, fea_idx],
        raw_test_data[:, fea_idx],
        y_train,
        y_valid,
    )
# Hold out 20% of the rows of `train` for validation.
train_data,valid_data=train_valid_split(train,0.2)
## NOTE: train_data.shape ended up showing no value because the dataset passed in goes DataFrame --> Subset --> ndarray
# Separate features from targets; with the default select_all=True every feature column is kept.
raw_train_data,raw_valid_data,raw_test_data,y_train,y_valid=select_feat(train_data,valid_data,test_data)
# class self
class covid19dataset(Dataset):
    """Dataset of feature rows with optional regression targets.

    Args:
        x: Feature rows; converted to a ``torch.FloatTensor``.
        y: Optional targets aligned with ``x``; converted to a
            ``torch.FloatTensor``. Omit (``None``) for unlabeled data.
    """

    def __init__(self,x,y=None):
        self.x = torch.FloatTensor(x)
        # Always define self.y so __getitem__ can test it; previously the
        # attribute was missing for unlabeled data and indexing raised
        # AttributeError.
        self.y = None if y is None else torch.FloatTensor(y)

    def __getitem__(self,idx):
        """Return ``(x, y)`` for labeled data, or just ``x`` when unlabeled."""
        if self.y is None:
            return self.x[idx]
        return self.x[idx], self.y[idx]

    def __len__(self):
        """Return the number of samples."""
        return len(self.x)
# Wrap the arrays in Dataset objects. Note: this rebinds train_data / valid_data /
# test_data from NumPy arrays to covid19dataset instances.
train_data=covid19dataset(raw_train_data,y_train)# iterating train_data yields (x, y) pairs to unpack
valid_data=covid19dataset(raw_valid_data,y_valid)
# No targets are passed for the test set.
test_data=covid19dataset(raw_test_data)
# Batch the datasets; training and validation batches are shuffled.
train_loader=DataLoader(train_data,batch_size=256,shuffle=True)
valid_loader=DataLoader(valid_data,batch_size=256,shuffle=True)
# Test batches keep their original order (shuffle=False).
test_loader=DataLoader(test_data,batch_size=256,shuffle=False)
## 定义模型
class MyModel(nn.Module):
    """Small fully connected regressor: input_dim -> 16 -> 8 -> 1.

    Every linear layer, including the last one, is followed by a ReLU, so
    the predictions are never negative.
    """

    def __init__(self,input_dim):
        super().__init__()
        widths = (input_dim, 16, 8, 1)
        stack = []
        # Build Linear+ReLU pairs in order so the Sequential indices (and
        # therefore the state_dict keys) match the hand-written layout.
        for n_in, n_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(n_in, n_out))
            stack.append(nn.ReLU())
        self.layers = nn.Sequential(*stack)

    def forward(self,x):
        """Map a (batch, input_dim) tensor to a (batch,) tensor of predictions."""
        # Drop the trailing size-1 feature dimension produced by the last layer.
        return self.layers(x).squeeze(1)
训练模型
模型
决策:均方损失函数 dim=1
算法:SGD 梯度优化(代码中使用 torch.optim.SGD),在 valid 上做模型选择
GPU:
model.to(device)
x.to(device), y.to(device)
输入到模型的变量:FloatTensor()
与模型相关的变量取值要detach()
取值item()
# Notebook-style inspection: number of training rows (bare expression, only displays in a notebook cell).
len(raw_train_data)
# Notebook-style inspection: number of feature columns, used below as the model's input size.
raw_train_data.shape[1]
# Build the network with one input unit per feature column.
model=MyModel(raw_train_data.shape[1])
# Move the parameters to the selected device ("cuda" when available, else "cpu").
model.to(device)
def trainer(train_loader,valid_loader,model,epochs=3000,lr=1e-5,patience=20,save_path='./models/model.ckpt'):
    """Train ``model`` with SGD on MSE loss, checkpointing the best validation epoch.

    After every epoch the mean validation loss is compared with the best
    value seen so far. An improvement saves ``model.state_dict()`` to
    ``save_path`` and resets the early-stopping counter; otherwise the
    counter is incremented, and training stops once it reaches ``patience``.

    Args:
        train_loader: Iterable of ``(x, y)`` training batches.
        valid_loader: Iterable of ``(x, y)`` validation batches.
        model: Module to optimise; expected to already live on ``device``.
        epochs: Maximum number of epochs.
        lr: SGD learning rate.
        patience: Number of consecutive non-improving epochs tolerated
            before training is halted.
        save_path: File the best ``state_dict`` is written to.

    Returns:
        None.
    """
    criteron = torch.nn.MSELoss(reduction='mean')
    optim = torch.optim.SGD(model.parameters(), lr=lr)

    # Create the directory the checkpoint is actually written to. The old
    # code created '.\\model' while saving into './models/'.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    # These must live outside the epoch loop: resetting best_loss each epoch
    # made every epoch look like an improvement and disabled early stopping.
    best_loss = math.inf
    early_stop_count = 0
    step = 0

    writer = SummaryWriter()  # Writer of tensorboard.
    try:
        for epoch in range(epochs):
            model.train()
            loss_record = []
            train_pbar = tqdm(train_loader, position=0)
            for x, y in train_pbar:
                x = x.to(device)
                y = y.to(device)
                optim.zero_grad()
                preds = model(x)
                # preds and y are both 1-D here (the model squeezes dim 1).
                loss = criteron(preds, y)
                loss.backward()
                optim.step()
                step += 1

                batch_loss = loss.item()
                loss_record.append(batch_loss)
                train_pbar.set_description(f'Epoch [{epoch+1}/{epochs}]')
                train_pbar.set_postfix({'loss': batch_loss})
            mean_train_loss = sum(loss_record) / len(loss_record)
            writer.add_scalar('losstrain', mean_train_loss, step)

            model.eval()
            loss_record = []
            with torch.no_grad():
                for x, y in valid_loader:
                    x = x.to(device)
                    y = y.to(device)
                    preds = model(x)
                    loss = criteron(preds, y)
                    loss_record.append(loss.item())
            mean_valid_loss = sum(loss_record) / len(loss_record)
            # Logged against the same global step as the training loss so
            # both curves share one x-axis.
            writer.add_scalar('lossvalid', mean_valid_loss, step)
            print(f'Epoch [{epoch + 1}/{epochs}]: Train loss: {mean_train_loss:.4f}, Valid loss: {mean_valid_loss:.4f}')

            if mean_valid_loss < best_loss:
                best_loss = mean_valid_loss
                torch.save(model.state_dict(), save_path)
                print(f"new model{best_loss}")
                early_stop_count = 0
            else:
                early_stop_count += 1

            if early_stop_count >= patience:
                print('\nModel is not improving, so we halt the training session.')
                return
    finally:
        # Flush and release the event file even on early return or error.
        writer.close()
# Run training with the default epoch budget (epochs=3000).
trainer(train_loader,valid_loader,model)
可视化
# Notebook-only cell: get_ipython() is not defined in a plain Python script.
# Reload the TensorBoard notebook extension.
get_ipython().run_line_magic('reload_ext', 'tensorboard')
# Launch TensorBoard on ./runs/ (presumably where SummaryWriter() wrote its logs; verify).
get_ipython().run_line_magic('tensorboard', '--logdir=./runs/')
预测
def predict(x,model):
    """Run ``model`` over every batch in ``x`` and return all predictions.

    Args:
        x: Iterable of input batches (for example the test ``DataLoader``);
            each item must be a tensor the model accepts.
        model: Trained ``torch.nn.Module``. It is switched to eval mode.

    Returns:
        A CPU tensor with the per-batch outputs concatenated along dim 0,
        or an empty tensor when ``x`` yields no batches.
    """
    model.eval()
    # Run on whatever device the model's parameters live on, so inputs and
    # weights always match.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    y_record = []
    with torch.no_grad():
        for batch in x:
            # Tensor.to() returns a new tensor; the result must be kept.
            batch = batch.to(target_device)
            y_record.append(model(batch).detach().cpu())

    if not y_record:
        return torch.empty(0)
    return torch.cat(y_record, dim=0)