ML2021Spring HW-1

李宏毅老师的作业地址:作业地址
李宏毅老师的深度学习视频地址:YouTube链接
这里是李宏毅老师机器学习作业一的一些训练技巧。



作业一描述

学习完基础的深度学习知识,第一步进行regression,简单的对COVID疫情进行人数的预测。
基础的colab代码模板:链接

数据描述:
train.csv:2700行数据,第一列为ID,后面93列data,最后一列为target
test.csv:893行数据,第一列为ID,后面93列data,需要预测target


一、训练技巧

1.1 读取数据

对csv文件的读取可以用两种方式:一种是csv.reader,另一种是readlines函数

# Manual csv reading with readlines().
# (fix: the pasted snippet had broken indentation and would not parse)
with open(path, 'r') as f:
    content = f.readlines()  # list of str, one per line, trailing '\n' kept
data = []
for line in content:
    # strip() removes the trailing newline; split(',') yields the fields as a list
    data.append(line.strip().split(','))
# Drop the header row ([1:]) and the id column ([:, 1:]); cast everything to float.
data = np.array(data[1:], dtype=float)[:, 1:]

或者

# csv.reader-based reading.
# (fix: the pasted snippet had broken indentation and would not parse)
with open(path, 'r') as f:
    reader = csv.reader(f)
    data = list(reader)
# Drop the header row and the id column, cast the remaining fields to float.
data = np.array(data[1:], dtype=float)[:, 1:]

原因:data[1:]是将数据第一行去除,[:,1:]是将第一列id去除

1.2 特征选择

参考链接

# Univariate feature scoring with f_regression (SelectKBest).
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

frame = pd.read_csv('covid.train.csv')
x = frame[frame.columns[1:94]]   # 93 feature columns (id column skipped)
y = frame[frame.columns[94]]     # last column is the regression target

# min-max scale every feature into [0, 1] before scoring
x = (x - x.min()) / (x.max() - x.min())

selector = SelectKBest(score_func=f_regression, k=5)
fitted = selector.fit(x, y)
score_frame = pd.DataFrame(fitted.scores_)
name_frame = pd.DataFrame(x.columns)
# put names and scores side by side for readability
featureScores = pd.concat([name_frame, score_frame], axis=1)
featureScores.columns = ['Specs','Score']  # naming the dataframe columns
print(featureScores.nlargest(15,'Score'))  # print 15 best features
# PCA dimensionality reduction: keep the 10 directions of largest
# variance instead of hand-picking feature columns.
import numpy
import pandas as pd
from pandas import read_csv
from sklearn.decomposition import PCA

# load data
data = pd.read_csv('covid.train.csv')
x = data[data.columns[1:94]]  # 93 feature columns (id column skipped)
y = data[data.columns[94]]    # target column (unused: PCA is unsupervised)

pca = PCA(n_components=10)    # project the 93 features down to 10 components
fit = pca.fit_transform(x)    # rows expressed in the 10-dim component space

print(fit)

1、分类的时候用chi2比较多,一般去搜KBest的例子,他们用的就是chi2
2、回归的时候用f_regression比较多。KBest.fit_transform(X, Y) 中的 X、Y 分别为特征值和目标值

1.3 Network定义

1、主要是需要Dropout,防止过拟合,使用BN,加速模型训练

self.net = nn.Sequential(
            nn.Linear(input_dim, 32),
            nn.BatchNorm1d(32),#BatchNorm: speeds up and stabilises training
            nn.Dropout(p=0.2),#Dropout against overfitting; author notes it must not come before BN
            nn.LeakyReLU(),#LeakyReLU instead of plain ReLU
            nn.Linear(32, 1)
        )

2、正则化

#l1正则化
    def loss_fn_l1(self, pred, target):
        """MSE loss plus an L1 penalty on this module's own parameters.

        Fix: iterate self.parameters() instead of the module-level global
        `model`, so the loss is correct for any instance.
        """
        regularization_loss = 0
        for param in self.parameters():
            # L1 term: sum of absolute values of every weight and bias
            regularization_loss += torch.sum(torch.abs(param))
        # 0.00075 plays the role of a weight-decay coefficient
        return self.criterion(pred, target) + 0.00075 * regularization_loss

    #l2正则化
    def loss_fn_l2(self, pred, target):
        """MSE loss plus an L2 penalty on this module's own parameters.

        Fix: iterate self.parameters() instead of the module-level global
        `model`, so the loss is correct for any instance.
        """
        regularization_loss = 0
        for param in self.parameters():
            # L2 term: sum of squared weights and biases
            regularization_loss += torch.sum(param ** 2)
        # 0.00075 plays the role of a weight-decay coefficient
        return self.criterion(pred, target) + 0.00075 * regularization_loss

0.00075是正则化系数,等效于weight_decay,但是pytorch的weight_decay好像把b参数(偏置)也正则化了。

1.4 Adam优化器

# Adam with all-default hyper-parameters (adaptive per-parameter step sizes).
opt = torch.optim.Adam(model.parameters())

Adam可以自动调整速率,参数使用的是默认参数

1.5 训练过程(加入早停止)

# Training with early stopping: keep the checkpoint with the lowest dev MSE
# and give up after 500 consecutive epochs without improvement.
early_stop_cnt = 0
min_mse = 1000
max_epoch = 10000
for epoch in range(max_epoch):
    # --- one pass over the training split ---
    model.train()
    batch_losses = []
    for x, y in tr_dataloader:
        opt.zero_grad()
        loss = model.loss_fn(model(x), y)  # training loss includes the penalty term
        batch_losses.append(loss.item())
        loss.backward()
        opt.step()
    train_loss = np.mean(batch_losses)

    # --- evaluate on the dev split (plain MSE, no gradients) ---
    model.eval()
    dev_batch_losses = []
    with torch.no_grad():
        for x, y in de_dataloader:
            dev_batch_losses.append(loss_fn(model(x), y).item())
    dev_loss = np.mean(dev_batch_losses)

    if dev_loss < min_mse:
        # new best dev loss: report it, reset patience, checkpoint the model
        min_mse = dev_loss
        print(f"epoch: {epoch},    train_loss: {train_loss:.5f},    dev_loss: {dev_loss:.5f}")
        early_stop_cnt = 0
        torch.save(model, path)
    else:
        early_stop_cnt += 1

    # stop once dev has not improved for more than 500 epochs
    if early_stop_cnt > 500:
        break

提早停止其实就是防止训练过度,在规定的轮数之内dev没有变小就停止。

1.6 test过程

# Inference: load the best checkpoint and write one prediction per test row.
model = torch.load(path)
model.eval()
# newline='' stops csv.writer from emitting a blank line between rows on Windows
with open('pred.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["id","tested_positive"])
    for idx, x in enumerate(te_dataloader):  # batch_size is 1, so idx doubles as the row id
        with torch.no_grad():
            pred = model(x)
            writer.writerow([str(idx),str(pred.item())])
# fix: removed the redundant f.close() -- the `with` block already closes the file

重点是newline='',不然每行之间会多一行空行。

二、代码总体

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import numpy as np
import csv

class COVID(Dataset):
    """Dataset for the COVID-19 regression homework.

    The csv layout is: a header row, then one row per sample with
    [id, 93 feature columns, target].  Only 15 hand-picked feature
    columns (chosen offline with SelectKBest/f_regression) are kept.

    mode:
        'train' -- every row whose index % 15 != 0, returns (features, target)
        'dev'   -- every row whose index % 15 == 0, returns (features, target)
        'test'  -- all rows, returns features only
    """

    def __init__(self, path, mode) -> None:
        super().__init__()
        self.mode = mode

        # Read the whole csv, drop the header row ([1:]) and id column ([:, 1:]).
        with open(path, 'r') as f:
            reader = csv.reader(f)
            data = list(reader)
        data = np.array(data[1:], dtype=float)[:, 1:]

        # Feature column indices selected offline with f_regression.
        feats = [75, 57, 42, 60, 78, 43, 61, 79, 40, 58, 76, 41, 59, 77, 92]

        if self.mode == 'train':
            # 14 of every 15 rows go to training
            idx = [i for i in range(data.shape[0]) if i % 15 != 0]
            self.data = torch.FloatTensor(data[idx][:, feats])
            self.target = torch.FloatTensor(data[idx][:, -1])
        elif self.mode == 'dev':
            # every 15th row is held out for validation
            idx = [i for i in range(data.shape[0]) if i % 15 == 0]
            self.data = torch.FloatTensor(data[idx][:, feats])
            self.target = torch.FloatTensor(data[idx][:, -1])
        elif self.mode == 'test':
            self.data = torch.FloatTensor(data[:, feats])
        else:
            # fix: previously an unknown mode crashed later with AttributeError
            raise ValueError(f"unknown mode: {mode!r}")

        # Z-normalise each kept column.
        # NOTE(review): each split is normalised with its own mean/std; dev/test
        # should arguably reuse the training statistics -- confirm intent.
        self.data = (self.data - self.data.mean(dim=0, keepdim=True)) \
            / self.data.std(dim=0, keepdim=True)

        self.dim = self.data.shape[1]

    def __getitem__(self, index):
        # train/dev yield (features, target); test yields features only
        if self.mode == 'test':
            return self.data[index]
        return self.data[index], self.target[index]

    def __len__(self):
        return self.data.shape[0]

class Network(nn.Module):
    """Small MLP regressor (input_dim -> 32 -> 1) with an L2-regularised loss."""

    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 32),
            nn.BatchNorm1d(32),  # BatchNorm: speeds up and stabilises training
            nn.Dropout(p=0.2),   # Dropout against overfitting (kept after BN on purpose)
            nn.LeakyReLU(),      # LeakyReLU instead of plain ReLU
            nn.Linear(32, 1)
        )
        self.criterion = nn.MSELoss(reduction='mean')

    def forward(self, x):
        # squeeze the trailing size-1 dim so the output matches the target shape
        return self.net(x).squeeze(1)

    def loss_fn(self, pred, target):
        """MSE plus a manual L2 penalty over this module's parameters.

        Fix: iterate self.parameters() instead of the module-level global
        `model`, so the loss is correct for any Network instance.
        """
        regularization_loss = 0
        for param in self.parameters():
            regularization_loss += torch.sum(param ** 2)
        # 0.00075 ~ weight_decay, applied manually (biases included here too)
        return self.criterion(pred, target) + 0.00075 * regularization_loss

# ---- configuration: checkpoint path and csv locations ----
path = 'model/model.pth'
tr_path = 'covid.train.csv'
te_path = 'covid.test.csv'

# train and dev both come from the training csv (split inside COVID by row index)
tr_dataset = COVID(tr_path, 'train')
de_dataset = COVID(tr_path, 'dev')
te_dataset = COVID(te_path, 'test')
tr_dataloader = DataLoader(tr_dataset, batch_size= 200, shuffle= True)
de_dataloader = DataLoader(de_dataset, batch_size= 200, shuffle= True)
te_dataloader = DataLoader(te_dataset, batch_size= 1, shuffle= False)  # one row per prediction

#model = torch.load(path)
model = Network(tr_dataloader.dataset.dim)#.to('cuda')
# Adam with default hyper-parameters (adaptive per-parameter step sizes)
opt = getattr(torch.optim, 'Adam')(
    model.parameters())
#opt = torch.optim.Adam(model.parameters(),lr=0.0001,weight_decay=5e-4)
loss_fn = nn.MSELoss()  # plain MSE for dev evaluation (no regularisation term)


# ---- training with early stopping ----
early_stop_cnt = 0   # epochs since the last dev improvement
min_mse = 1000       # best dev loss seen so far
max_epoch = 10000
for epoch in range(max_epoch):
    model.train()
    train_loss = []
    for x ,y in tr_dataloader:
        opt.zero_grad()

        pred = model(x)
        loss = model.loss_fn(pred, y)  # training loss includes the L2 penalty
        train_loss.append(loss.item())

        loss.backward()
        opt.step()
    train_loss = np.mean(train_loss)
    

    model.eval()
    dev_loss = []
    for x,y in de_dataloader:
        with torch.no_grad():  # no gradients needed for evaluation
            pred = model(x)
            loss = loss_fn(pred, y)
            dev_loss.append(loss.item())
    dev_loss = np.mean(dev_loss)

    if dev_loss < min_mse:
        # new best dev loss: log it, reset patience, checkpoint the model
        min_mse = dev_loss
        print(f"epoch: {epoch},    train_loss: {train_loss:.5f},    dev_loss: {dev_loss:.5f}")
        early_stop_cnt = 0
        torch.save(model, path)
    else:
        early_stop_cnt += 1

    # stop after more than 500 epochs without any dev improvement
    if early_stop_cnt > 500:
        break




# (author's note below, in Chinese: 1. the model would not train without the
#  squeeze in forward(); 2. it also would not train without batch normalisation)
'''
训练时:
1、squeeze不用训练不动
2、不batch normailzation训练不懂
'''
# ---- inference: load the best checkpoint and write predictions ----
model = torch.load(path)
model.eval()
with open('pred.csv', 'w', newline='') as f:  # newline='' avoids blank lines from csv.writer
    writer = csv.writer(f)
    writer.writerow(["id","tested_positive"])
    for idx, x in enumerate(te_dataloader):  # batch_size 1 -> idx doubles as the row id
        with torch.no_grad():
            pred = model(x)
            writer.writerow([str(idx),str(pred.item())])

            #print(idx,pred.item())
    f.close()   # NOTE(review): redundant -- the `with` block already closes the file

总结

对简单的网络训练技巧总结:
1、防止模型太大,简单的线性即可,主要dropout、正则化防止过拟合
2、加入早停止
3、对数据进行特征选择

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值