李宏毅老师的作业地址:作业地址
李宏毅老师的深度学习视频地址:YouTube链接
这里是李宏毅老师机器学习作业一的一些训练技巧。
作业一描述
学习完基础的深度学习知识,第一步进行regression,简单的对COVID疫情进行人数的预测。
基础的colab代码模板:链接
数据描述:
train.csv:2700行数据,第一列为ID,后面93列data,最后一列为target
test.csv:893行数据,第一列为ID,后面93列data,需要预测target
一、训练技巧
1.1 读取数据
对csv文件读取可以用两种方式,一种csv.reader或者readlines函数
# Read the CSV by hand with readlines(): each element of `content`
# is one raw line of the file as a string.
with open(path, 'r') as f:
    content = f.readlines()
data = []
for line in content:
    # strip() removes the trailing '\n'; split(',') yields a list of fields
    data.append(line.strip().split(','))
# data[1:] drops the header row; [:, 1:] drops the id column
data = np.array(data[1:], dtype=float)[:, 1:]
或者
# Same result via the csv module, which does the line parsing for us.
with open(path, 'r') as f:
    data = list(csv.reader(f))
# data[1:] drops the header row; [:, 1:] drops the id column
data = np.array(data[1:], dtype=float)[:, 1:]
原因:data[1:]是将数据第一行去除,[:,1:]是将第一列id去除
1.2 特征选择
# Feature selection with univariate regression tests (f_regression).
#f_regression
import pandas as pd
import numpy as np
data = pd.read_csv('covid.train.csv')
# Column 0 is the id, columns 1..93 are features, column 94 is the target.
x = data[data.columns[1:94]]
y = data[data.columns[94]]
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
# Min-max normalise each feature to [0, 1] before scoring.
x = (x - x.min()) / (x.max() - x.min())
# NOTE(review): k=5 only affects transform(); fit() scores every column,
# so printing the 15 highest-scoring features below still works.
bestfeatures = SelectKBest(score_func=f_regression, k=5)
fit = bestfeatures.fit(x,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(x.columns)
#concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score'] #naming the dataframe columns
print(featureScores.nlargest(15,'Score')) #print 15 best features
# PCA dimensionality reduction
import numpy
import pandas as pd
from pandas import read_csv
from sklearn.decomposition import PCA
# load data
data = pd.read_csv('covid.train.csv')
x = data[data.columns[1:94]]
# NOTE(review): y is loaded but unused below — PCA is unsupervised.
y = data[data.columns[94]]
# Project the 93 feature columns down to 10 principal components.
pca = PCA(n_components=10)
fit = pca.fit_transform(x)
print(fit)
1、分类的时候用chi2比较多,一般去搜KBest的例子他们用的就是chi2
2、回归的时候用f_regression比较多。K_best.fit_transform(X,Y) 中 X、Y 分别为特征和目标值
1.3 Network定义
1、主要是需要Dropout,防止过拟合,使用BN,加速模型训练
# Small MLP: Linear -> BatchNorm -> Dropout -> LeakyReLU -> Linear.
self.net = nn.Sequential(
nn.Linear(input_dim, 32),
nn.BatchNorm1d(32),# batch normalisation speeds up training
nn.Dropout(p=0.2),# dropout reduces overfitting; note it must not come before BN
nn.LeakyReLU(),# swapped-in activation function
nn.Linear(32, 1)
)
2、正则化
# L1 regularization: add the summed absolute values of all weights to the loss.
def loss_fn_l1(self, pred, target):
    """Return MSE(pred, target) plus an L1 penalty on the model parameters.

    0.00075 is the penalty strength — similar in effect to the optimiser's
    weight_decay, but written out by hand so bias terms could be excluded.
    """
    regularization_loss = 0
    # Fix: iterate self.parameters() rather than the global `model`, so the
    # loss works regardless of what the module instance is named.
    for param in self.parameters():
        regularization_loss += torch.sum(torch.abs(param))
    return self.criterion(pred, target) + 0.00075 * regularization_loss
# L2 regularization: add the summed squares of all weights to the loss.
def loss_fn_l2(self, pred, target):
    """Return MSE(pred, target) plus an L2 penalty on the model parameters.

    0.00075 is the penalty strength, equivalent in spirit to weight_decay.
    """
    regularization_loss = 0
    # Fix: iterate self.parameters() rather than the global `model`.
    for param in self.parameters():
        regularization_loss += torch.sum(param ** 2)
    return self.criterion(pred, target) + 0.00075 * regularization_loss
0.00075是参数值,等效于weight_decay,但是pytorch的weight_decay好像把b参数也正则化了。
1.4 Adam优化器
# getattr(torch.optim, 'Adam') looks the optimiser class up by name —
# equivalent to torch.optim.Adam; default hyper-parameters are used.
opt = getattr(torch.optim, 'Adam')(
model.parameters())
Adam可以自动调整速率,参数使用的是默认参数
1.5 训练过程(加入早停止)
early_stop_cnt = 0   # epochs since the dev loss last improved
min_mse = 1000       # best dev loss observed so far
max_epoch = 10000
for epoch in range(max_epoch):
    # ---- training pass ----
    model.train()
    batch_losses = []
    for x, y in tr_dataloader:
        opt.zero_grad()
        pred = model(x)
        loss = model.loss_fn(pred, y)
        batch_losses.append(loss.item())
        loss.backward()
        opt.step()
    train_loss = np.mean(batch_losses)

    # ---- validation pass (no gradients needed) ----
    model.eval()
    dev_losses = []
    with torch.no_grad():
        for x, y in de_dataloader:
            dev_losses.append(loss_fn(model(x), y).item())
    dev_loss = np.mean(dev_losses)

    # ---- early stopping: keep best checkpoint, stop after 500 stale epochs ----
    if dev_loss < min_mse:
        min_mse = dev_loss
        print(f"epoch: {epoch}, train_loss: {train_loss:.5f}, dev_loss: {dev_loss:.5f}")
        early_stop_cnt = 0
        torch.save(model, path)
    else:
        early_stop_cnt += 1
        if early_stop_cnt > 500:
            break
提早停止其实就是防止训练过度,在规定的轮数之内dev没有变小就停止。
1.6 test过程
# Load the best checkpoint and write one prediction per test row.
model = torch.load(path)
model.eval()
# newline='' stops the csv module emitting a blank line between rows.
with open('pred.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["id","tested_positive"])
    for idx, x in enumerate(te_dataloader):
        with torch.no_grad():
            pred = model(x)
        writer.writerow([str(idx),str(pred.item())])
        #print(idx,pred.item())
# Fix: removed the redundant f.close() — the with-statement already closed the file.
重点是newline='',不然会多一行空行。
二、代码总体
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import numpy as np
import csv
class COVID(Dataset):
    """COVID-19 regression dataset read from a csv file.

    The csv layout is: header row, then one row per sample with an id in
    column 0, feature columns after it and (for the training file) the
    target in the last column.  Rows where index % 15 == 0 form the dev
    split; the rest form the train split.

    Args:
        path: csv file to load.
        mode: 'train', 'dev' or 'test'.
        feats: optional list of feature-column indices (0-based, counted
            after the id column is dropped).  Defaults to the 15 columns
            chosen by the f_regression feature selection above, so existing
            callers are unaffected.
    """

    def __init__(self, path, mode, feats=None) -> None:
        super().__init__()
        self.mode = mode
        with open(path, 'r') as f:
            rows = list(csv.reader(f))
        # rows[1:] drops the header; [:, 1:] drops the id column
        data = np.array(rows[1:], dtype=float)[:, 1:]
        if feats is None:
            feats = [75, 57, 42, 60, 78, 43, 61, 79, 40, 58, 76, 41, 59, 77, 92]
        if self.mode == 'test':
            self.data = torch.FloatTensor(data[:, feats])
        else:
            # every 15th row goes to dev, everything else to train
            if self.mode == 'train':
                idx = [i for i in range(data.shape[0]) if i % 15 != 0]
            else:  # 'dev'
                idx = [i for i in range(data.shape[0]) if i % 15 == 0]
            target = data[:, -1]
            self.data = torch.FloatTensor(data[:, feats][idx])
            self.target = torch.FloatTensor(target[idx])
        # NOTE(review): each split is normalised with its *own* mean/std;
        # strictly, dev/test should reuse the training statistics — confirm.
        self.data = (self.data - self.data.mean(dim=0, keepdim=True)) \
            / self.data.std(dim=0, keepdim=True)
        # number of selected features, used to size the network input
        self.dim = self.data.shape[1]

    def __getitem__(self, index):
        # test mode has no targets; train/dev return (features, target)
        if self.mode == 'test':
            return self.data[index]
        return self.data[index], self.target[index]

    def __len__(self):
        return self.data.shape[0]
class Network(nn.Module):
    """Small regression MLP: Linear -> BatchNorm -> Dropout -> LeakyReLU -> Linear."""

    def __init__(self, input_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, 32),
            nn.BatchNorm1d(32),   # batch normalisation speeds up training
            nn.Dropout(p=0.2),    # dropout reduces overfitting; keep it after BN
            nn.LeakyReLU(),       # swapped-in activation function
            nn.Linear(32, 1),
        )
        self.criterion = nn.MSELoss(reduction='mean')

    def forward(self, x):
        # squeeze(1): (batch, 1) -> (batch,) so the loss matches the target shape
        return self.net(x).squeeze(1)

    def loss_fn(self, pred, target):
        """Return MSE(pred, target) plus a 0.00075-weighted L2 penalty.

        Fix: iterate self.parameters() instead of the module-level `model`,
        so the loss works for any instance regardless of its variable name.
        """
        regularization_loss = 0
        for param in self.parameters():
            regularization_loss += torch.sum(param ** 2)
        return self.criterion(pred, target) + 0.00075 * regularization_loss
# File locations: model checkpoint plus the train/test csv files.
path = 'model/model.pth'
tr_path = 'covid.train.csv'
te_path = 'covid.test.csv'
# train and dev both come from the training csv (split inside COVID);
# the test csv has no target column.
tr_dataset = COVID(tr_path, 'train')
de_dataset = COVID(tr_path, 'dev')
te_dataset = COVID(te_path, 'test')
tr_dataloader = DataLoader(tr_dataset, batch_size= 200, shuffle= True)
de_dataloader = DataLoader(de_dataset, batch_size= 200, shuffle= True)
# batch_size=1 so each test prediction can be read out with pred.item()
te_dataloader = DataLoader(te_dataset, batch_size= 1, shuffle= False)
#model = torch.load(path)
model = Network(tr_dataloader.dataset.dim)#.to('cuda')
# Adam with default hyper-parameters, looked up by name via getattr.
opt = getattr(torch.optim, 'Adam')(
model.parameters())
#opt = torch.optim.Adam(model.parameters(),lr=0.0001,weight_decay=5e-4)
# Plain MSE (no regularisation term) used for the dev-set evaluation.
loss_fn = nn.MSELoss()
# Early-stopping state consumed by the training loop below.
early_stop_cnt = 0
min_mse = 1000
max_epoch = 10000
for epoch in range(max_epoch):
    # ---- training pass ----
    model.train()
    batch_losses = []
    for x, y in tr_dataloader:
        opt.zero_grad()
        pred = model(x)
        loss = model.loss_fn(pred, y)
        batch_losses.append(loss.item())
        loss.backward()
        opt.step()
    train_loss = np.mean(batch_losses)

    # ---- validation pass; plain MSE, no gradients ----
    model.eval()
    dev_losses = []
    with torch.no_grad():
        for x, y in de_dataloader:
            dev_losses.append(loss_fn(model(x), y).item())
    dev_loss = np.mean(dev_losses)

    # ---- early stopping: save on improvement, stop after 500 stale epochs ----
    if dev_loss < min_mse:
        min_mse = dev_loss
        print(f"epoch: {epoch}, train_loss: {train_loss:.5f}, dev_loss: {dev_loss:.5f}")
        early_stop_cnt = 0
        torch.save(model, path)
    else:
        early_stop_cnt += 1
        if early_stop_cnt > 500:
            break
# Notes from training (translated from the author's original remarks):
# 1. without the squeeze(1) in forward(), training did not converge
# 2. without batch normalisation, training did not converge
model = torch.load(path)
model.eval()
# newline='' stops the csv module emitting a blank line between rows.
with open('pred.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(["id","tested_positive"])
    for idx, x in enumerate(te_dataloader):
        with torch.no_grad():
            pred = model(x)
        writer.writerow([str(idx),str(pred.item())])
        #print(idx,pred.item())
# Fix: removed the redundant f.close() — the with-statement already closed the file.
总结
对简单的网络训练技巧总结:
1、防止模型太大,简单的线性即可,主要dropout、正则化防止过拟合
2、加入早停止
3、对数据的进行特征选择