比赛地址:Digit Recognizer | Kaggle
数据集下载地址:链接: https://pan.baidu.com/s/1sOhknWv0bP-loqczapih4w 提取码: urad
数据集KaggleAPI:kaggle competitions download -c digit-recognizer
数据集中包含三个文件:
1.train.csv
文件包含42000行和785列,其中第一列为标签列,其余784列为像素值,实际为28*28的图像。
2.test.csv
文件包含28000行和784列,相较于train.csv少了第一列标签列,其余784列同样为28*28的像素。
3.sample_submission.csv
本文件提供一个参考提交格式, 第二列标签列值全为0。提交时输出的结果应和本文件格式一致,第一列为ImageId,第二列为预测的Label。
了解了比赛的数据集后,接下来开始对数据进行处理
首先导入需要使用的模块
import torch
import torch.nn as nn
from torch.nn import functional as F
from resnest.torch import resnest50
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torchvision import transforms
from torch.optim.lr_scheduler import CosineAnnealingLR
from sklearn.model_selection import KFold
import cv2
import os
import matplotlib.pyplot as plt
import torchvision.models as models
from tqdm import tqdm
接下来正式开始数据分析与处理步骤
1.数据集的读取与格式处理
# 1. Read the raw CSV files and reshape the flat pixel rows into images.
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')
# Pop the label column off the training frame so the remaining 784
# columns are pure pixel values; keep the labels for supervision.
labels = np.array(train_df.pop('label'))
# Each row of 784 pixels becomes a 1x28x28 single-channel image.
# Use -1 for the batch dimension instead of hard-coding 42000/28000 so
# the same code works on any subset of the data.
train_imgs = np.array(train_df).reshape((-1, 1, 28, 28))
test_imgs = np.array(test_df).reshape((-1, 1, 28, 28))
2.重写Dataset
#读取训练数据
class TrainData(Dataset):
    """Dataset pairing training images with their integer labels."""

    def __init__(self, img, label):
        self.img = img
        self.label = label
        # Cache the length once; one label per image.
        self.len = len(label)

    def __getitem__(self, index):
        # Return the (image, label) pair at the requested position.
        return self.img[index], self.label[index]

    def __len__(self):
        return self.len

train_data = TrainData(train_imgs, labels)
#读取测试数据
class TestData(Dataset):
    """Dataset over unlabeled test images (no target column)."""

    def __init__(self, img):
        self.img = img
        self.len = len(img)

    def __getitem__(self, index):
        # Only an image is returned — the test set has no labels.
        return self.img[index]

    def __len__(self):
        return self.len

test_data = TestData(test_imgs)
3.设置训练模型
from resnest.torch import resnest50
def set_parameter_requires_grad(model, feature_extracting):
    """Optionally freeze all parameters of ``model``.

    When ``feature_extracting`` is True the model is used as a fixed
    feature extractor, so gradients are disabled for every parameter.
    When False the model is left fully trainable (fine-tuning).
    """
    # The original code contained a no-op ``model = model`` line here;
    # it has been removed.
    if feature_extracting:
        for param in model.parameters():
            param.requires_grad = False
def resnest_model(num_classes, feature_extract = False):
    """Build a ResNeSt-50 classifier adapted to 1-channel 28x28 input.

    num_classes: size of the final classification layer.
    feature_extract: if True, freeze the pretrained backbone so only
        the newly created layers (stem conv and fc head) are trained.
    """
    model_ft = resnest50(pretrained=True)
    # Freeze the pretrained weights FIRST. The original code replaced
    # conv1[0] before freezing, which froze the brand-new 1-channel stem
    # conv as well — it would never have trained with feature_extract=True.
    set_parameter_requires_grad(model_ft, feature_extract)
    # ResNeSt's deep stem expects 3 input channels, but MNIST images are
    # single channel, so swap in a fresh 1-channel first conv.
    model_ft.conv1[0] = nn.Conv2d(1, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    num_ftrs = model_ft.fc.in_features
    model_ft.fc = nn.Sequential(nn.Linear(num_ftrs, num_classes))
    return model_ft
4.设置超参数
# 4. Hyper-parameters and reproducibility settings.
k_folds = 5
num_epochs = 30
learning_rate = 1e-4
weight_decay = 1e-3
train_loss_function = nn.CrossEntropyLoss()
valid_loss_function = nn.CrossEntropyLoss()
results = {}  # fold index -> final validation accuracy
# Seed torch so weight init and shuffling are reproducible.
torch.manual_seed(42)
# KFold must also be given a fixed random_state: with shuffle=True and no
# seed, the fold assignment changes on every run even though torch itself
# is seeded, defeating the reproducibility intended above.
kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42)
5.开始训练
# 5. K-fold training: train one model per fold, checkpoint its weights,
#    and record the fold's validation accuracy for the final summary.
# `device` was never defined in the original script (NameError); define it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
for fold, (train_ids, valid_ids) in enumerate(kfold.split(train_data)):
    save_path = f'./model-fold-{fold}.pth'
    # Samplers restrict each DataLoader to the indices of this fold.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    trainloader = torch.utils.data.DataLoader(train_data, batch_size=32, sampler=train_subsampler, num_workers=0)
    valid_subsampler = torch.utils.data.SubsetRandomSampler(valid_ids)
    validloader = torch.utils.data.DataLoader(train_data, batch_size=32, sampler=valid_subsampler, num_workers=0)
    model = resnest_model(10)
    model = model.to(device)
    model.device = device
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler = CosineAnnealingLR(optimizer, T_max=10)
    for epoch in range(num_epochs):
        model.train()
        print(f'Starting epoch {epoch+1}')
        train_losses = []
        for batch in tqdm(trainloader):
            # Name the batch targets `targets` — the original used `labels`,
            # shadowing the module-level labels array.
            imgs, targets = batch
            imgs = imgs.to(device, dtype=torch.float32)
            targets = targets.to(device)
            logits = model(imgs)
            # CrossEntropyLoss requires integer class indices.
            loss = train_loss_function(logits, targets.long())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_losses.append(loss.item())
        print("第%d个epoch的学习率:%f" % (epoch + 1, optimizer.param_groups[0]['lr']))
        # Step the cosine schedule once per epoch.
        scheduler.step()
        train_loss = np.mean(train_losses)
        print(f"[ Train | {epoch + 1:03d}/{num_epochs:03d} ] loss = {train_loss:.5f}")
        # Checkpoint after every epoch so the latest weights are on disk.
        torch.save(model.state_dict(), save_path)

        # ---- Validation for this epoch ----
        model.eval()
        valid_losses = []
        valid_accs = []
        with torch.no_grad():
            for batch in tqdm(validloader):
                imgs, targets = batch
                logits = model(imgs.to(device, dtype=torch.float32))
                targets = targets.to(device)
                loss = valid_loss_function(logits, targets.long())
                acc = (logits.argmax(dim=-1) == targets).float().mean()
                valid_losses.append(loss.item())
                valid_accs.append(acc.item())
        valid_loss = np.mean(valid_losses)
        valid_acc = np.mean(valid_accs)
        print(f"[ Valid | {epoch + 1:03d}/{num_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f}")
    # %f, not %d: accuracy is a float in [0, 1]; the original %d always
    # printed 0.
    print('Accuracy for fold %d: %f' % (fold, valid_acc))
    print('--------------------------------------')
    results[fold] = valid_acc
# Print fold results
# Summarize cross-validation accuracy across all trained folds.
print(f'K-FOLD CROSS VALIDATION RESULTS FOR {k_folds} FOLDS')
print('--------------------------------')
for key, value in results.items():
    print(f'Fold {key}: {value} ')
# Mean accuracy over every fold recorded in `results`.
print(f'Average: {sum(results.values()) / len(results)} ')
5次交叉验证训练后所得结果:
可见训练精度大致在99.3%左右
6.进行测试集的测试
# 6. Inference: run each fold's saved model over the test set and write
#    one submission file per fold (merged by majority vote later).
# `device` was never defined in the original script (NameError); define it.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
testloader = torch.utils.data.DataLoader(
    test_data,
    batch_size=32, num_workers=0)
model = resnest_model(10)
model = model.to(device)
for test_fold in range(k_folds):
    model_path = f'./model-fold-{test_fold}.pth'
    saveFileName = f'./submission-fold-{test_fold}.csv'
    model.load_state_dict(torch.load(model_path))
    model.eval()
    # Collected class predictions for the whole test set, in order.
    predictions = []
    for batch in tqdm(testloader):
        imgs = batch
        with torch.no_grad():
            logits = model(imgs.to(device, dtype=torch.float32))
        # Greatest logit wins as the predicted class.
        predictions.extend(logits.argmax(dim=-1).cpu().numpy().tolist())
    # Use a distinct name for the submission frame: the original rebound
    # `test_data` here, clobbering the Dataset created earlier in the file.
    # The redundant predictions -> preds copy loop is also gone.
    submission_df = pd.read_csv('./sample_submission.csv')
    submission_df['Label'] = pd.Series(predictions)
    submission = pd.concat([submission_df['ImageId'], submission_df['Label']], axis=1)
    submission.to_csv(saveFileName, index=False)
    print("Finish current work!")
7.进行五次交叉验证所得结果的融合工作
# 7. Collect the per-fold predictions side by side for majority voting.
df_all = pd.read_csv('./sample_submission.csv')
# Keep only the ImageId column as the skeleton for the final submission.
df_id = df_all.drop(['Label'], axis=1)
# Empty frame sharing df_id's index; it gains one column per fold below.
# (The original did a df_id.copy() here that was immediately overwritten.)
df_s = df_id.drop(['ImageId'], axis=1)
for i in range(0, k_folds):
    df = pd.read_csv(f'./submission-fold-{i}.csv')
    df_s[i] = df['Label']
df_s.head()
所得的表格:
对每一行的结果求众数:
# Majority vote: the row-wise mode picks the label most folds agree on;
# ties are broken by taking the first (smallest) mode column.
result = df_s.mode(axis=1).iloc[:, 0]
df_s['result'] = result.astype('uint8')
df_id['Label'] = df_s['result']
df_id.head()
所得结果:
8.结果保存
# 8. Write the ensembled predictions to disk. DataFrame.to_csv returns
# None when given a path, so the original `submission = ...` assignment
# was meaningless and has been dropped.
df_id.to_csv('./result.csv', index=False)
print("Finish all the works!")
至此,我们完成了对于MNIST数据集的全部工作,此时将result.csv文件提交至kaggle竞赛即可完成本次竞赛。
9.竞赛结果
可以看到,在使用ResNest进行迁移学习后,我们取得了0.99546的预测精度。
10.后续精度提高
对于本次竞赛,可以考虑以下几种措施进一步提升预测精度:
(1):使用多个神经网络进行预测,取多个模型预测的众数
(2):更换超参数,如增大或减小初始学习率,Adam优化器参数调整
(3):使用不同的resnest随机种子,选取预测精度高的种子。