肺炎疫情攻防战 (Pneumonia Outbreak Battle) – Pneumonia X-ray Lesion Recognition: A PyTorch Baseline
I switched from Keras to PyTorch not long ago, and put this competition baseline together while reading the docs and Googling.
Competition link
Competition overview
The pneumonia outbreak caused by the 2019-nCoV virus is still ongoing. AI研习社 has relaunched the pneumonia challenge, hoping everyone will stay in as much as possible, compete from home for the prize money, wash their hands often, and wear masks. The whole nation stands united against the epidemic.
Dataset: 20,013 training images and 6,671 test images, each 1024×1024×3.
Task: train a model to correctly identify the number of pneumonia lesions in each X-ray.
Environment
The lab lost power, so I rented a 2080 Ti on Taobao at 60 RMB a day. T_T
JupyterLab + Python 3.6 + PyTorch 1.0.1
Inspecting the dataset
# Inspect the label CSV
import pandas as pd
csv = pd.read_csv("train.csv", header=None)
print(csv)
0 1
0 0 0
1 1 0
2 2 0
3 3 0
4 4 0
... ... ..
20008 20008 0
20009 20009 0
20010 20010 0
20011 20011 1
20012 20012 1
[20013 rows x 2 columns]
20,013 images; column 0 is the image id and column 1 is the label.
Class distribution
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Label distribution
cases_count = csv[1].value_counts()
print(cases_count)
# Bar plot of the class counts
plt.figure(figsize=(6, 3))
sns.barplot(x=cases_count.index, y=cases_count.values)
plt.title('Number of cases', fontsize=14)
plt.xlabel('Case type', fontsize=12)
plt.ylabel('Count', fontsize=12)
# Tick labels follow the value_counts order: 0, 2, 1, 3, 4
plt.xticks(range(len(cases_count.index)), ['0', '2', '1', '3', '4'])
plt.show()
0 15503
2 2450
1 1960
3 89
4 10
Name: 1, dtype: int64
This dataset is brutal: severe class imbalance. Class 0 alone accounts for roughly 77% of the samples (15503/20013), and classes 3 and 4, with only 89 and 10 images, are all but hopeless. Oversampling or re-weighting them is worth trying later; a weighted-loss sketch follows below. If you have no machine to train on, just submit all zeros. (runs)
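If you go the re-weighting route, one standard option is passing inverse-frequency class weights to the cross-entropy loss. A minimal sketch using the counts printed above (the mean-1 normalization is one common convention, not something from this baseline):

import torch
from torch import nn

# Per-class counts from the distribution above, ordered class 0..4.
counts = torch.tensor([15503., 1960., 2450., 89., 10.])
# Inverse-frequency weights, rescaled so they average to 1.
weights = counts.sum() / (len(counts) * counts)
# Move `weights` to the same device as the model before training.
criterion = nn.CrossEntropyLoss(weight=weights)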
Sample images
Load a few images to see what the pneumonia X-rays actually look like, a trick picked up from Kaggle kernels.
from PIL import Image
paths = "train/"
# First five image ids for each class
imgs0 = csv[csv[1] == 0][0].iloc[:5].tolist()
imgs1 = csv[csv[1] == 1][0].iloc[:5].tolist()
imgs2 = csv[csv[1] == 2][0].iloc[:5].tolist()
imgs3 = csv[csv[1] == 3][0].iloc[:5].tolist()
imgs4 = csv[csv[1] == 4][0].iloc[:5].tolist()
imgs = imgs0 + imgs1 + imgs2 + imgs3 + imgs4
f, ax = plt.subplots(5, 5, figsize=(8, 8))
for i in range(25):
    img_path = paths + str(imgs[i]) + '.jpg'
    img = Image.open(img_path).convert('RGB')
    ax[i // 5, i % 5].imshow(img)
    ax[i // 5, i % 5].set_title(i // 5)  # row index = class label
    ax[i // 5, i % 5].axis('off')
    ax[i // 5, i % 5].set_aspect('auto')
plt.show()
Approach
This really looks like a detection task (count the lesions), but I will start by treating it as plain multi-class classification.
Custom Dataset class
import pandas as pd
import numpy as np
import torch
from PIL import Image
from torch.utils.data import Dataset

class MyDataSet(Dataset):
    def __init__(self, image_path, csv_path, transforms, phase):
        # Read the CSV and build full image paths from column 0
        csv = pd.read_csv(csv_path, header=None)
        self.imgs = [image_path + str(k) + ".jpg" for k in csv[0].values]
        self.phase = phase
        if self.phase != "test":
            self.labels = np.asarray(csv[1].values)
        self.transforms = transforms

    def __getitem__(self, index):
        img_path = self.imgs[index]
        pil_img = Image.open(img_path).convert("RGB")
        if self.transforms:
            data = self.transforms(pil_img)
        else:
            data = torch.from_numpy(np.asarray(pil_img))
        if self.phase != "test":
            return data, self.labels[index]
        return data

    def __len__(self):
        return len(self.imgs)
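A quick sanity check of the class, with no transform so we see the raw tensor (paths as used throughout this post):

# With transforms=None, __getitem__ returns the raw HWC uint8 tensor.
ds = MyDataSet("train/", "train.csv", None, "train")
img, label = ds[0]
print(len(ds), img.shape, label)  # expected: 20013 torch.Size([1024, 1024, 3]) 0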
Data augmentation
Training set: random resized crop followed by a 224×224 center crop, random horizontal flip, random brightness/contrast jitter, random rotation up to 15°, then ImageNet normalization. The validation and test sets only get a deterministic resize and a 224×224 center crop.
from torchvision import transforms

# Augmentation for training; deterministic resize/crop for val and test
data_transforms = {
    "train":
        transforms.Compose([
            transforms.RandomResizedCrop(size=256, scale=(0.8, 1.0)),
            transforms.CenterCrop(size=224),
            transforms.RandomHorizontalFlip(),
            transforms.ColorJitter(brightness=0.2, contrast=0.2),
            transforms.RandomRotation(15),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
    "val":
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(size=224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
    "test":
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(size=224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
}
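To confirm a pipeline emits the size the network expects, it helps to run one image through it (image 0 exists per the CSV above):

from PIL import Image

img = Image.open("train/0.jpg").convert("RGB")
x = data_transforms["train"](img)
print(x.shape, x.dtype)  # torch.Size([3, 224, 224]) torch.float32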
Loading and splitting the dataset
from torch.utils.data import DataLoader, SubsetRandomSampler
from load_data import MyDataSet
import numpy as np

batch_size = 128
# Two views of the same data: augmented for training, deterministic for validation
dataset = MyDataSet("train/", "train.csv", data_transforms["train"], "train")
val_dataset = MyDataSet("train/", "train.csv", data_transforms["val"], "train")
# Validation split ratio
validation_split = .2
shuffle_dataset = True
random_seed = 42
dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
if shuffle_dataset:
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]
# Samplers restrict each loader to its own index subset
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)
train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_sampler, num_workers=4)
val_loader = DataLoader(val_dataset, batch_size=batch_size, sampler=valid_sampler, num_workers=4)
dataloaders = {"train": train_loader, "val": val_loader}
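One caveat of a plain random split with classes this skewed: the 10 class-4 images could easily all land on one side. A stratified split keeps the class ratios in both subsets; a possible variant using scikit-learn (not what this baseline does):

from sklearn.model_selection import train_test_split

# Stratify on the labels so both splits keep the class proportions.
train_indices, val_indices = train_test_split(
    indices, test_size=validation_split,
    random_state=random_seed, stratify=csv[1].values)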
Building the model
ResNet-34 with ImageNet-pretrained weights, fine-tuned via transfer learning.
from torchvision import models
from torch import nn

# Freeze roughly the first 50 parameter tensors (the early layers)
def set_parameter_requires_grad(model, feature_extracting):
    if feature_extracting:
        for i, para in enumerate(model.parameters()):
            para.requires_grad = (i >= 50)

def initialize_model(model_name, num_classes, feature_extract, use_pretrained=False):
    model_ft = models.resnet34(pretrained=use_pretrained)
    set_parameter_requires_grad(model_ft, feature_extract)
    # Swap the ImageNet head for a num_classes-way classifier
    num_ftrs = model_ft.fc.in_features
    model_ft.fc = nn.Sequential(nn.Linear(num_ftrs, num_classes))
    input_size = 224
    return model_ft, input_size
from model import initialize_model

num_classes = 5
model_name = "resnet34"
feature_extract = True
model, input_size = initialize_model(model_name=model_name, num_classes=num_classes,
                                     feature_extract=feature_extract, use_pretrained=True)
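To confirm how much of the network is actually frozen, count the trainable parameters:

n_total = sum(p.numel() for p in model.parameters())
n_trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
print("trainable parameters: {} / {}".format(n_trainable, n_total))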
Training
The training loop is adapted from the official PyTorch fine-tuning tutorial.
import copy
import time
import torch
import matplotlib.pyplot as plt

def train_model(model, device, dataloaders, criterion, optimizer, num_epochs, is_inception=False):
    since = time.time()
    val_acc_history = []
    train_acc_history = []
    val_loss_history = []
    train_loss_history = []
    best_model_wts = copy.deepcopy(model.state_dict())
    best_acc = 0.0
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode
            running_loss = 0.0
            running_corrects = 0
            # Iterate over data.
            for inputs, labels in dataloaders[phase]:
                inputs = inputs.to(device)
                labels = labels.to(device)
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward; track history only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    _, preds = torch.max(outputs, 1)
                    # backward + optimize only if in training phase
                    if phase == 'train':
                        loss.backward()
                        optimizer.step()
                # statistics
                running_loss += loss.item() * inputs.size(0)
                running_corrects += torch.sum(preds == labels.data)
            # Normalize by the number of samples the sampler actually yields
            num_samples = len(dataloaders[phase].sampler)
            epoch_loss = running_loss / num_samples
            epoch_acc = running_corrects.double() / num_samples
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            # deep copy the model
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
            if phase == 'val':
                val_acc_history.append(epoch_acc)
                val_loss_history.append(epoch_loss)
            if phase == 'train':
                train_acc_history.append(epoch_acc)
                train_loss_history.append(epoch_loss)
        print()
    his = [train_acc_history, val_acc_history, train_loss_history, val_loss_history]
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
    print('Best val Acc: {:4f}'.format(best_acc))
    # load best model weights
    model.load_state_dict(best_model_wts)
    torch.save(model, "model_{}.pkl".format(best_acc))
    return model, his
Train for 20 epochs: cross-entropy loss, Adam optimizer, learning rate 0.001.
from train import train_model
from torch import nn, optim
import torch

epoch = 20
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)
criterion = nn.CrossEntropyLoss()
# Only the unfrozen parameters are handed to the optimizer
params_to_update = [p for p in model.parameters() if p.requires_grad]
# optimizer = optim.SGD(params_to_update, lr=0.01, momentum=0.9)
optimizer = optim.Adam(params_to_update, lr=0.001)
model, his = train_model(model, device, dataloaders, criterion, optimizer,
                         num_epochs=epoch, is_inception=False)
Epoch 0/19
----------
train Loss: 0.5685 Acc: 0.7956
val Loss: 0.5189 Acc: 0.8125
Epoch 1/19
----------
train Loss: 0.5186 Acc: 0.8106
val Loss: 0.5188 Acc: 0.8058
Epoch 2/19
----------
train Loss: 0.5061 Acc: 0.8116
val Loss: 0.4857 Acc: 0.8204
Epoch 3/19
----------
train Loss: 0.4996 Acc: 0.8152
val Loss: 0.4822 Acc: 0.8215
Epoch 4/19
----------
train Loss: 0.4967 Acc: 0.8183
val Loss: 0.4853 Acc: 0.8145
Epoch 5/19
----------
train Loss: 0.4891 Acc: 0.8187
val Loss: 0.4733 Acc: 0.8215
Epoch 6/19
----------
train Loss: 0.4824 Acc: 0.8197
val Loss: 0.5399 Acc: 0.7997
Epoch 7/19
----------
train Loss: 0.4837 Acc: 0.8209
val Loss: 0.4696 Acc: 0.8246
Epoch 8/19
----------
train Loss: 0.4783 Acc: 0.8210
val Loss: 0.4666 Acc: 0.8253
Epoch 9/19
----------
train Loss: 0.4755 Acc: 0.8232
val Loss: 0.4616 Acc: 0.8270
Epoch 10/19
----------
train Loss: 0.4711 Acc: 0.8227
val Loss: 0.4693 Acc: 0.8250
Epoch 11/19
----------
train Loss: 0.4683 Acc: 0.8231
val Loss: 0.4571 Acc: 0.8289
Epoch 12/19
----------
train Loss: 0.4634 Acc: 0.8247
val Loss: 0.4644 Acc: 0.8299
Epoch 13/19
----------
train Loss: 0.4613 Acc: 0.8286
val Loss: 0.4404 Acc: 0.8329
Epoch 14/19
----------
train Loss: 0.4582 Acc: 0.8281
val Loss: 0.4535 Acc: 0.8323
Epoch 15/19
----------
train Loss: 0.4556 Acc: 0.8265
val Loss: 0.4553 Acc: 0.8256
Epoch 16/19
----------
train Loss: 0.4508 Acc: 0.8305
val Loss: 0.4375 Acc: 0.8361
Epoch 17/19
----------
train Loss: 0.4453 Acc: 0.8316
val Loss: 0.4301 Acc: 0.8377
Epoch 18/19
----------
train Loss: 0.4473 Acc: 0.8323
val Loss: 0.4267 Acc: 0.8396
Epoch 19/19
----------
train Loss: 0.4391 Acc: 0.8330
val Loss: 0.4140 Acc: 0.8431
Training complete in 56m 33s
Best val Acc: 0.843094
Plotting the curves
import numpy as np
train_acc = [h.cpu().numpy() for h in his[0]]
val_acc = [h.cpu().numpy() for h in his[1]]
plt.title("Accuracy vs Number of Training Epochs")
plt.xlabel("Training Epochs")
plt.ylabel("Accuracy")
plt.plot(range(1, epoch + 1), train_acc, label="training")
plt.plot(range(1, epoch + 1), val_acc, label="val")
plt.ylim((0, 1.))
plt.xticks(np.arange(1, epoch + 1, 1.0))
plt.legend()
plt.savefig("acc.png")
plt.show()
plt.close('all')
train_loss = his[2]
val_loss = his[3]
plt.title("Loss vs Number of Training Epochs")
plt.xlabel("Training Epochs")
plt.ylabel("Loss")
plt.plot(range(1, epoch + 1), train_loss, label="training")
plt.plot(range(1, epoch + 1), val_loss, label="val")
plt.xticks(np.arange(1, epoch + 1, 1.0))
plt.legend()
plt.savefig("loss.png")
plt.show()
For most epochs the validation accuracy is higher than the training accuracy, and the validation loss is lower too. My understanding: the training set is augmented while the validation set is not, so the training distribution is broader and harder to fit, which pushes the model toward better generalization. (Training metrics are also accumulated while the weights are still changing within the epoch, which drags them down a little.) This pattern is common with heavily augmented training data.
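One way to test this explanation: score the training indices through the deterministic val transforms in eval mode. If augmentation is the culprit, this number should come out above the augmented training accuracy. A sketch reusing the objects defined earlier in this post:

# Evaluate the training split without augmentation (val_dataset uses the val transforms).
clean_loader = DataLoader(val_dataset, batch_size=128,
                          sampler=SubsetRandomSampler(train_indices), num_workers=4)
model.eval()
correct = 0
with torch.no_grad():
    for inputs, labels in clean_loader:
        preds = model(inputs.to(device)).argmax(dim=1)
        correct += (preds == labels.to(device)).sum().item()
print("un-augmented train accuracy:", correct / len(train_indices))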
Generating the submission CSV
I created the CSV by hand, with ids running from 0 to 6670.
import torch
from load_data import MyDataSet
from torch.utils.data import DataLoader

model = torch.load("model_0.84.pkl")
# Load the test set
dataset = MyDataSet("test/", "upload.csv", data_transforms["test"], "test")
testdata = DataLoader(dataset, batch_size=32, shuffle=False, num_workers=4)
print(len(dataset))
6671
# The inference/output code follows Nin7a's blog: https://blog.csdn.net/Nin7a/article/details/104057441
import pandas as pd

model.eval()
predicted_labels_list = []
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
with torch.no_grad():  # inference only, no gradient tracking
    for batch_id, features in enumerate(testdata):
        features = features.to(device)
        probas = model(features)
        _, predicted_labels = torch.max(probas, 1)
        for label in predicted_labels:
            predicted_labels_list.append(label.cpu().item())
df = pd.DataFrame({'label': predicted_labels_list})
df.to_csv('upload.csv', header=False, index=True)
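Before submitting, it is worth looking at the predicted class distribution, given how skewed the training labels are:

from collections import Counter

# How many of each class did we actually predict?
print(Counter(predicted_labels_list))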
Submission
Currently sitting at tenth place on the new leaderboard.
Wrap-up
Checking the submitted CSV: sure enough, not a single 3 or 4; every prediction is class 0, 1, or 2. The next step is probably learning how to handle the class imbalance, for example with the sampler sketched below.
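A first thing to try: oversample the rare classes with a WeightedRandomSampler in place of the plain SubsetRandomSampler, alongside or instead of the weighted loss sketched earlier. A minimal sketch against the split defined above (all names reused from this post):

from torch.utils.data import WeightedRandomSampler
import numpy as np

labels = csv[1].values
class_counts = np.bincount(labels)            # samples per class
sample_weights = 1.0 / class_counts[labels]   # inverse frequency, one weight per sample
sample_weights[val_indices] = 0.0             # never draw validation samples
train_sampler = WeightedRandomSampler(sample_weights.tolist(),
                                      num_samples=len(train_indices),
                                      replacement=True)
train_loader = DataLoader(dataset, batch_size=batch_size,
                          sampler=train_sampler, num_workers=4)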