前言
课差不多结束了,趁期末考开始前继续入门计算机视觉比赛,作为菜鸟的我当然是从最简单的分类中最简单的练习赛开始了。实不相瞒,第一次的成绩就是11分hhhhhh. 今天讲讲一个最简单的11分的分类模型可以长什么样。
结构
数据是从AI研习社官网下载的,解压之后的文件分别是:
- test文件夹(存图片的)
- train文件夹(存图片的)
- train.csv(一列为图片名,另一列为label)
这个模型的总体结构如下:
- 自定义的数据集
- 一个分类网络模型
- 训练和预测函数
代码
MonketDataset.py
import torch.utils.data as Data
from PIL import Image
import numpy as np
import torch
import torchvision.transforms as transforms
import pandas as pd
# 读取训练数据和预测数据
def read_images(root=None, train=True, n_test=274):
    """Collect image paths (and labels, for training data).

    Args:
        root: Path to train.csv (required when train=True). The csv must
            contain a 'filename' column and a 'label' column.
        train: True  -> read training paths/labels from the csv;
               False -> build the test paths './test/0.jpg' ... './test/{n_test-1}.jpg'.
        n_test: Number of test images (default 274, the contest's test set size).

    Returns:
        (data, label) lists for training data, (data, None) for test data,
        or (None, None) when train=True but no csv path was given.
    """
    if train:
        if root is None:
            print('cannot find a root')
            return None, None
        image_list = pd.read_csv(root)
        # Vectorized string concat replaces the original per-row .loc loop
        # (which also contained a no-op `label = label` assignment).
        data = ('./train/' + image_list['filename']).to_list()
        label = image_list['label'].to_list()
        return data, label
    # Test images are simply named by index: ./test/0.jpg ... ./test/{n_test-1}.jpg
    data = ['./test/' + str(i) + '.jpg' for i in range(n_test)]
    return data, None
class MonketDataset(Data.Dataset):
    """Dataset over the monkey-classification images.

    In training mode (a label list is present) each item is an
    (image, label) pair; in test mode each item is just the transformed image.
    """

    def __init__(self, root, train, transforms):
        self.transforms = transforms
        self.data, self.label = read_images(root=root, train=train)

    def __getitem__(self, index):
        sample = Image.open(self.data[index]).convert('RGB')
        sample = self.transforms(sample)
        # Test data carries no labels, so only the image is returned.
        if self.label is None:
            return sample
        return sample, self.label[index]

    def __len__(self):
        return len(self.data)
# read_images(root='./train.csv',train=True)
# read_images(root=None,train=False)
ResNet.py
import torch
from torch import nn
from torch.nn import functional as F
class ResBlk(nn.Module):
    """Basic two-convolution residual block.

    Main path: 3x3 conv (optionally strided) -> BN -> ReLU -> 3x3 conv -> BN.
    Shortcut: identity when shapes already match, otherwise a 1x1 conv + BN
    that adapts both channel count and spatial stride.
    """

    def __init__(self, ch_in, ch_out, stride=1):
        super(ResBlk, self).__init__()
        self.conv1 = nn.Conv2d(ch_in, ch_out, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(ch_out)
        self.conv2 = nn.Conv2d(ch_out, ch_out, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(ch_out)
        # Bug fix: the shortcut must also project when stride != 1, not only
        # when the channel count changes. Previously ResBlk(512, 512, stride=2)
        # kept an identity shortcut and the residual add only "worked" through
        # accidental broadcasting of mismatched spatial sizes.
        if ch_out != ch_in or stride != 1:
            self.extra = nn.Sequential(
                nn.Conv2d(ch_in, ch_out, kernel_size=1, stride=stride),
                nn.BatchNorm2d(ch_out),
            )
        else:
            self.extra = nn.Sequential()

    def forward(self, x):
        """x: [b, ch_in, h, w] -> [b, ch_out, ceil(h/stride), ceil(w/stride)]."""
        out = F.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        # Element-wise residual addition; self.extra reshapes x when needed.
        out = self.extra(x) + out
        return out
class ResNet18(nn.Module):
    """Small ResNet-style classifier: stem conv + 4 residual blocks + linear head.

    Produces 10-class logits for input images of shape [b, 3, h, w].
    """

    def __init__(self):
        super(ResNet18, self).__init__()
        # Stem: 3 input channels -> 64 feature maps; stride 3 shrinks h/w early.
        self.conv1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=3, stride=3, padding=1),
            nn.BatchNorm2d(64),
        )
        # Four residual blocks, each striding by 2 to keep parameters in check;
        # channels: 64 -> 128 -> 256 -> 512 -> 512.
        self.blk1 = ResBlk(64, 128, stride=2)
        self.blk2 = ResBlk(128, 256, stride=2)
        self.blk3 = ResBlk(256, 512, stride=2)
        self.blk4 = ResBlk(512, 512, stride=2)
        # Linear head over the pooled 512-dim feature vector -> 10 classes.
        self.outlayer = nn.Linear(512, 10)

    def forward(self, x):
        """x: [b, 3, h, w] -> logits [b, 10]."""
        out = F.relu(self.conv1(x))
        for block in (self.blk1, self.blk2, self.blk3, self.blk4):
            out = block(out)
        # Global average pool to [b, 512, 1, 1], then flatten for the head.
        out = F.adaptive_avg_pool2d(out, [1, 1])
        out = out.view(out.size(0), -1)
        return self.outlayer(out)
# 计算ResBlk的out.shape
# tmp = torch.randn(2,64,32,32)
# blk = ResBlk(64,128, stride=2)
# out = blk(tmp)
# print('block: ', out.shape)
#
# x = torch.randn(2,3,32,32)
# model = ResNet18()
# out = model(x)
# print('resnet: ', out.shape)
# print(out)
main.py
import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from ResNet import ResNet18
import MonketDataset
from torch import nn, optim
import sys
import cv2
import os
import numpy as np
import pandas as pd
def auto_train_test():
    """Train ResNet18 on the monkey data, then predict on the test set.

    Returns:
        np.ndarray of predicted class indices, ordered to match the test
        images './test/0.jpg' .. './test/273.jpg' (the submission order).
    """
    batch_size = 32
    input_size = 32
    # Identical preprocessing for train and test (ImageNet mean/std).
    preprocess = transforms.Compose([
        transforms.Resize((input_size, input_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                             std=[0.229, 0.224, 0.225]),
    ])
    monkey_train = MonketDataset.MonketDataset(root='./train.csv', train=True,
                                               transforms=preprocess)
    monkey_train = DataLoader(monkey_train, batch_size=batch_size, shuffle=True)
    monkey_test = MonketDataset.MonketDataset(root=None, train=False,
                                              transforms=preprocess)
    # Bug fix: shuffle must be False here. Predictions are appended in
    # iteration order and written to result.csv keyed by test image index,
    # so a shuffled test loader would scramble the submission.
    monkey_test = DataLoader(monkey_test, batch_size=batch_size, shuffle=False)

    # Sanity-check one batch. Bug fix: iterator .next() is Python-2 style and
    # is gone on modern torch/Python 3 — use the next() builtin.
    x, label = next(iter(monkey_train))
    print('x: ', x.shape, ' label: ', label.shape)

    # Fall back to CPU so the script still runs without a GPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(device)
    model = ResNet18().to(device)
    criteon = nn.CrossEntropyLoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    print(model)

    for epoch in range(15):
        model.train()  # training mode (BatchNorm uses batch statistics)
        for batchidx, (x, label) in enumerate(monkey_train):
            # x: [b, 3, 32, 32], label: [b]
            x, label = x.to(device), label.to(device)
            logits = model(x)             # [b, 10]
            loss = criteon(logits, label)  # scalar tensor
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(epoch, loss.item())

    model.eval()  # eval mode (BatchNorm uses running statistics)
    # Bug fix: np.int was removed in NumPy 1.24 — use the builtin int dtype.
    total = np.array([], dtype=int)
    with torch.no_grad():
        for x in monkey_test:
            x = x.to(device)
            logits = model(x)            # [b, 10]
            pred = logits.argmax(dim=1)  # [b] predicted class per image
            total = np.append(total, pred.cpu().numpy())
    return total
if __name__ == '__main__':
    # Train, predict, and write the submission file: one row per test image
    # (index column + predicted label, no header).
    predictions = auto_train_test()
    pd.DataFrame(predictions).to_csv('./result.csv', header=False)
分析
接下来提分的方向:
- 没有进行数据增强的操作,哪怕简单的数据增强应该都会对结果有积极影响
- 把多个分类模型拿来对比一下,看哪个效果好
- 肯定有未知的知识要补充
希望自己能快快变强!