本次组队学习基于计算机视觉实践(街景字符编码识别):
baseline报错和常见问题:
- 1
路径问题:报错类似于找不到文件或者下图这种
因为默认运行读取csv程序后,不一定是baseline的数据存放格式,如我的路径就是
所以应该把源代码中
# Baseline default: training images and the label file are read from ../input/.
train_path = glob.glob('../input/train/*.png')
train_path.sort()
train_json = json.load(open('../input/train.json'))
改为
# Replace 你的数据路径 with the directory that actually holds your data.
train_path = glob.glob('./你的数据路径/train/*.png')
train_path.sort()
train_json = json.load(open('./你的数据路径/train.json'))
- 2
下载resnet18-5c106cde.pth过慢
如下图
可以直接进入网站 https://download.pytorch.org/models/resnet18-5c106cde.pth ,用第三方下载器下载pth文件,然后放到上面显示的路径里面(我的是 C:\Users\wxb\.cache\torch),即可跳过下载pth这一步。
- 3
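多进程报错(由DataLoader的num_workers引起) The “freeze_support()” line can be omitted if the program is not going to be frozen to produce an executable.
参考链接1。DataLoader在num_workers>0时会启动子进程来加载数据,在Windows上如果脚本入口没有放在if __name__ == '__main__':之下就会出现这个报错;由于个人电脑环境不同,多进程加载不一定适合每个人,建议修改源代码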
# Training loader: resize, random crop, colour jitter and a small random
# rotation are applied before tensor conversion and normalisation.
train_loader = torch.utils.data.DataLoader(
    SVHNDataset(train_path, train_label,
                transforms.Compose([
                    transforms.Resize((64, 128)),
                    transforms.RandomCrop((60, 120)),
                    transforms.ColorJitter(0.3, 0.3, 0.2),
                    transforms.RandomRotation(5),
                    transforms.ToTensor(),
                    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
                ])),
    batch_size=40,
    shuffle=True,
    # num_workers > 0 loads batches in worker subprocesses; this is the
    # setting to change to 0 when the freeze_support error appears.
    num_workers=10,
)
# Validation image paths and labels.
val_path = glob.glob('./dataset/mchar_val/*.png')
val_path.sort()
val_json = json.load(open('./dataset/mchar_val.json'))
# One 'label' entry per record of the JSON file, in the file's key order.
val_label = [val_json[x]['label'] for x in val_json]
# Prints both counts; presumably they are expected to be equal.
print(len(val_path), len(val_label))
# Validation loader: no random augmentation, images resized straight to
# (60, 120), and no shuffling.
val_loader = torch.utils.data.DataLoader(
    SVHNDataset(val_path, val_label,
                transforms.Compose([
                    transforms.Resize((60, 120)),
                    # transforms.ColorJitter(0.3, 0.3, 0.2),
                    # transforms.RandomRotation(5),
                    transforms.ToTensor(),
                    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
                ])),
    batch_size=40,
    shuffle=False,
    # Same worker setting as the training loader; change it to 0 as well.
    num_workers=10,
)
将num_workers=10改为num_workers=0或者注释掉这一行。问题即可解决。
- 4
train()参数过多问题:报错TypeError: train() takes 4 positional arguments but 5 were given,如下图:
只需要将main函数里面
# Listing as shipped in the baseline: train() is called with a fifth
# argument (epoch), which is the call that raises the TypeError.
for epoch in range(2):
    train_loss = train(train_loader, model, criterion, optimizer, epoch)
    val_loss = validate(val_loader, model, criterion)
    # Ground-truth labels of the validation set, each joined into one string.
    val_label = [''.join(map(str, x)) for x in val_loader.dataset.img_label]
    val_predict_label = predict(val_loader, model, 1)
    # Split the 55 output columns into five groups of 11 and take the
    # argmax of each group, giving one predicted class per position.
    val_predict_label = np.vstack([
        val_predict_label[:, :11].argmax(1),
        val_predict_label[:, 11:22].argmax(1),
        val_predict_label[:, 22:33].argmax(1),
        val_predict_label[:, 33:44].argmax(1),
        val_predict_label[:, 44:55].argmax(1),
    ]).T
    val_label_pred = []
train(train_loader, model, criterion, optimizer, epoch)的最后一个参数删掉即可,最后改为:
# Corrected listing: train() is called with exactly the four arguments
# its definition accepts.
for epoch in range(2):
    train_loss = train(train_loader, model, criterion, optimizer)
    val_loss = validate(val_loader, model, criterion)
    # Ground-truth labels of the validation set, each joined into one string.
    val_label = [''.join(map(str, x)) for x in val_loader.dataset.img_label]
    val_predict_label = predict(val_loader, model, 1)
    # Split the 55 output columns into five groups of 11 and take the
    # argmax of each group, giving one predicted class per position.
    val_predict_label = np.vstack([
        val_predict_label[:, :11].argmax(1),
        val_predict_label[:, 11:22].argmax(1),
        val_predict_label[:, 22:33].argmax(1),
        val_predict_label[:, 33:44].argmax(1),
        val_predict_label[:, 44:55].argmax(1),
    ]).T
    val_label_pred = []
PS:如果需要改变训练的epoch,修改这个for循环的range(2)为range(你需要训练的epoch数)即可。
- 5
数据类型错误:报错RuntimeError: Expected object of scalar type Long but got scalar type Int for argument #2 ‘target’ in call to _thnn_nll_loss_forward
需要把训练(train)和验证(validate)函数里面的target转为long型,源代码:
def train(train_loader, model, criterion, optimizer):
    """Run one training pass over ``train_loader`` and return the mean loss.

    This is the listing BEFORE the dtype fix: ``target`` is handed to
    ``criterion`` without a cast, so a non-long integer target triggers the
    "Expected object of scalar type Long" error.

    The model is expected to return five outputs; the loss is the sum of
    ``criterion`` applied to each output and the matching column of
    ``target``. Reads the module-level flag ``use_cuda``.

    Returns:
        The mean of the per-batch loss values (``np.mean``).
    """
    # Switch the model to training mode
    model.train()
    train_loss = []
    for i, (input, target) in enumerate(train_loader):
        if use_cuda:
            input = input.cuda()
            target = target.cuda()
        c0, c1, c2, c3, c4 = model(input)
        # No .long() here -- this is the line the article goes on to fix.
        loss = criterion(c0, target[:, 0]) + \
            criterion(c1, target[:, 1]) + \
            criterion(c2, target[:, 2]) + \
            criterion(c3, target[:, 3]) + \
            criterion(c4, target[:, 4])
        # loss /= 6
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Print the loss once every 100 batches
        if i % 100 == 0:
            print(loss.item())
        train_loss.append(loss.item())
    return np.mean(train_loss)
def validate(val_loader, model, criterion):
    """Evaluate ``model`` on ``val_loader`` and return the mean loss.

    This is the listing BEFORE the dtype fix: ``target`` is handed to
    ``criterion`` without a cast, so a non-long integer target triggers the
    "Expected object of scalar type Long" error.

    The model is expected to return five outputs; the loss is the sum of
    ``criterion`` applied to each output and the matching column of
    ``target``. Reads the module-level flag ``use_cuda``.

    Returns:
        The mean of the per-batch loss values (``np.mean``).
    """
    # Switch the model to evaluation mode
    model.eval()
    val_loss = []
    # Do not track gradients
    with torch.no_grad():
        for i, (input, target) in enumerate(val_loader):
            if use_cuda:
                input = input.cuda()
                target = target.cuda()
            c0, c1, c2, c3, c4 = model(input)
            # No .long() here -- this is the line the article goes on to fix.
            loss = criterion(c0, target[:, 0]) + \
                criterion(c1, target[:, 1]) + \
                criterion(c2, target[:, 2]) + \
                criterion(c3, target[:, 3]) + \
                criterion(c4, target[:, 4])
            # loss /= 6
            val_loss.append(loss.item())
    return np.mean(val_loss)
改为
def train(train_loader, model, criterion, optimizer):
    """Run one training pass over ``train_loader`` and return the mean loss.

    The model is expected to return five outputs; the loss is the sum of
    ``criterion`` applied to each output and the matching column of
    ``target``. Each target column is cast with ``.long()`` before it is
    passed to ``criterion``. Reads the module-level flag ``use_cuda``.

    Returns:
        The mean of the per-batch loss values (``np.mean``).
    """
    # Switch the model to training mode
    model.train()
    train_loss = []
    for i, (input, target) in enumerate(train_loader):
        if use_cuda:
            input = input.cuda()
            target = target.cuda()
        c0, c1, c2, c3, c4 = model(input)
        # Cast each target column to long so the loss accepts it
        loss = criterion(c0, target[:, 0].long()) + \
            criterion(c1, target[:, 1].long()) + \
            criterion(c2, target[:, 2].long()) + \
            criterion(c3, target[:, 3].long()) + \
            criterion(c4, target[:, 4].long())
        # loss /= 6
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # Print the loss once every 100 batches
        if i % 100 == 0:
            print(loss.item())
        train_loss.append(loss.item())
    return np.mean(train_loss)
def validate(val_loader, model, criterion):
    """Evaluate ``model`` on ``val_loader`` and return the mean loss.

    The model is expected to return five outputs; the loss is the sum of
    ``criterion`` applied to each output and the matching column of
    ``target``. Each target column is cast with ``.long()`` before it is
    passed to ``criterion``. Reads the module-level flag ``use_cuda``.

    Returns:
        The mean of the per-batch loss values (``np.mean``).
    """
    # Switch the model to evaluation mode
    model.eval()
    val_loss = []
    # Do not track gradients
    with torch.no_grad():
        for i, (input, target) in enumerate(val_loader):
            if use_cuda:
                input = input.cuda()
                target = target.cuda()
            c0, c1, c2, c3, c4 = model(input)
            # Cast each target column to long so the loss accepts it
            loss = criterion(c0, target[:, 0].long()) + \
                criterion(c1, target[:, 1].long()) + \
                criterion(c2, target[:, 2].long()) + \
                criterion(c3, target[:, 3].long()) + \
                criterion(c4, target[:, 4].long())
            # loss /= 6
            val_loss.append(loss.item())
    return np.mean(val_loss)
- 6
使用GPU训练时有的问题:
报错:TypeError: can’t convert CUDA tensor to numpy. Use Tensor.cpu() to copy the tensor to host memory first.
参考链接2
需要将c0.data.numpy()等改为c0.data.cpu().detach().numpy(),
修改后:
def predict(test_loader, model, tta=10):
    """Return the model outputs over ``test_loader``, summed over ``tta`` passes.

    For every batch the five model outputs are moved to the CPU, converted
    to NumPy and concatenated along axis 1; the batches of one pass are
    stacked row-wise. The result of each pass is added element-wise to the
    running total, so the return value is a sum, not an average. Reads the
    module-level flag ``use_cuda``.

    Args:
        test_loader: iterable yielding ``(input, target)`` pairs; ``target``
            is not used.
        model: callable returning five tensors per batch.
        tta: number of passes over ``test_loader``.

    Returns:
        The summed NumPy array, or ``None`` when ``tta`` is 0.
    """
    model.eval()
    test_pred_tta = None
    # Number of TTA passes
    for _ in range(tta):
        test_pred = []
        with torch.no_grad():
            for i, (input, target) in enumerate(test_loader):
                if use_cuda:
                    input = input.cuda()
                c0, c1, c2, c3, c4 = model(input)
                # .cpu() is required before .numpy() when the tensors are on the GPU
                output = np.concatenate([
                    c0.data.cpu().detach().numpy(),
                    c1.data.cpu().detach().numpy(),
                    c2.data.cpu().detach().numpy(),
                    c3.data.cpu().detach().numpy(),
                    c4.data.cpu().detach().numpy()], axis=1)
                test_pred.append(output)
        test_pred = np.vstack(test_pred)
        # Accumulate this pass into the running total
        if test_pred_tta is None:
            test_pred_tta = test_pred
        else:
            test_pred_tta += test_pred
    return test_pred_tta
此时run程序,成功结束后会出现
使用baseline最后提交csv文件成绩如下:
接下来细细分析每一个API的功能:
Task1:
首先需要读取数据集,数据集标签是采用json文件编写,标签包括左上角坐标X、字符高度、左上角坐标Y、字符宽度、字符编码五个参数,如下图所示:
读取代码为:
import json

# Open the label file in a context manager so it is closed after parsing.
with open('./路径/train.json', encoding='utf-8') as f:
    train_json = json.load(f)
赛题的目标是对街景图片中的字符进行识别并分类,所以第一步应该是字符识别。而对于这个任务,在目标检测的算法里面,有几个经典算法可以实现此目标,如:yolo、faster-rcnn、SSD等,后期考虑换模型以提高检测效率。