科大讯飞动作识别算法赛初赛baseline

最新推荐文章于 2023-06-13 17:16:36 发布

深度之眼

最新推荐文章于 2023-06-13 17:16:36 发布

阅读量160

点赞数

分类专栏：粉丝的投稿深度学习干货人工智能干货文章标签：深度学习动作识别科大讯飞比赛

本文链接：https://blog.csdn.net/weixin_42645636/article/details/129358330

版权

深度学习干货同时被 3 个专栏收录

536 篇文章 202 订阅

订阅专栏

人工智能干货

509 篇文章 28 订阅

订阅专栏

粉丝的投稿

200 篇文章 2 订阅

订阅专栏

来源：投稿作者：LSC

编辑：学姐

最终: 0.76757分

比赛网址:

http://challenge.xfyun.cn/topic/info?type=action-recognition&ch=ds22-dw-zmt05

赛题任务:

带标注的训练数据，即视频中的每一帧都有动作标签；不带标注的测试数据。

作品介绍视频要求：视频数据按照数据来源存放在不同的文件夹中，视频文件采用H.264编码的mp4格式；标签文件对应视频文件放在同一文件夹下，标签文件采用txt格式，每一行标明帧号和本帧的人物动作label。

评价指标

模型预测结果采用准确率（accuracy）进行评价，对于模型预测的结果，严格对比每一帧预测结果与真实标注的要素名和要素内容，若二者完全一致，则记为本帧识别正确。

对于一段测试视频计算准确率的方法为： accuracy=本段视频中完全预测正确的要素个数/本段视频的帧数。

对于一个模型计算准确率的方法为：accuracy=累加每段视频预测的准确率/总的测试视频个数。

赛题需要对视频的图像内容进行识别，因此可以考虑抽象为图像分类任务。完成赛题的步骤为：

(1)视频抽帧
(2)构建分类数据集
(3)训练分类模型
(4)对测试集进行预测

baseline代码是在恒源云平台上运行的

(1)训练集和测试集抽帧

由于赛题是按照帧标注的数据，因此我们抽帧可以直接选择所有的帧，并进行保存为图像。

import cv2, os, glob, codecs
# Create the frame output directories for both splits.
# makedirs(exist_ok=True) also creates missing parent directories and makes
# this cell safe to re-run (a bare os.mkdir raises if the directory exists).
for split in ('train', 'test'):
    os.makedirs(os.path.join('/hy-tmp/frames', split), exist_ok=True)


def extract_images(video_path, out_dir):
    """Decode every frame of a video and save each frame as a JPEG image.

    Frames are written as ``<out_dir>/<video_name>-<NNNNNN>.jpg`` where
    ``video_name`` is the video file name without its extension and the frame
    number is 1-based and zero-padded to six digits.

    Args:
        video_path: Path of the video file to decode.
        out_dir: Directory that receives the frame images (absolute or
            relative); it must already exist.
    """
    # splitext keeps dots inside the stem, matching the "strip the extension"
    # naming used where annotation and video names are derived elsewhere.
    video_name = os.path.splitext(os.path.basename(video_path))[0]
    cam = cv2.VideoCapture(video_path)
    print(video_path)
    frame_count = 1
    try:
        while True:
            successed, img = cam.read()
            if not successed:
                break
            # Join onto out_dir directly: prefixing './' would turn an
            # absolute out_dir into a path relative to the working directory.
            outfile = os.path.join(out_dir, f'{video_name}-{frame_count:06}.jpg')
            if not cv2.imwrite(outfile, img):
                # Keep going (best effort) but make the failure visible.
                print(f'warning: failed to write {outfile}')
            frame_count += 1
    finally:
        # Always free the decoder handle, even if decoding/writing raises.
        cam.release()


# Training directory: entries ending in 'txt' are treated as annotation files,
# everything else as a video. Sorting keeps the order deterministic.
paths = glob.glob('/hy-tmp/act_rec_data/train/*')
paths.sort()
train_video_path = [x for x in paths if not x.endswith('txt')]
train_ann_path = [x for x in paths if x.endswith('txt')]

# Test directory: same filtering, sorted like the training paths.
paths = glob.glob('/hy-tmp/act_rec_data/test/*')
paths.sort()
test_video_path = [x for x in paths if not x.endswith('txt')]


# Dump every frame of every training video to disk.
for path in train_video_path:
    extract_images(path, os.path.join('/hy-tmp/frames', 'train'))


# Dump every frame of every test video to disk.
for path in test_video_path:
    extract_images(path, os.path.join('/hy-tmp/frames', 'test'))

(2)构建分类数据集

!pip install tqdm


from torch.utils.data.dataset import Dataset
from torch.autograd import Variable
import torch.optim as optim
import torch.nn.functional as F
import torch.nn as nn
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import torchvision.models as models

import os
import sys
import glob
import pandas as pd
import numpy as np
from tqdm import tqdm

%pylab inline

import cv2
from PIL import Image

import torch
torch.manual_seed(0)  # fixed seed to reduce run-to-run randomness
torch.backends.cudnn.deterministic = False  # do not force deterministic cuDNN behaviour
torch.backends.cudnn.benchmark = True  # let cuDNN automatically pick the most efficient algorithm for the current configuration

class XunFeiDataset(Dataset):
    """Image-classification dataset backed by a list of image files.

    Args:
        img_path: Sequence of image file paths.
        img_label: Sequence of labels, index-aligned with ``img_path``.
        transform: Optional callable invoked as ``transform(image=img)`` whose
            result is indexed with ``['image']``. ``None`` disables
            transformation.
    """

    def __init__(self, img_path, img_label, transform=None):
        self.img_path = img_path
        self.img_label = img_label
        self.transform = transform

    def __getitem__(self, index):
        """Load the image at ``index`` and return ``(image, label)``.

        The image is read with OpenCV, converted to float32 and from BGR to
        RGB channel order, then passed through ``transform`` when one is set.

        Raises:
            OSError: If the image file cannot be read.
        """
        path = self.img_path[index]
        img = cv2.imread(path)
        if img is None:
            # cv2.imread signals failure by returning None; fail with the
            # offending path instead of an AttributeError on `.astype`.
            raise OSError(f'cannot read image: {path}')
        img = img.astype(np.float32)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        if self.transform is not None:
            img = self.transform(image=img)['image']
        return img, torch.from_numpy(np.array(self.img_label[index]))

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.img_path)


train_img_list = []
train_label_list = []

# Build one (frame image path, label) row per annotation line.
for path in train_ann_path:
    basename = os.path.basename(path)[:-4]  # strip the 4-character '.txt' suffix
    # `with` closes the annotation file; the handle was previously leaked.
    with open(path) as ann_file:
        anns = ann_file.readlines()
    # Line i of the annotation file is paired with extracted frame i (1-based).
    for frame_count, ann in enumerate(anns, start=1):
        train_img_list.append(f'/hy-tmp/frames/train/{basename}-{frame_count:06}.jpg')
        if ',' not in ann:
            # A line without a comma carries no label; 19 is used as the
            # placeholder class for such frames.
            train_label_list.append(19)
        else:
            train_label_list.append(int(ann.split(',')[1]))


train_df = pd.DataFrame({
    'path': train_img_list,
    'label': train_label_list
})
# Map raw labels to contiguous class ids; `lbl` maps a class id back to its
# raw label.
train_df['label_int'], lbl = pd.factorize(train_df['label'])

# Keep only rows whose image file can actually be decoded.
train_df = train_df[train_df['path'].apply(lambda x: cv2.imread(x) is not None)]


print(train_df.shape)
# Shuffle all rows; the seed makes the shuffle (and any split taken from the
# row order afterwards) reproducible across runs.
train_df = train_df.sample(frac=1.0, random_state=0)
train_df

(3)训练分类模型

我尝试了一下，resnet和efficientnet系列效果比较好，swin_transformer系列效果不太好而且模型太大保存不方便

model = models.efficientnet_b7(True)
# Replace the classification head. Sizes are derived instead of hard-coded:
# the input width comes from the backbone's existing Linear layer, and the
# output width is the number of distinct classes found by pd.factorize, so
# every class id has a logit and every predicted id can be mapped via `lbl`.
in_features = model.classifier[1].in_features
num_classes = len(lbl)
model.classifier = nn.Sequential(
    nn.Dropout(p=0.5, inplace=True),
    nn.Linear(in_features=in_features, out_features=num_classes, bias=True)
)

!pip install 'albumentations'


import albumentations as A
from albumentations.pytorch import ToTensorV2

def train(train_loader, model, criterion, optimizer):
    """Run one training epoch and return the mean per-batch loss.

    Batches are moved to the device that holds the model's parameters, so the
    function works for both CPU and GPU models.

    Args:
        train_loader: Iterable yielding ``(inputs, target)`` batches.
        model: Network to optimise; switched to training mode.
        criterion: Loss callable invoked as ``criterion(output, target)``.
        optimizer: Optimizer that updates ``model``'s parameters.

    Returns:
        The sum of the batch losses divided by the number of batches.

    Raises:
        ValueError: If ``train_loader`` yields no batches.
    """
    model.train()
    device = next(model.parameters()).device
    total_loss = 0.0
    num_batches = 0
    for inputs, target in train_loader:
        inputs = inputs.to(device, non_blocking=True)
        target = target.to(device, non_blocking=True)

        # Forward pass.
        output = model(inputs)
        loss = criterion(output, target)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        raise ValueError('train_loader yielded no batches')
    return total_loss / num_batches


def validate(val_loader, model, criterion):
    """Return the classification accuracy of ``model`` on ``val_loader``.

    Batches are moved to the device that holds the model's parameters.

    Args:
        val_loader: Loader yielding ``(inputs, target)`` batches; it must
            expose a ``dataset`` attribute supporting ``len()``.
        model: Network to evaluate; switched to evaluation mode.
        criterion: Unused; kept so existing call sites keep working.

    Returns:
        Number of correct top-1 predictions divided by
        ``len(val_loader.dataset)``.
    """
    model.eval()
    device = next(model.parameters()).device
    num_correct = 0

    with torch.no_grad():
        for inputs, target in val_loader:
            inputs = inputs.to(device)
            target = target.to(device)

            output = model(inputs)
            num_correct += (output.argmax(1) == target).sum().item()

    return num_correct / len(val_loader.dataset)


def predict(test_loader, model, criterion):
    """Run ``model`` over ``test_loader`` and return the stacked raw outputs.

    Batches are moved to the device that holds the model's parameters.

    Args:
        test_loader: Iterable yielding ``(inputs, target)`` batches; the
            targets are ignored.
        model: Network to run; switched to evaluation mode.
        criterion: Unused; kept so existing call sites keep working.

    Returns:
        A NumPy array with the per-batch model outputs stacked vertically.

    Raises:
        ValueError: If ``test_loader`` yields no batches (``np.vstack`` cannot
            stack an empty list).
    """
    model.eval()
    device = next(model.parameters()).device

    test_pred = []
    with torch.no_grad():
        for inputs, _ in test_loader:
            output = model(inputs.to(device))
            test_pred.append(output.cpu().numpy())

    return np.vstack(test_pred)

# Random split: train_df is already shuffled, so the last VAL_SIZE rows form
# the validation set and every remaining row is used for training. (The two
# slices previously used different sizes, which discarded rows from training
# without validating on them.)
VAL_SIZE = 2000

train_loader = torch.utils.data.DataLoader(
    XunFeiDataset(train_df['path'].values[:-VAL_SIZE], train_df['label_int'].values[:-VAL_SIZE],
                  A.Compose([
                      A.Resize(300, 300),
                      A.HorizontalFlip(p=0.5),
                      A.VerticalFlip(p=0.5),
                      # Contrast-only and brightness-only jitter, written with
                      # RandomBrightnessContrast because the RandomContrast /
                      # RandomBrightness aliases are deprecated.
                      A.RandomBrightnessContrast(brightness_limit=0, contrast_limit=0.2, p=0.5),
                      A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0, p=0.5),
                      A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
                      ToTensorV2(),
                      ])

                  ), batch_size=10, shuffle=True, num_workers=4, pin_memory=False
)

val_loader = torch.utils.data.DataLoader(
    XunFeiDataset(train_df['path'].values[-VAL_SIZE:], train_df['label_int'].values[-VAL_SIZE:],
                  A.Compose([
                      A.Resize(300, 300),
                      A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
                      ToTensorV2(),
                      ])
                  ), batch_size=2, shuffle=False, num_workers=1, pin_memory=False
)

model = model.to('cuda')
criterion = nn.CrossEntropyLoss().cuda()  # applies softmax internally, so the model outputs raw logits
optimizer = torch.optim.SGD(model.parameters(), 0.005)
# The scheduler is stepped once per epoch below, so step_size_up=5 means the
# learning rate rises for 5 epochs before it starts to fall again.
scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=0.001, max_lr=0.01, step_size_up=5, mode="triangular2")

best_acc = 0
for _ in range(15):
    train_loss = train(train_loader, model, criterion, optimizer)
    val_acc = validate(val_loader, model, criterion)

    # Checkpoint whenever validation accuracy improves.
    if val_acc > best_acc:
        torch.save(model.state_dict(), 'model.pth')
        best_acc = val_acc

    scheduler.step()
    # Report progress after every epoch, not only after the last one.
    print(train_loss, val_acc)

(4)对测试集进行预测

# Collect the extracted test frames in sorted order and drop any image that
# cannot be decoded.
test_img_list = glob.glob(os.path.join('/hy-tmp/frames', 'test') + '/*')
test_img_list.sort()
test_img_list = pd.DataFrame(test_img_list)
test_img_list = test_img_list[test_img_list[0].apply(lambda x: cv2.imread(x) is not None)]


# Labels are dummies (all zero); only the images are used at inference time.
test_loader = torch.utils.data.DataLoader(
    XunFeiDataset(test_img_list[0].values, [0] * len(test_img_list),
          A.Compose([
              A.Resize(300, 300),
              A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
              ToTensorV2(),
              ])
          ), batch_size=10, shuffle=False, num_workers=1, pin_memory=False
)


# The training loop saves the best-validation weights to 'model.pth'; use them
# when present instead of whatever weights the last epoch left in memory.
if os.path.exists('model.pth'):
    model.load_state_dict(torch.load('model.pth'))
model.eval()

test_pred = []
with torch.no_grad():  # inference only: no autograd graph needed
    for data, _ in test_loader:
        pred = model(data.cuda())
        # Accumulate inside the loop so every batch contributes predictions.
        test_pred += list(pred.argmax(1).cpu().numpy())


test_img_list['label'] = test_pred
# Frame files are named '<video>-<frame>.jpg'; split on the LAST '-' so video
# names that themselves contain '-' are recovered intact.
test_img_list['video'] = test_img_list[0].apply(lambda x: os.path.basename(x).rsplit('-', 1)[0])
# Map predicted class ids back to the raw labels.
test_img_list['label'] = test_img_list['label'].apply(lambda x: lbl[x])


import shutil

# Start from an empty output directory. rmtree also removes label files left
# by a previous run (os.rmdir only works on empty directories).
if os.path.exists('labels'):
    shutil.rmtree('labels')
os.mkdir('labels')


# Write one label file per test video: a line is '<idx>,<label>' for a
# labelled frame and just '<idx>' when the predicted label is 19.
for path in test_video_path:
    cam = cv2.VideoCapture(path)
    length = int(cam.get(cv2.CAP_PROP_FRAME_COUNT))
    cam.release()  # only the frame count is needed

    video_name = os.path.splitext(os.path.basename(path))[0]
    df = test_img_list[test_img_list['video'] == video_name]

    with open(os.path.join('labels', video_name + '.txt'), 'w') as up:
        for idx, label in enumerate(df['label']):
            if label == 19:
                up.write('{0}\n'.format(idx))
            else:
                up.write('{0},{1}\n'.format(idx, label))

        # If fewer frames were predicted than the container reports, pad the
        # remaining frame indices with unlabelled lines so the file has one
        # line per frame. Also well-defined when `df` is empty.
        for idx in range(len(df), length):
            up.write('{0}\n'.format(idx))

# 压缩结果标签，下载到本地，然后提交
!zip labels.zip labels/ -r

点击下方卡片关注《学姐带你玩AI》🚀🚀🚀

回复“比赛”领取190+场比赛top方案，打包好了直接领

码字不易，欢迎大家点赞评论收藏！

深度之眼

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

专栏目录