行人重识别数据集-统一为market1501数据集进行多数据集联合训练

Good@dz

已于 2023-12-25 16:29:38 修改

阅读量1.9k

点赞数 18

分类专栏：重识别文章标签：数据集

于 2023-12-25 11:19:24 首次发布

本文链接：https://blog.csdn.net/qq_42178122/article/details/135193476

版权

重识别专栏收录该内容

5 篇文章 0 订阅

订阅专栏

一、前言

常用的数据集：
（此处原为一张图片）
数据集下载链接：https://kaiyangzhou.github.io/deep-person-reid/datasets.html
https://kaiyangzhou.github.io/deep-person-reid/datasets.html#sensereid-sensereid

二、数据集合并

第一步：market1501的数据集文件夹格式的创建

market1501的图片命名信息，以图片 0012_c4s1_000826_01.jpg 对数据集命名进行说明

0012 是行人 ID，Market 1501 有 1501 个行人，故行人 ID 范围为 0001-1501
c4 是摄像头编号(camera 4)，表明图片采集自第4个摄像头，一共有 6 个摄像头
s1 是视频的第一个片段(sequence 1)，一个视频包含若干个片段
000826 是视频的第 826 帧图片，表明行人出现在该帧图片中
01 代表第 826 帧图片上的第一个检测框，DPM 检测器可能在一帧图片上生成多个检测框

DPM（Deformable Part Model，可变形部件模型）是一种基于部件的检测模型：它将目标（如行人）视为多个部分的组合，这些部分可以有不同的形状和大小，并且它们之间的相对位置可以变形。例如在行人检测中，部件可能包括头部、手臂、躯干、腿等。这些部件被建模为滤波器，用于在图像中搜索与之对应的特征。

数据集的文件格式分析
下载好的 Market 1501 包括以下几个文件夹：

bounding_box_test 是测试集，包括 19732 张图片。gallery 是通过 DPM 检测器生成的。
bounding_box_train 是训练集，包括 12936 张图片。
query 是待查找的图片集，在 bounding_box_test 中实现查找。这些图片的检测框是手工标注的。
gt_bbox 是手工标注的训练集和测试集图片，包括 25259 张图片，用来区分 “good” “junk” 和 “distractors” 图片。（基本弃用）
gt_query 是一些 Matlab 格式的文件，里面记录了 “good” 和 “junk” 图片的索引，主要被用来评估模型。（基本弃用）

因此，我们只需要创建几个文件夹-bounding_box_test 、bounding_box_train和query。使用的代码如下：

import os
def make_market_dir(dst_dir='./'):
    """Create the empty Market-1501 style folder skeleton under ``dst_dir``.

    Three folders are created beneath ``<dst_dir>/market1501``:
    ``bounding_box_train``, ``query`` and ``bounding_box_test``. Missing
    parent directories are created too, and folders that already exist are
    left untouched, so the function can be called repeatedly.

    Args:
        dst_dir: Directory that will contain the ``market1501`` folder.
    """
    market_root = os.path.join(dst_dir, 'market1501')
    for sub_dir in ('bounding_box_train', 'query', 'bounding_box_test'):
        # exist_ok replaces the separate os.path.exists() check, which could
        # race with another process creating the same folder.
        os.makedirs(os.path.join(market_root, sub_dir), exist_ok=True)
if __name__ == '__main__':
    # Build the empty Market-1501 folder skeleton under ./reID.
    target_root = './reID'
    make_market_dir(dst_dir=target_root)

第二步：market1501数据集抽取

链接：https://pan.baidu.com/s/1Yf-Smagh1SOZzmhl7agzjQ
提取码：8741
将整个market1501数据集作为训练集，抽取的结果一共有 29419 张图片， ID从0001到1501一共1501 个不同ID的行人。

import re
import os
import shutil
 
def extract_market(src_path, dst_dir):
    """Copy the usable Market-1501 images from ``src_path`` into ``dst_dir``.

    File names are kept unchanged. Entries without ``.jpg`` in their name are
    ignored, and images whose person ID parses as ``0`` or ``-1`` are skipped
    as unusable.

    Args:
        src_path: Folder holding images named like
            ``0012_c4s1_000826_01.jpg``.
        dst_dir: Folder that receives the copies.
    """
    id_cam_re = re.compile(r'([-\d]+)_c(\d)')
    unusable_pids = (0, -1)
    for file_name in os.listdir(src_path):
        if '.jpg' in file_name:
            print(file_name)
            # Group 1 is the person ID, group 2 the camera number (unused).
            person_id, _cam = [int(g) for g in id_cam_re.search(file_name).groups()]
            if person_id not in unusable_pids:
                source_file = os.path.join(src_path, file_name)
                target_file = os.path.join(dst_dir, file_name)
                shutil.copy(source_file, target_file)
 
if __name__ == '__main__':
    # The whole of Market-1501 (train, query and test) is pooled into the
    # merged training folder.
    dst_dir = './reID/market1501/bounding_box_train'
    source_dirs = (
        './Market-1501-v15.09.15/bounding_box_train',
        './Market-1501-v15.09.15/query',
        './Market-1501-v15.09.15/bounding_box_test',
    )
    for source_dir in source_dirs:
        extract_market(source_dir, dst_dir)

第三步：CUHK数据集抽取

链接：https://pan.baidu.com/s/1y74mhK0PkIPBscHUxh-uGA
提取码：xvbc
CUHK03一共有 14097 张图片， ID从001502到002968一共1467个不同ID的行人

import glob
import re
import os.path as osp
import shutil
 
import re
import os
import shutil
 
def extract_cuhk03(src_path, dst_dir):
    """Copy CUHK03 images into ``dst_dir`` under Market-1501 style names.

    Every entry whose name contains ``.png`` or ``.jpg`` is parsed as
    ``<pid>_c<camid>_<index>``. The person ID is shifted by 1501 so that it
    continues after the Market-1501 IDs, and the copy is named
    ``<pid zero-padded to 6>_c<camid>_CUHK<index>.jpg``. The file bytes are
    copied unchanged; only the name (and extension) differs.

    Args:
        src_path: Folder with the CUHK03 images.
        dst_dir: Folder that receives the renamed copies.
    """
    name_re = re.compile(r'([-\d]+)_c(\d)_([\d]+)')
    market_last_pid = 1501
    for src_name in os.listdir(src_path):
        if not ('.png' in src_name or '.jpg' in src_name):
            continue
        print(src_name)
        # Groups: person ID, camera number, per-image index.
        person_id, cam_id, index = [int(g) for g in name_re.search(src_name).groups()]
        person_id = person_id + market_last_pid
        new_name = ''.join(
            (str(person_id).zfill(6), '_c', str(cam_id), '_CUHK', str(index), '.jpg')
        )
        shutil.copy(os.path.join(src_path, src_name), os.path.join(dst_dir, new_name))
 
if __name__ == '__main__':
    # All three CUHK03 (detected) splits go into the merged training folder.
    dst_dir = './reID/market1501/bounding_box_train'
    source_dirs = (
        './cuhk03-np/detected/bounding_box_train',
        './cuhk03-np/detected/query',
        './cuhk03-np/detected/bounding_box_test',
    )
    for source_dir in source_dirs:
        extract_cuhk03(source_dir, dst_dir)

第四步：MSMT17数据集抽取

链接：https://pan.baidu.com/s/1EKmiYw9ZltvzJUAlYd06fQ
提取码：abg3
MSMT17一共有 126441 张图片， ID从002969到007069一共4101个不同ID的行人。

import re
import os
import shutil


def msmt2market(dir_path, dst_dir, prev_pid):
    """Copy MSMT17 images into ``dst_dir`` under Market-1501 style names.

    Entries containing ``.jpg`` are parsed as ``<pid>_c<camid>_<index>``.
    Images whose person ID is ``-1`` are skipped. The new person ID is
    ``pid + 1 + prev_pid`` and the copy is named
    ``<pid zero-padded to 6>_c<camid>_MSMT<index>.jpg``.

    Args:
        dir_path: Folder with the MSMT17 images.
        dst_dir: Folder that receives the renamed copies.
        prev_pid: Last person ID already used; new IDs start after it.
    """
    name_re = re.compile(r'([-\d]+)_c([-\d]+)_([\d]+)')
    for src_name in os.listdir(dir_path):
        # Only entries with '.jpg' in their name are treated as images.
        if '.jpg' in src_name:
            print(src_name)
            person_id, cam_id, index = [int(g) for g in name_re.search(src_name).groups()]
            print(person_id)
            if person_id != -1:
                # The extra +1 presumably accounts for source IDs starting at 0.
                shifted_pid = person_id + 1 + prev_pid
                new_name = ''.join(
                    (str(shifted_pid).zfill(6), '_c', str(cam_id), '_MSMT', str(index), '.jpg')
                )
                print(new_name)
                shutil.copy(os.path.join(dir_path, src_name), os.path.join(dst_dir, new_name))



if __name__ == '__main__':
    dst_dir = './reID/market1501/bounding_box_train'
    # (source folder, ID offset) pairs: the train split continues after ID
    # 2968, while query and test share the offset 4009.
    jobs = (
        ('./MSMT17/bounding_box_train', 2968),
        ('./MSMT17/query', 4009),
        ('./MSMT17/bounding_box_test', 4009),
    )
    for source_dir, id_offset in jobs:
        msmt2market(source_dir, dst_dir, id_offset)

第五步：viper数据集抽取

链接：https://pan.baidu.com/s/1J6FAuse1VeFGurWQ7EOpxQ
提取码：1vsg
转换后的viper数据集一共有1264张图片， ID从007070到007943一共632个不同ID的行人。需要注意这里ID不是连续的，不过只要ID跟之前不重复即可

import re
import os
import shutil
 
def extract_viper(src_path, dst_dir, camid=1):
    """Copy VIPeR images into ``dst_dir`` under Market-1501 style names.

    Entries containing ``.bmp`` are parsed as ``<pid>_<number>``. The person
    ID is shifted by ``7069 + 1`` and the copy is named
    ``<pid zero-padded to 6>_c<camid>_viper<number>.jpg``. The file bytes are
    copied unchanged, so the content is still BMP despite the ``.jpg`` name.

    Args:
        src_path: Folder with the images of one VIPeR camera.
        dst_dir: Folder that receives the renamed copies.
        camid: Camera number written into the new file names.
    """
    name_re = re.compile(r'([\d]+)_([\d]+)')
    # 7069 is the last ID of the preceding datasets; VIPeR IDs start at 0,
    # hence the additional +1.
    id_offset = 7069 + 1
    for src_name in os.listdir(src_path):
        if '.bmp' in src_name:
            print(src_name)
            person_id, number = [int(g) for g in name_re.search(src_name).groups()]
            person_id = person_id + id_offset
            new_name = ''.join(
                (str(person_id).zfill(6), '_c', str(camid), '_viper', str(number), '.jpg')
            )
            shutil.copy(os.path.join(src_path, src_name), os.path.join(dst_dir, new_name))
 
if __name__ == '__main__':
    dst_dir = './reID/market1501/bounding_box_train'
    # cam_a is written as camera 1, cam_b as camera 2.
    camera_dirs = (
        ('./VIPeR/cam_a', 1),
        ('./VIPeR/cam_b', 2),
    )
    for camera_dir, camera_number in camera_dirs:
        extract_viper(camera_dir, dst_dir, camid=camera_number)

第六步：prid数据集抽取

链接：https://pan.baidu.com/s/1tkjzN_-g-GwmSY7eCUPisw
提取码：4ttv
转换后的prid数据集一共有2268张图片

import re
import os
import shutil


def extract_prid(src_path, dst_dir, prevID, camid=1):
    """Copy two frames per PRID person folder under Market-1501 style names.

    ``src_path`` is expected to contain sub-folders named ``person_<number>``.
    For each of them only the first and the last ``.png`` frame (in sorted
    file-name order) are copied, to avoid flooding the merged set with
    near-duplicate frames. Copies are named
    ``<pid zero-padded to 6>_c<camid>_prid<frame>.jpg`` where
    ``pid = <number> + prevID``. File bytes are copied unchanged; only the
    name (and extension) differs.

    Entries of ``src_path`` that are not ``person_<number>`` directories, and
    person folders without any ``.png`` file, are skipped.

    Args:
        src_path: Folder holding the ``person_<number>`` sub-folders.
        dst_dir: Folder that receives the renamed copies.
        prevID: Last person ID already used; it is added to every person
            number.
        camid: Camera number written into the new file names.
    """
    pattern = re.compile(r'person_([\d]+)')

    for sub_dir_name in sorted(os.listdir(src_path)):  # e.g. 'person_0001'
        sub_dir_path = os.path.join(src_path, sub_dir_name)
        match = pattern.search(sub_dir_name)
        if match is None or not os.path.isdir(sub_dir_path):
            continue
        pid = int(match.group(1)) + prevID

        # os.listdir() returns entries in arbitrary order, so sort to make
        # "first and last frame" well defined and reproducible.
        frames = sorted(name for name in os.listdir(sub_dir_path) if '.png' in name)
        if not frames:
            continue

        # dict.fromkeys de-duplicates (single-frame folder) and keeps order.
        for img_name in dict.fromkeys((frames[0], frames[-1])):  # e.g. '0001.png'
            print(img_name)
            print(pid)
            dst_img_name = str(pid).zfill(6) + '_c' + str(camid) + '_prid' + img_name.replace('.png', '.jpg')
            print(dst_img_name)
            shutil.copy(os.path.join(sub_dir_path, img_name), os.path.join(dst_dir, dst_img_name))


if __name__ == '__main__':
    src_cam_a = './prid_2011/multi_shot/cam_a'
    src_cam_b = './prid_2011/multi_shot/cam_b'
    dst_dir = './reID/market1501/bounding_box_train'

    # cam_a IDs continue after 7943 and cam_b IDs after 8328. The camera
    # number is passed explicitly so that cam_b images are labelled as
    # camera 2 instead of falling back to the default camera 1.
    extract_prid(src_cam_a, dst_dir, 7943, camid=1)
    extract_prid(src_cam_b, dst_dir, 8328, camid=2)

第七步：ilids数据集抽取

链接：https://pan.baidu.com/s/1FfYx57Zc7iGuCQa1fMRRHA
提取码：yoww
转换后的ilids数据集一共有600张图片

import re
import os
import shutil
 
def extract_ilids(src_path, dst_dir, prevID, camid):
    """Copy one image per iLIDS person folder under a Market-1501 style name.

    ``src_path`` is expected to contain sub-folders named ``person<number>``.
    The destination name ``<pid zero-padded to 6>_c<camid>_ilids.jpg`` (with
    ``pid = <number> + prevID``) has no per-image component, so each person
    folder yields exactly one output file. When a folder holds several
    ``.png`` files, the last one in sorted file-name order is the one copied;
    this keeps the result reproducible instead of depending on the arbitrary
    order of ``os.listdir()`` and avoids copying files only to overwrite
    them. File bytes are copied unchanged; only the name differs.

    Entries of ``src_path`` that are not ``person<number>`` directories, and
    person folders without any ``.png`` file, are skipped.

    Args:
        src_path: Folder holding the ``person<number>`` sub-folders.
        dst_dir: Folder that receives the renamed copies.
        prevID: Last person ID already used; it is added to every person
            number.
        camid: Camera number written into the new file names.
    """
    pattern = re.compile(r'person([\d]+)')

    for sub_dir_name in sorted(os.listdir(src_path)):
        sub_dir_path = os.path.join(src_path, sub_dir_name)
        match = pattern.search(sub_dir_name)
        if match is None or not os.path.isdir(sub_dir_path):
            continue

        img_names = sorted(name for name in os.listdir(sub_dir_path) if '.png' in name)
        if not img_names:
            continue

        # All images of this folder map to the same destination name, so only
        # the one that would end up there is copied.
        img_name = img_names[-1]
        print(img_name)
        pid = int(match.group(1)) + prevID
        dst_img_name = str(pid).zfill(6) + '_c' + str(camid) + '_ilids' + '.jpg'
        shutil.copy(os.path.join(sub_dir_path, img_name), os.path.join(dst_dir, dst_img_name))
 
if __name__ == '__main__':
    dst_dir = './reID/market1501/bounding_box_train'
    id_offset = 9077
    # Both cameras share the same ID offset; only the camera number differs.
    camera_dirs = (
        ('./iLIDS-VID/i-LIDS-VID/images/cam1', 1),
        ('./iLIDS-VID/i-LIDS-VID/images/cam2', 2),
    )
    for camera_dir, camera_number in camera_dirs:
        extract_ilids(camera_dir, dst_dir, id_offset, camera_number)

第八步：grid数据集抽取

链接：https://pan.baidu.com/s/1YbQT2px3Em-3KZTs6pLXmA
提取码：2tbc
grid数据集一共有500张图片

import re
import os
import shutil


def extract_grid(src_path, dst_dir, camid=1):
    """Copy GRID images into ``dst_dir`` under Market-1501 style names.

    Entries containing ``.jpeg`` are considered. The person ID is the first
    run of digits that is followed by ``_``; images with ID ``0`` are
    skipped. The ID is shifted by 9396 and the copy is named
    ``<pid zero-padded to 6>_c<camid>_grid.jpg``. The name carries no
    per-image component, so two images with the same ID and camera number
    map to the same destination file.

    Args:
        src_path: Folder with the GRID images of one split.
        dst_dir: Folder that receives the renamed copies.
        camid: Camera number written into the new file names.
    """
    pid_re = re.compile(r'([\d]+)_')
    id_offset = 9396
    for src_name in os.listdir(src_path):
        if '.jpeg' in src_name:
            print(src_name)
            person_id = int(pid_re.search(src_name).group(1))
            if person_id != 0:
                person_id = person_id + id_offset
                print(person_id)
                new_name = ''.join((str(person_id).zfill(6), '_c', str(camid), '_grid', '.jpg'))
                shutil.copy(os.path.join(src_path, src_name), os.path.join(dst_dir, new_name))


if __name__ == '__main__':
    dst_dir = './reID/market1501/bounding_box_train'
    # The probe split is written as camera 1, the gallery split as camera 2.
    split_dirs = (
        ('./underground_reid/probe', 1),
        ('./underground_reid/gallery', 2),
    )
    for split_dir, camera_number in split_dirs:
        extract_grid(split_dir, dst_dir, camid=camera_number)

第九步：DukeMTMC-reID数据集抽取

链接：https://pan.baidu.com/s/1AviYz5SenijfO5w1TGuEtA
提取码：l0pt

import re
import os
import shutil


def extract_duke(src_path, dst_dir):
    """Copy DukeMTMC-reID images into ``dst_dir`` under Market-1501 style names.

    Entries containing ``.png`` or ``.jpg`` are parsed as
    ``<pid>_c<camid>_f<frame>``. The person ID is shifted by 9646 so that it
    continues after the IDs of the datasets merged earlier, and the copy is
    named ``<pid zero-padded to 6>_c<camid>_Duke<frame>.jpg``.

    Args:
        src_path: Folder with the DukeMTMC-reID images of one split.
        dst_dir: Folder that receives the renamed copies.
    """
    name_re = re.compile(r'([-\d]+)_c(\d)_f([\d]+)')
    id_offset = 9646
    for src_name in os.listdir(src_path):
        if not ('.png' in src_name or '.jpg' in src_name):
            continue
        print(src_name)
        # Groups: person ID, camera number, frame number.
        person_id, cam_id, frame = [int(g) for g in name_re.search(src_name).groups()]
        person_id = person_id + id_offset
        print(person_id, cam_id, frame)
        new_name = ''.join(
            (str(person_id).zfill(6), '_c', str(cam_id), '_Duke', str(frame), '.jpg')
        )
        print(new_name)
        shutil.copy(os.path.join(src_path, src_name), os.path.join(dst_dir, new_name))


if __name__ == '__main__':
    src_train_path = './DukeMTMC-reID/DukeMTMC-reID/bounding_box_train'
    src_test_path = './DukeMTMC-reID/DukeMTMC-reID/bounding_box_test'
    src_query_path = './DukeMTMC-reID/DukeMTMC-reID/query'
    # Same merged training folder that the other dataset scripts write to.
    dst_dir = './reID/market1501/bounding_box_train'

    # Like the other datasets, every split (train, test and query) is pooled
    # into the merged training set.
    extract_duke(src_train_path, dst_dir)
    extract_duke(src_test_path, dst_dir)
    extract_duke(src_query_path, dst_dir)

第十步：SenseReID数据集抽取

import re
import os
import shutil
 
def extract_SenseReID(src_path, dst_dir, fname):
    """Copy SenseReID images into ``dst_dir`` under Market-1501 style names.

    Entries containing ``.jpg`` are parsed as ``<pid>_<camid>``. The person
    ID is shifted by ``16786 + 1`` and the camera number by 1; the copy is
    named ``<pid zero-padded to 6>_c<camid + 1>_SenseReID_<fname>.jpg``.

    Args:
        src_path: Folder with the SenseReID images of one split.
        dst_dir: Folder that receives the renamed copies.
        fname: Text tag appended to every new file name (the callers pass
            the split name).
    """
    name_re = re.compile(r'([\d]+)_([\d]+)')
    id_offset = 16786 + 1
    for src_name in os.listdir(src_path):
        if '.jpg' in src_name:
            print(src_name)
            person_id, cam_id = [int(g) for g in name_re.search(src_name).groups()]
            person_id = person_id + id_offset
            new_name = ''.join(
                (str(person_id).zfill(6), '_c', str(cam_id + 1), '_SenseReID_', fname, '.jpg')
            )
            shutil.copy(os.path.join(src_path, src_name), os.path.join(dst_dir, new_name))
 
if __name__ == '__main__':
    dst_dir = r'E:\reID\market1501\bounding_box_train'
    # Each split is tagged with its own name in the output file names.
    split_dirs = (
        (r'D:\data\SenseReID\test_gallery', 'gallery'),
        (r'D:\data\SenseReID\test_probe', 'probe'),
    )
    for split_dir, split_tag in split_dirs:
        extract_SenseReID(split_dir, dst_dir, split_tag)

代码修改

在market1501.py脚本修改如下代码：

# Around line 41: point the loader at the merged dataset folder.
# data_dir = osp.join(self.data_dir, 'Market-1501-v15.09.15')
data_dir = osp.join(self.data_dir, 'reID/market1501')

# Around line 84: widen the ID and camera ranges for the merged dataset.
# assert 0 <= pid <= 1501  # pid == 0 means background
# assert 1 <= camid <= 6
# NOTE(review): the SenseReID script assigns IDs starting at 16786 + 1, so
# this upper bound looks too low if those images are merged — confirm.
assert 0 <= pid <= 16786  # pid == 0 means background
assert 1 <= camid <= 16

参考链接：
1、行人重识别数据集转换–统一为market1501数据集进行多数据集联合训练
2、行人重识别数据集链接
3、行人重识别多个数据集格式统一为market1501格式