源码url:
https://github.com/layumi/Person_reID_baseline_pytorch/blob/master/prepare.py
读数据处理代码注释如下:
import os
import shutil # shutil模块提供了许多关于文件和文件集合的高级操作,特别提供了支持文件复制和删除的功能
# You only need to change this line to your dataset download path
download_path = 'F:/market1501'  # root directory of the downloaded dataset

# Stop right away when the dataset root is missing: os.mkdir(save_path) below
# cannot succeed without its parent directory, so continuing would only replace
# this hint with a less helpful error.
if not os.path.isdir(download_path):
    print('please change the download_path')
    raise SystemExit(1)

# Output root for the re-organised copies: <download_path>/pytorch
save_path = download_path + '/pytorch'
if not os.path.isdir(save_path):
    os.mkdir(save_path)
#-----------------------------------------
#query
# Regroup the query images into one sub-directory per ID:
#   <download_path>/query/<name> -> <download_path>/pytorch/query/<ID>/<name>
query_path = download_path + '/query'  # source directory
query_save_path = download_path + '/pytorch/query'  # destination root
if not os.path.isdir(query_save_path):
    os.mkdir(query_save_path)

for root, dirs, files in os.walk(query_path, topdown=True):
    for name in files:
        # Skip every entry whose name does not end in 'jpg'.
        if not name.endswith('jpg'):
            continue
        # The text before the first '_' of the file name is used as the ID.
        ID = name.split('_')
        src_path = query_path + '/' + name
        dst_path = query_save_path + '/' + ID[0]
        # Create the per-ID directory the first time this ID is seen.
        if not os.path.isdir(dst_path):
            os.mkdir(dst_path)
        # copyfile copies the file contents only (no metadata).
        shutil.copyfile(src_path, dst_path + '/' + name)
#-----------------------------------------
#multi-query
# Optional step: only when a gt_bbox directory exists, regroup its images by
# ID into <download_path>/pytorch/multi-query/<ID>/.
query_path = '/'.join([download_path, 'gt_bbox'])
# for dukemtmc-reid, we do not need multi-query
if os.path.isdir(query_path):
    query_save_path = '/'.join([download_path, 'pytorch', 'multi-query'])
    if not os.path.isdir(query_save_path):
        os.mkdir(query_save_path)

    for _root, _dirs, image_names in os.walk(query_path, topdown=True):
        for image_name in image_names:
            # Entries whose name does not end in 'jpg' are ignored.
            if image_name[-3:] != 'jpg':
                continue
            # ID = text before the first '_' of the file name.
            id_dir = '/'.join([query_save_path, image_name.split('_')[0]])
            if not os.path.isdir(id_dir):
                os.mkdir(id_dir)
            shutil.copyfile(
                '/'.join([query_path, image_name]),
                '/'.join([id_dir, image_name]),
            )
#-----------------------------------------
#gallery
# Regroup the images of bounding_box_test by ID into
# <download_path>/pytorch/gallery/<ID>/.
gallery_path = '/'.join([download_path, 'bounding_box_test'])
gallery_save_path = '/'.join([download_path, 'pytorch', 'gallery'])
if not os.path.isdir(gallery_save_path):
    os.mkdir(gallery_save_path)

for _root, _dirs, image_names in os.walk(gallery_path, topdown=True):
    for image_name in image_names:
        # Entries whose name does not end in 'jpg' are ignored.
        if image_name[-3:] != 'jpg':
            continue
        # ID = text before the first '_' of the file name.
        id_dir = '/'.join([gallery_save_path, image_name.split('_')[0]])
        if not os.path.isdir(id_dir):
            os.mkdir(id_dir)
        shutil.copyfile(
            '/'.join([gallery_path, image_name]),
            '/'.join([id_dir, image_name]),
        )
#---------------------------------------
#train_all
# Copy every image of bounding_box_train, grouped by ID, into
# <download_path>/pytorch/train_all/<ID>/.
train_path = '/'.join([download_path, 'bounding_box_train'])
train_save_path = '/'.join([download_path, 'pytorch', 'train_all'])
if not os.path.isdir(train_save_path):
    os.mkdir(train_save_path)

for _root, _dirs, image_names in os.walk(train_path, topdown=True):
    for image_name in image_names:
        # Entries whose name does not end in 'jpg' are ignored.
        if image_name[-3:] != 'jpg':
            continue
        # ID = text before the first '_' of the file name.
        id_dir = '/'.join([train_save_path, image_name.split('_')[0]])
        if not os.path.isdir(id_dir):
            os.mkdir(id_dir)
        shutil.copyfile(
            '/'.join([train_path, image_name]),
            '/'.join([id_dir, image_name]),
        )
#---------------------------------------
#train_val
# Split bounding_box_train into a training and a validation set: for every ID
# the first image encountered is copied to pytorch/val/<ID>/ and all remaining
# images of that ID are copied to pytorch/train/<ID>/.
train_path = download_path + '/bounding_box_train'  # source directory
train_save_path = download_path + '/pytorch/train'  # training destination
val_save_path = download_path + '/pytorch/val'  # validation destination
# Check the two output roots independently so that a missing val directory is
# still created when the train directory already exists.
if not os.path.isdir(train_save_path):
    os.mkdir(train_save_path)
if not os.path.isdir(val_save_path):
    os.mkdir(val_save_path)

# IDs whose validation image has already been chosen during this run.  The
# decision is tracked in memory rather than by testing whether train/<ID>
# exists on disk: with directories left over from an earlier run, the on-disk
# test would send every image, including the validation one, into train.
# NOTE: repeating the same split on a re-run assumes os.walk yields the files
# in the same order as before.
val_ids = set()
for root, dirs, files in os.walk(train_path, topdown=True):
    for name in files:
        # Skip every entry whose name does not end in 'jpg'.
        if not name.endswith('jpg'):
            continue
        # The text before the first '_' of the file name is used as the ID.
        ID = name.split('_')
        src_path = train_path + '/' + name
        train_dst = train_save_path + '/' + ID[0]
        val_dst = val_save_path + '/' + ID[0]
        # Both per-ID directories are created the first time an ID is seen.
        if not os.path.isdir(train_dst):
            os.mkdir(train_dst)
        if not os.path.isdir(val_dst):
            os.mkdir(val_dst)
        if ID[0] in val_ids:
            dst_path = train_dst
        else:
            val_ids.add(ID[0])
            dst_path = val_dst  # first image is used as val image
        shutil.copyfile(src_path, dst_path + '/' + name)