I created this function, which removes from IMG_PATH every file that is not listed in the annotation file (to avoid errors in the DataBunch); for files that do have annotations, it creates the image mask:
# lbl_names = get_image_files(pathmask_train)
# #lbl_names[:3]
# Category names passed to getCatIds below to select the annotations of interest.
CATEGORY_NAMES=['row']
# Location of the annotation JSON, built by string concatenation from ROOT_PATH.
ANNOTATION_FILE=ROOT_PATH+'allrows/train.json'#effectively pathmask_train in str form, kept this way to line up with the reference site
#ANNOTATION_FILE
# NOTE(review): this rebinds the name `coco` to the object returned by coco.COCO(...),
# so afterwards `coco` no longer refers to whatever exposed `.COCO` (presumably the
# pycocotools module); running this statement a second time would likely fail - confirm.
coco = coco.COCO(ANNOTATION_FILE)
#coco
# Ids of the categories named in CATEGORY_NAMES.
catIds = coco.getCatIds(catNms=CATEGORY_NAMES);
# Ids of the images selected by those category ids.
imgIds = coco.getImgIds(catIds=catIds);
# Image records for the selected ids (presumably one dict per image, including
# a 'file_name' entry, since dataPreparation reads that column - verify).
imgDict = coco.loadImgs(imgIds)
# Bare expression: has no effect in a script; presumably left for notebook cell output.
len(imgIds) , len(catIds)
# Tabular view of the image records; dataPreparation looks file names up in it.
imgDF = pd.DataFrame.from_dict(imgDict)
#imgDF[:3]
def dataPreparation():
    """Prune IMG_PATH down to annotated images and build a mask for each one.

    Walks the entries returned by ``listdir(IMG_PATH)``. An entry whose name
    does not appear in the ``file_name`` column of ``imgDF`` is deleted from
    disk; every other entry is passed to ``createImageForMask``. The numbers
    of deleted and processed entries are printed once the walk ends.

    The walk stops as soon as the 1-based entry counter equals
    ``IMG_COUNT_LIMIT``, and that entry itself is not handled, so at most
    ``IMG_COUNT_LIMIT - 1`` entries are touched per call. A limit below 1
    never equals the counter, so no cut-off applies in that case.

    Warning: this permanently removes files from ``IMG_PATH``.
    """
    # Build the lookup once: a set membership test replaces filtering the
    # whole DataFrame again for every directory entry.
    annotated_names = set(imgDF['file_name'])

    deleted = 0
    processed = 0
    for position, name in enumerate(listdir(IMG_PATH), start=1):
        # NOTE(review): stopping *before* handling the IMG_COUNT_LIMIT-th entry
        # looks like an off-by-one, but it is kept as-is because this function
        # deletes files and callers may rely on the current cut-off.
        if position == IMG_COUNT_LIMIT:
            break
        if name in annotated_names:
            createImageForMask(name)
            processed += 1
        else:
            # No annotation refers to this file, so it is removed from disk.
            os.remove(IMG_PATH / name)
            deleted += 1

    print(f'deleted {deleted} files')
    print(f'processed {processed} files')