数据集
Fashion Product Images (Small)
44000 products with category labels and images.
目标:所有图片都存放在 images 文件夹下,styles.csv 记录了每张图片对应的属性。根据 articleType 属性将数据分为 142 类,并按 label 分别存放到对应的 /test/label/ 和 /train/label/ 文件夹下。
-> 图片的名称 (1597.jpg)
-> 通过名称在csv中找到对应的class_name (‘Shirts’)
-> 再根据自己建立的对应关系,找到对应的label (‘Shirts’ - 0)
-> 把图片放在对应的label文件夹下 (0)
1. 读取styles.csv
import pandas as pd
import os
import shutil
# Load the product metadata. Rows with a wrong number of fields are dropped
# (with a warning) instead of aborting the whole read.
try:
    # pandas >= 1.3 spelling; `error_bad_lines` was removed in pandas 2.0.
    fashion_data = pd.read_csv('styles.csv', on_bad_lines='warn')
except TypeError:
    # Older pandas does not accept `on_bad_lines`; fall back to the legacy
    # keyword, which has the same skip-and-warn behavior.
    fashion_data = pd.read_csv('styles.csv', error_bad_lines=False)

# Class name of every product, and the product id that names its image file.
articleType = fashion_data['articleType']
ids = fashion_data['id']
2. 数据集被分为多少类?
# Distinct articleType values in order of first appearance; the position of
# a name in this list is later used as its integer label.
classes = list(dict.fromkeys(articleType))
print(len(classes))  # 142
3. 创建folders
classes_dict:

| cur_class_name | cur_label |
|---|---|
| ‘Shirts’ | 0 |
| ‘Jeans’ | 1 |
| … | … |
| … | 141 |
# Build the class-name -> integer-label mapping and create one output folder
# per label under img/, test/ and train/ in the current working directory.
classes_dict = {}
nb_test_dict = {}
cwd = os.getcwd()
for cur_label, cur_class in enumerate(classes):
    classes_dict[cur_class] = cur_label
    # img/ holds every image of the class; test/ and train/ hold the split.
    # exist_ok=True lets the script be re-run without FileExistsError.
    for subset in ('img', 'test', 'train'):
        os.makedirs(os.path.join(cwd, subset, str(cur_label)), exist_ok=True)
    # Per-label count of images already placed in test/; starts at zero and
    # is incremented by the train/test split step.
    nb_test_dict[cur_label] = 0
4. 把每一张图片放在对应类文件夹下
# Copy every image listed in styles.csv from images/ into img/<label>/,
# where <label> is the integer assigned to the row's articleType.
src_dir = os.getcwd() + '/images/'
dst_root = os.getcwd() + '/img/'
for img_id, class_name in zip(ids, articleType):
    file_name = str(img_id) + '.jpg'
    label = classes_dict[class_name]
    shutil.copyfile(src_dir + file_name, dst_root + str(label) + '/' + file_name)
5. 每一类各有多少图片?
nb_dict:

| cur_label | nb_of_imgs |
|---|---|
| 0 | 3215 |
| 1 | 608 |
| … | … |
| 141 | … |
# Count the files found under img/<label>/ for every label.
nb_dict = {}
for cur_label, _ in enumerate(classes):
    folder_path = os.getcwd() + '/img/' + str(cur_label)
    # os.walk yields one (root, dirs, files) tuple per directory visited;
    # the count recorded is the one from the last directory yielded.
    for _root, _dirs, files in os.walk(folder_path):
        nb_dict[cur_label] = len(files)
6. 按比例分为test和train
# Split every class into test/ (about 20%) and train/ (the rest).
# The split follows the row order of styles.csv: the first images of a class
# fill its test quota, all later ones go to train. It is not randomized.
# Relies on nb_test_dict holding 0 for every label before this step runs.
test_ratio = 0.2
cwd = os.getcwd()
for cur_id, cur_type in zip(ids, articleType):
    belongs_to_label = classes_dict[cur_type]
    # Test quota for this class, rounded down.
    nb_test = int(nb_dict[belongs_to_label] * test_ratio)
    file_name = str(cur_id) + '.jpg'
    old_path = os.path.join(cwd, 'images', file_name)
    # Strict `<`: with `<=` each class received nb_test + 1 test images.
    goes_to_test = nb_test_dict[belongs_to_label] < nb_test
    subset = 'test' if goes_to_test else 'train'
    new_path = os.path.join(cwd, subset, str(belongs_to_label), file_name)
    shutil.copyfile(old_path, new_path)
    if goes_to_test:
        # Count only after a successful copy.
        nb_test_dict[belongs_to_label] += 1