1. Original version
import pandas as pd
import os
from pathlib import Path
import shutil
from tqdm.notebook import tqdm
full_images = "F:/BaiduNetdiskDownload/正式数据/2022.04.06(正式数据)/train_image"
def create_txt_file(path: Path, item):
    """Generate a txt label file from `item` and write it to `path`."""
    if item.pets_name == '无':  # no target in this image, so no label file is written
        return
    anno_str = []
    w = item.r_x - item.l_x
    h = item.l_y - item.r_y
    x, y, w, h = item.c_x / 5472, item.c_y / 3648, w / 5472, h / 3648
    # some boxes run past the image border, so clip width/height
    h = min((1 - y), h)
    w = min((1 - x), w)
    xc = x + w / 2
    yc = y + h / 2
    name = item.pets_name
    anno_str.append(f"{name} {xc} {yc} {w} {h}")
    path.write_text("\n".join(anno_str))
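To get a feel for the numbers, here is the same arithmetic on hypothetical CSV values (column meanings are assumed from the code above; 5472 x 3648 is presumably the image width and height):

# Hypothetical row values, only to illustrate the arithmetic in create_txt_file
l_x, r_x, l_y, r_y = 1000, 1500, 2100, 1800
c_x, c_y = 1250, 1950
w, h = (r_x - l_x) / 5472, (l_y - r_y) / 3648   # 500/5472 ≈ 0.0914, 300/3648 ≈ 0.0822
x, y = c_x / 5472, c_y / 3648                   # ≈ 0.2284, ≈ 0.5345
xc, yc = x + w / 2, y + h / 2                   # ≈ 0.2741, ≈ 0.5757
print(f"cat {xc} {yc} {w} {h}")                 # one line of the generated .txt ("cat" stands in for pets_name)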
def generate_new_dataset(dest_dir, bg_ratio):
    train_csv = "F:/BaiduNetdiskDownload/正式数据/2022.04.06(正式数据)/train.csv"
    val_csv = "F:/BaiduNetdiskDownload/正式数据/2022.04.06(正式数据)/test.csv"
    # mkdir(parents=True, exist_ok=True): create parent dirs if they are missing,
    # and do not raise if the directory already exists
    Path(dest_dir).mkdir(exist_ok=True)
    # dataset config for yolov5 (note: yolov5 expects len(names) == nc)
    cfg = f"""
path: {dest_dir}
train: images/train2017
val: images/val2017
test: images/test2017
nc: 28
names: ['patric']
"""
    # write the config to dataset.yaml
    cfg_file = Path(dest_dir) / "dataset.yaml"
    cfg_file.write_text(cfg)
    # read train.csv
    train_df = pd.read_csv(train_csv, encoding='gb18030')
    # rows of train.csv that actually contain a target
    train_df_with_gt = train_df[train_df.pets_name != '无']
    # background images default to 10% of the training set, i.e. 10% of the labelled
    # rows (101 here); they are then concatenated with the labelled rows
    bg_number = int(len(train_df_with_gt) * bg_ratio)
    # sample(n) draws bg_number random rows from the unlabelled part
    train_df_without_gt = train_df[train_df.pets_name == '无'].sample(bg_number)
    train_df = pd.concat([train_df_without_gt, train_df_with_gt])
    # dict of dataframes: dfs['train'] = train_df
    dfs = {
        "train": train_df,
        "val": pd.read_csv(val_csv, encoding='gb18030'),
        "test": pd.read_csv(val_csv, encoding='gb18030')
    }
    # inner loop; the first mode is "train"
    for mode in ["train", "val", "test"]:
        image_folder = Path(dest_dir) / "images" / f"{mode}2017"
        shutil.rmtree(image_folder, ignore_errors=True)
        image_folder.mkdir(parents=True, exist_ok=True)
        label_folder = Path(dest_dir) / "labels" / f"{mode}2017"
        shutil.rmtree(label_folder, ignore_errors=True)
        label_folder.mkdir(parents=True, exist_ok=True)
        df = dfs[mode]
        # iterate over the rows of df (the first index happens to be 119 for train_df);
        # total=len(df) (1120 here) tells tqdm how many rows there are.
        # df.iterrows() yields (index, row) pairs; the unused index goes into _
        for _, item in tqdm(df.iterrows(), total=len(df)):
            # two things happen per row: copy the image into the train/val/test image folder,
            # and write the matching txt label into the label folder
            # file_name = item.filename
            # # image
            # shutil.copy(item.image_path, image_folder / f"{file_name}.jpg")
            # # label
            # create_txt_file(Path(label_folder) / f"{file_name}.txt", item)
            # copy the image referenced by item.filename into image_folder
            shutil.copy(f"{full_images}/{item.filename}", f"{image_folder}/{item.filename}")
            create_txt_file(Path(label_folder) / f"{item.filename}.txt", item)

dest_dir = "F:/BaiduNetdiskDownload/正式数据/2022.04.06(正式数据)/Datasets"
generate_new_dataset(dest_dir, bg_ratio=0.1)
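After running this, a quick way to double-check the output is to scan a label folder for coordinates outside [0, 1], the case the clipping lines in create_txt_file guard against. A minimal sketch with a hypothetical helper (not part of the original script):

from pathlib import Path

def check_labels(label_dir):
    """Hypothetical helper: print label lines whose coordinates fall outside [0, 1]."""
    for txt in Path(label_dir).glob("*.txt"):
        for line in txt.read_text().splitlines():
            parts = line.split()
            coords = [float(v) for v in parts[-4:]]   # xc, yc, w, h
            if any(v < 0 or v > 1 for v in coords):
                print(txt.name, line)

check_labels(f"{dest_dir}/labels/train2017")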
2. Modified version (KFold split)
import pandas as pd
import os
from pathlib import Path
import shutil
from tqdm.notebook import tqdm
from sklearn.model_selection import KFold
full_images = "F:/BaiduNetdiskDownload/正式数据/2022.04.06(正式数据)/train_image"
def create_txt_file(path: Path, item):
    """Generate a txt label file from `item` and write it to `path`."""
    if item.pets_name == '无':  # no target in this image, so no label file is written
        return
    anno_str = []
    w = item.r_x - item.l_x
    h = item.l_y - item.r_y
    x, y, w, h = item.c_x / 5472, item.c_y / 3648, w / 5472, h / 3648
    # some boxes run past the image border, so clip width/height
    h = min((1 - y), h)
    w = min((1 - x), w)
    xc = x + w / 2
    yc = y + h / 2
    name = item.pets_name
    anno_str.append(f"{name} {xc} {yc} {w} {h}")
    path.write_text("\n".join(anno_str))
def generate_new_dataset(dest_dir, bg_ratio):
    # update begin: build the train/val split with KFold instead of a fixed test.csv
    all_csv = "F:/BaiduNetdiskDownload/正式数据/2022.04.06(正式数据)/train.csv"
    all_csv = pd.read_csv(all_csv, encoding='gb18030')
    k = 2
    kf = KFold(n_splits=k, shuffle=True)
    for train_index, test_index in kf.split(all_csv):
        print('train_index:%s , test_index: %s ' % (train_index, test_index))
        train_csv = all_csv.iloc[train_index]
        val_csv = all_csv.iloc[test_index]
    # update end
    # mkdir(parents=True, exist_ok=True): create parent dirs if they are missing,
    # and do not raise if the directory already exists
    Path(dest_dir).mkdir(exist_ok=True)
    # dataset config for yolov5
    cfg = f"""
path: {dest_dir}
train: images/train2017
val: images/val2017
test: images/test2017
nc: 1
names: ['patric']
"""
    # write the config to dataset.yaml
    cfg_file = Path(dest_dir) / "dataset.yaml"
    cfg_file.write_text(cfg)
    # the training split now comes from the KFold loop above
    train_df = train_csv
    # rows that actually contain a target
    train_df_with_gt = train_df[train_df.pets_name != '无']
    # background images default to 10% of the labelled rows (101 here);
    # they are then concatenated with the labelled rows
    bg_number = int(len(train_df_with_gt) * bg_ratio)
    # sample(n) draws bg_number random rows from the unlabelled part
    train_df_without_gt = train_df[train_df.pets_name == '无'].sample(bg_number)
    train_df = pd.concat([train_df_without_gt, train_df_with_gt])
    # dict of dataframes: dfs['train'] = train_df
    dfs = {
        "train": train_df,
        "val": val_csv,
        "test": val_csv
    }
    # inner loop; the first mode is "train"
    for mode in ["train", "val", "test"]:
        image_folder = Path(dest_dir) / "images" / f"{mode}2017"
        shutil.rmtree(image_folder, ignore_errors=True)
        image_folder.mkdir(parents=True, exist_ok=True)
        label_folder = Path(dest_dir) / "labels" / f"{mode}2017"
        shutil.rmtree(label_folder, ignore_errors=True)
        label_folder.mkdir(parents=True, exist_ok=True)
        df = dfs[mode]
        # iterate over the rows of df; total=len(df) tells tqdm how many rows there are.
        # df.iterrows() yields (index, row) pairs; the unused index goes into _
        for _, item in tqdm(df.iterrows(), total=len(df)):
            # two things happen per row: copy the image into the train/val/test image folder,
            # and write the matching txt label into the label folder
            # file_name = item.filename
            # # image
            # shutil.copy(item.image_path, image_folder / f"{file_name}.jpg")
            # # label
            # create_txt_file(Path(label_folder) / f"{file_name}.txt", item)
            # copy the image listed in this CSV row into image_folder
            shutil.copy(f"{full_images}/{item.filename}", f"{image_folder}/{item.filename}")
            create_txt_file(Path(label_folder) / f"{item.filename}.txt", item)
            print('aaaaaaaaaaaaaaaa')  # leftover debug print

dest_dir = "F:/BaiduNetdiskDownload/正式数据/2022.04.06(正式数据)/Datasets"
generate_new_dataset(dest_dir, bg_ratio=0.1)
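Note that the KFold loop above only keeps the last fold's split (train_csv and val_csv are overwritten on each iteration). If the intent is to build one dataset per fold, a minimal sketch along these lines could work; the fold_dir naming scheme is hypothetical:

import pandas as pd
from sklearn.model_selection import KFold

all_csv = pd.read_csv("F:/BaiduNetdiskDownload/正式数据/2022.04.06(正式数据)/train.csv", encoding='gb18030')
dest_dir = "F:/BaiduNetdiskDownload/正式数据/2022.04.06(正式数据)/Datasets"
kf = KFold(n_splits=2, shuffle=True, random_state=0)
for fold, (train_index, test_index) in enumerate(kf.split(all_csv)):
    train_csv = all_csv.iloc[train_index]
    val_csv = all_csv.iloc[test_index]
    fold_dir = f"{dest_dir}_fold{fold}"   # e.g. .../Datasets_fold0, .../Datasets_fold1 (hypothetical)
    # ...then run the same image-copy / label-writing steps with fold_dir as the target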
3. One more version: integrated so it can actually train
import pandas as pd
from pathlib import Path
import shutil
from tqdm.notebook import tqdm

full_images = "F:/BaiduNetdiskDownload/AAA/2022.04.06/train_image"
dest_dir = 'F:/BaiduNetdiskDownload/AAA/2022.04.06/datasets'
def create_txt_file(path: Path, item):
    """Generate a txt label file from `item` and write it to `path`."""
    if item.pets_name == '[]':  # no target in this image, so no label file is written
        return
    anno_str = []
    w = item.r_x - item.l_x
    h = item.r_y - item.l_y
    x, y, w, h = item.c_x / 5472, item.c_y / 3648, w / 5472, h / 3648
    # # some boxes run past the image border
    # h = min((1 - y), h)
    # w = min((1 - x), w)
    #
    # xc = x + w / 2
    # yc = y + h / 2
    name = item.pets_name
    anno_str.append(f"{name} {x} {y} {w} {h}")
    path.write_text("\n".join(anno_str))
def generate_new_dataset(dest_dir, bg_ratio):
    train_csv = "F:/BaiduNetdiskDownload/AAA/2022.04.06/train.csv"
    val_csv = "F:/BaiduNetdiskDownload/AAA/2022.04.06/test.csv"
    # mkdir(parents=True, exist_ok=True): create parent dirs if they are missing,
    # and do not raise if the directory already exists
    Path(dest_dir).mkdir(exist_ok=True)
    # dataset config for yolov5; here path points at the copy under yolov5-master
    dd = 'F:/BaiduNetdiskDownload/AAA/2022.04.06/yolov5-master/datasets'
    cfg = f"""
path: {dd}
train: images/train2017
val: images/val2017
test: images/test2017
nc: 28
names: ['0', '1', '2', '3', '4', '5', '6', '7',
        '8', '9', '10', '11', '12', '13', '14',
        '15', '16', '17', '18', '19', '20', '21',
        '22', '23', '24', '25', '26', '27']
"""
    # write the config to dataset.yaml
    cfg_file = Path(dest_dir) / "dataset.yaml"
    cfg_file.write_text(cfg)
    # read train.csv
    train_df = pd.read_csv(train_csv, encoding='gb18030')
    # rows of train.csv that actually contain a target
    train_df_with_gt = train_df[train_df.pets_name != '[]']
    # background images default to 10% of the labelled rows (101 here);
    # they are then concatenated with the labelled rows
    bg_number = int(len(train_df_with_gt) * bg_ratio)
    # sample(n) draws bg_number random rows from the unlabelled part
    train_df_without_gt = train_df[train_df.pets_name == '[]'].sample(bg_number)
    train_df = pd.concat([train_df_without_gt, train_df_with_gt])
    # dict of dataframes: dfs['train'] = train_df
    dfs = {
        "train": train_df,
        "val": pd.read_csv(val_csv, encoding='gb18030'),
        "test": pd.read_csv(val_csv, encoding='gb18030'),
    }
    # inner loop; the first mode is "train"
    for mode in ["train", "val", "test"]:
        image_folder = Path(dest_dir) / "images" / f"{mode}2017"
        shutil.rmtree(image_folder, ignore_errors=True)
        image_folder.mkdir(parents=True, exist_ok=True)
        label_folder = Path(dest_dir) / "labels" / f"{mode}2017"
        shutil.rmtree(label_folder, ignore_errors=True)
        label_folder.mkdir(parents=True, exist_ok=True)
        df = dfs[mode]
        # iterate over the rows of df; total=len(df) tells tqdm how many rows there are.
        # df.iterrows() yields (index, row) pairs; the unused index goes into _
        for _, item in tqdm(df.iterrows(), total=len(df)):
            # two things happen per row: copy the image into the train/val/test image folder,
            # and write the matching txt label into the label folder
            # file_name = item.filename
            # # image
            # shutil.copy(item.image_path, image_folder / f"{file_name}.jpg")
            # # label
            # create_txt_file(Path(label_folder) / f"{file_name}.txt", item)
            # copy the image listed in this CSV row into image_folder
            shutil.copy(f"{full_images}/{item.filename}", f"{image_folder}/{item.filename}")
            # Path(...).stem drops only the extension; str.strip('.jpg') would strip characters
            new_filename = Path(item.filename).stem
            create_txt_file(Path(label_folder) / f"{new_filename}.txt", item)
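On the filename handling above: str.strip removes a set of characters from both ends rather than a suffix, which is why Path(...).stem is the safe way to drop the extension. A quick illustration:

from pathlib import Path

print("pig_001.jpg".strip(".jpg"))      # 'ig_001'  - strips any of . j p g from both ends
print(Path("pig_001.jpg").stem)         # 'pig_001' - drops only the extension
print("pig_001.jpg".rsplit(".", 1)[0])  # 'pig_001' - equivalent alternative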
The main logic:
Split one master image library into train, val, and test folders. One big loop then does two things for every item in each of train, val, and test: copy the image into the corresponding images folder, and create the matching txt label file from the spreadsheet (CSV) row.
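For reference, the directory layout the scripts above produce:

Datasets/
├── dataset.yaml
├── images/
│   ├── train2017/   <- copied .jpg files
│   ├── val2017/
│   └── test2017/
└── labels/
    ├── train2017/   <- one .txt per image that has a target ("name xc yc w h")
    ├── val2017/
    └── test2017/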
Reference: https://github.com/louis-she/reef-solution