csv to txt

这段代码将图像数据集划分为训练、验证和测试集,并生成对应的txt标注文件。利用pandas、shutil和tqdm等库,对CSV中的图像信息进行处理,创建新的数据集结构,同时调整了背景图的比例。最终,该工具适用于训练模型的场景。
摘要由CSDN通过智能技术生成

1、原版

import pandas as pd
import os
from pathlib import Path
import shutil
from tqdm.notebook import tqdm

# Directory holding the original images; generate_new_dataset copies
# "<full_images>/<filename>" for every CSV row into the new dataset tree.
full_images = "F:/BaiduNetdiskDownload/正式数据/2022.04.06(正式数据)/train_image"


def create_txt_file(path: Path, item):
    """Write a one-line annotation file for ``item`` at ``path``.

    Rows whose ``pets_name`` is '无' are skipped and no file is created.
    Otherwise the file receives ``"<pets_name> <xc> <yc> <w> <h>"``.
    Horizontal values are divided by 5472 and vertical values by 3648
    (presumably the image width/height in pixels -- confirm with the data).
    """
    if item.pets_name == '无':
        return

    # Scale the anchor point and the box extent by the fixed divisors.
    x = item.c_x / 5472
    y = item.c_y / 3648
    box_w = (item.r_x - item.l_x) / 5472
    box_h = (item.l_y - item.r_y) / 3648

    # Some boxes run past the border: cap them so x + w and y + h stay <= 1.
    box_w = min((1 - x), box_w)
    box_h = min((1 - y), box_h)

    # Shift the anchor by half of the (capped) extent.
    xc = x + box_w / 2
    yc = y + box_h / 2

    path.write_text(f"{item.pets_name} {xc} {yc} {box_w} {box_h}")


def generate_new_dataset(dest_dir, bg_ratio):
    """Build a train/val/test image-and-label tree under ``dest_dir``.

    Writes ``dataset.yaml`` into ``dest_dir``; then, for each split, recreates
    ``images/<split>2017`` and ``labels/<split>2017``, copies every image
    named in the split's rows out of ``full_images`` and calls
    ``create_txt_file`` for each row.

    Args:
        dest_dir: Output root directory; created (with parents) if missing.
        bg_ratio: Number of background rows (``pets_name == '无'``) kept in
            the training split, as a fraction of the labelled rows.
    """
    train_csv = "F:/BaiduNetdiskDownload/正式数据/2022.04.06(正式数据)/train.csv"
    val_csv = "F:/BaiduNetdiskDownload/正式数据/2022.04.06(正式数据)/test.csv"

    dest_root = Path(dest_dir)
    # parents=True so a missing intermediate directory does not abort the run.
    dest_root.mkdir(parents=True, exist_ok=True)

    # NOTE(review): nc is 28 while names lists a single class -- confirm the
    # intended class list before training with this file.
    cfg = f"""
path: {dest_dir}
train: images/train2017
val: images/val2017
test: images/test2017
nc: 28
names: ['patric']
"""
    (dest_root / "dataset.yaml").write_text(cfg)

    train_df = pd.read_csv(train_csv, encoding='gb18030')
    labelled = train_df[train_df.pets_name != '无']
    background = train_df[train_df.pets_name == '无']
    # Keep bg_ratio * len(labelled) background rows, but never ask for more
    # rows than exist (DataFrame.sample raises in that case).
    bg_number = min(int(len(labelled) * bg_ratio), len(background))
    train_df = pd.concat([background.sample(bg_number), labelled])

    # val and test are built from the same CSV, so read it only once.
    holdout_df = pd.read_csv(val_csv, encoding='gb18030')
    splits = {
        "train": train_df,
        "val": holdout_df,
        "test": holdout_df,
    }

    for mode, df in splits.items():
        image_folder = dest_root / "images" / f"{mode}2017"
        label_folder = dest_root / "labels" / f"{mode}2017"
        # Start every split from an empty folder.
        for folder in (image_folder, label_folder):
            shutil.rmtree(folder, ignore_errors=True)
            folder.mkdir(parents=True, exist_ok=True)

        for _, item in tqdm(df.iterrows(), total=len(df)):
            filename = str(item.filename)
            shutil.copy(f"{full_images}/{filename}", image_folder / filename)
            # The label shares the image's stem ("a.jpg" -> "a.txt"); the
            # previous "<filename>.txt" produced "a.jpg.txt".
            create_txt_file(label_folder / f"{Path(filename).stem}.txt", item)



# Run the conversion only when executed as a script / notebook cell, so that
# importing this module does not start copying files.
if __name__ == "__main__":
    dest_dir = "F:/BaiduNetdiskDownload/正式数据/2022.04.06(正式数据)/Datasets"
    generate_new_dataset(dest_dir, bg_ratio=0.1)

2、改动版

import pandas as pd
import os
from pathlib import Path
import shutil
from tqdm.notebook import tqdm
from sklearn.model_selection import KFold

# Directory holding the original images; generate_new_dataset copies
# "<full_images>/<filename>" for every CSV row into the new dataset tree.
full_images = "F:/BaiduNetdiskDownload/正式数据/2022.04.06(正式数据)/train_image"


def create_txt_file(path: Path, item):
    """Write a one-line annotation file for ``item`` at ``path``.

    Rows whose ``pets_name`` is '无' are skipped and no file is created.
    Otherwise the file receives ``"<pets_name> <xc> <yc> <w> <h>"``.
    Horizontal values are divided by 5472 and vertical values by 3648
    (presumably the image width/height in pixels -- confirm with the data).
    """
    if item.pets_name == '无':
        return

    # Scale the anchor point and the box extent by the fixed divisors.
    x = item.c_x / 5472
    y = item.c_y / 3648
    box_w = (item.r_x - item.l_x) / 5472
    box_h = (item.l_y - item.r_y) / 3648

    # Some boxes run past the border: cap them so x + w and y + h stay <= 1.
    box_w = min((1 - x), box_w)
    box_h = min((1 - y), box_h)

    # Shift the anchor by half of the (capped) extent.
    xc = x + box_w / 2
    yc = y + box_h / 2

    path.write_text(f"{item.pets_name} {xc} {yc} {box_w} {box_h}")


def generate_new_dataset(dest_dir, bg_ratio):
    """Split train.csv with 2-fold KFold and materialise each fold on disk.

    Writes ``dataset.yaml`` into ``dest_dir``; then, for every fold, recreates
    ``images/<split>2017`` and ``labels/<split>2017`` for train/val/test,
    copies the images named in the fold's rows out of ``full_images`` and
    calls ``create_txt_file`` for each row. The fold's held-out rows are used
    for both ``val`` and ``test``.

    Args:
        dest_dir: Output root directory; created (with parents) if missing.
        bg_ratio: Number of background rows (``pets_name == '无'``) kept in
            the training split, as a fraction of the labelled rows.
    """
    csv_path = "F:/BaiduNetdiskDownload/正式数据/2022.04.06(正式数据)/train.csv"
    all_df = pd.read_csv(csv_path, encoding='gb18030')

    dest_root = Path(dest_dir)
    # parents=True so a missing intermediate directory does not abort the run.
    dest_root.mkdir(parents=True, exist_ok=True)

    # The config does not depend on the fold, so write it once up front.
    cfg = f"""
    path: {dest_dir}
    train: images/train2017
    val: images/val2017
    test: images/test2017
    nc: 1
    names: ['patric']
    """
    (dest_root / "dataset.yaml").write_text(cfg)

    k = 2
    kf = KFold(n_splits=k, shuffle=True)
    # NOTE(review): every fold is written into the same folders, which are
    # wiped at the start of each fold, so only the last fold remains on disk.
    for train_index, test_index in kf.split(all_df):
        print('train_index:%s , test_index: %s ' % (train_index, test_index))
        fold_train = all_df.iloc[train_index]
        fold_val = all_df.iloc[test_index]

        labelled = fold_train[fold_train.pets_name != '无']
        background = fold_train[fold_train.pets_name == '无']
        # Keep bg_ratio * len(labelled) background rows, but never ask for
        # more rows than exist (DataFrame.sample raises in that case).
        bg_number = min(int(len(labelled) * bg_ratio), len(background))
        train_df = pd.concat([background.sample(bg_number), labelled])

        splits = {
            "train": train_df,
            "val": fold_val,
            "test": fold_val,
        }

        for mode, df in splits.items():
            image_folder = dest_root / "images" / f"{mode}2017"
            label_folder = dest_root / "labels" / f"{mode}2017"
            # Start every split from an empty folder.
            for folder in (image_folder, label_folder):
                shutil.rmtree(folder, ignore_errors=True)
                folder.mkdir(parents=True, exist_ok=True)

            for _, item in tqdm(df.iterrows(), total=len(df)):
                filename = str(item.filename)
                shutil.copy(f"{full_images}/{filename}", image_folder / filename)
                # The label shares the image's stem ("a.jpg" -> "a.txt"); the
                # previous "<filename>.txt" produced "a.jpg.txt".
                create_txt_file(label_folder / f"{Path(filename).stem}.txt", item)
    print('aaaaaaaaaaaaaaaa')



# Run the conversion only when executed as a script / notebook cell, so that
# importing this module does not start copying files.
if __name__ == "__main__":
    dest_dir = "F:/BaiduNetdiskDownload/正式数据/2022.04.06(正式数据)/Datasets"
    generate_new_dataset(dest_dir, bg_ratio=0.1)

3、再来一版–融进去能训练的

# Directory holding the original images. The name must be ``full_images``:
# generate_new_dataset reads it when copying each row's image.
full_images = "F:/BaiduNetdiskDownload/AAA/2022.04.06/train_image"
# Output root for the generated dataset tree.
dest_dir = 'F:/BaiduNetdiskDownload/AAA/2022.04.06/datasets'
def create_txt_file(path: Path, item):
    """Write a one-line annotation file for ``item`` at ``path``.

    Rows whose ``pets_name`` is '[]' are skipped and no file is created.
    Otherwise the file receives ``"<pets_name> <x> <y> <w> <h>"``, where
    ``c_x`` and the box width are divided by 5472 and ``c_y`` and the box
    height by 3648 (presumably the image size in pixels -- confirm with the
    data). No border clamping or centre shift is applied in this version.
    """
    if item.pets_name == '[]':
        return

    x = item.c_x / 5472
    y = item.c_y / 3648
    box_w = (item.r_x - item.l_x) / 5472
    box_h = (item.r_y - item.l_y) / 3648

    path.write_text(f"{item.pets_name} {x} {y} {box_w} {box_h}")


def generate_new_dataset(dest_dir, bg_ratio):
    """Build a train/val/test image-and-label tree under ``dest_dir``.

    Writes ``dataset.yaml`` into ``dest_dir``; then, for each split, recreates
    ``images/<split>2017`` and ``labels/<split>2017``, copies every image
    named in the split's rows out of ``full_images`` and calls
    ``create_txt_file`` for each row.

    Args:
        dest_dir: Output root directory; created (with parents) if missing.
        bg_ratio: Number of background rows (``pets_name == '[]'``) kept in
            the training split, as a fraction of the labelled rows.
    """
    train_csv = "F:/BaiduNetdiskDownload/AAA/2022.04.06/train.csv"
    val_csv = "F:/BaiduNetdiskDownload/AAA/2022.04.06/test.csv"

    dest_root = Path(dest_dir)
    # parents=True so a missing intermediate directory does not abort the run.
    dest_root.mkdir(parents=True, exist_ok=True)

    # NOTE(review): the yaml's ``path`` points at ``dd``, not at ``dest_dir``
    # -- presumably the generated tree is moved there before training; confirm.
    dd = 'F:/BaiduNetdiskDownload/AAA/2022.04.06/yolov5-master/datasets'
    cfg = f"""
path: {dd}
train: images/train2017
val: images/val2017
test: images/test2017
nc: 28
names: ['0', '1', '2', '3', '4', '5', '6', '7',
        '8', '9', '10', '11', '12', '13', '14', 
        '15', '16', '17', '18', '19', '20', '21', 
        '22', '23', '24', '25', '26', '27']
"""
    (dest_root / "dataset.yaml").write_text(cfg)

    train_df = pd.read_csv(train_csv, encoding='gb18030')
    labelled = train_df[train_df.pets_name != '[]']
    background = train_df[train_df.pets_name == '[]']
    # Keep bg_ratio * len(labelled) background rows, but never ask for more
    # rows than exist (DataFrame.sample raises in that case).
    bg_number = min(int(len(labelled) * bg_ratio), len(background))
    train_df = pd.concat([background.sample(bg_number), labelled])

    # val and test are built from the same CSV, so read it only once.
    holdout_df = pd.read_csv(val_csv, encoding='gb18030')
    splits = {
        "train": train_df,
        "val": holdout_df,
        "test": holdout_df,
    }

    for mode, df in splits.items():
        image_folder = dest_root / "images" / f"{mode}2017"
        label_folder = dest_root / "labels" / f"{mode}2017"
        # Start every split from an empty folder.
        for folder in (image_folder, label_folder):
            shutil.rmtree(folder, ignore_errors=True)
            folder.mkdir(parents=True, exist_ok=True)

        for _, item in tqdm(df.iterrows(), total=len(df)):
            filename = str(item.filename)
            shutil.copy(f"{full_images}/{filename}", image_folder / filename)
            # Use the path stem for the label name. str.strip('.jpg') removes
            # any of the characters '.', 'j', 'p', 'g' from BOTH ends, so
            # e.g. "pig.jpg" was turned into "i".
            create_txt_file(label_folder / f"{Path(filename).stem}.txt", item)


主要逻辑是:
将一个总的图像库分成train、val、test三个文件夹;在一个大循环里主要做了两件事:对train、val、test中的每个item复制对应的图片,并根据CSV中的标注信息创建对应的txt标注文件。

参考:https://github.com/louis-she/reef-solution

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值