如何修改.json文件的内容?

以下是项目过程中遇到的一些数据清洗,预处理和统计等常规操作,有需要的可以自行借鉴下相关模块代码,转载或引用请注明。

###################################################################################
# -*- coding: utf-8 -*-
# __author__ = 'Jack'
# __date__ = 2019/07/09
# 目录下所需文件:
# 1- menu.xlsx: 存放菜单的Id、Name、Price三个属性
# 2- origin.json: 存放原始标注图片的标注信息
# 3- modify.json: 待检查的json文件
# 操作文档说明:
# 1- 拿到文件先抽样检查,确保整体无误,更新下menu.xlsx
# 2- 将新增图片放入到not_rename_file文件夹下
# 3- 将相应的json文件命名为modify.json放到与data_process.py文件同级的目录下
# 4- 修改相应的超参数设置(INDEX_START),运行以下程序:
# modify = Modify()
# modify.rename_key()  # 修改json文件相应属性名
# modify.rename_file_name()  # 修改图片文件名
# # modify.remark_label()  # 如需修改标签值则调用此函数
# Check().check_label()
# 5- 运行结束后检查origin.json是否有新增标注数据信息,若出现错误,则根据提示进行修正再次重新运行
# 6- 重复步骤1-5,直至将所有批次的新增数据处理完毕,跳转到7-
# 7- 在主程序中将第4-调用到的所有程序注释掉,确保不被运行到
# 8- 修改相应的超参数(MORE_THAN_NUM),并运行以下程序:
# operate = Operate()
# operate.shuffler_data()
# label_name, _ = operate.feature_map()
# number_per_food_train, number_per_food_val, number_per_food_test, nv, nt = operate.counts(label_name)
# choose_label = \
#     operate.visualize(number_per_food_train, number_per_food_val, number_per_food_test, label_name, nv, nt)
# train_list, val_list, test_list = operate.get_label(choose_label)
# operate.delete_photo('./train', train_list)
# operate.delete_photo('./val', val_list)
# operate.delete_photo('./test', test_list)
# 执行完以上程序需记录“样本数超过?的标签值有?种,即[...]”这两个数据(外加中文标签),并把train/val/test更新
# 训练程序配置
# (1)在caipinshibie.py文件搜索changes修改相应参数
# (2)在mrcnn文件夹下的model.py文件定位到1711行修改相应的augmentation为None或True
# (3)运行训练文件
###################################################################################
import json
import xmltodict
import pandas as pd
from collections import OrderedDict
import matplotlib.pyplot as plt
import os
import shutil
import datetime
import warnings


# System-level settings
plt.rcParams['font.sans-serif'] = ['SimHei']  # SimHei font, so Chinese text in figures is not garbled
plt.rcParams['axes.unicode_minus'] = False  # display the minus sign correctly
warnings.filterwarnings("ignore")  # silence all warnings

# Hyper-parameters
MODIFY_JSON_NAME = 'modify.json'  # JSON file waiting to be checked
ORIGIN_JSON_NAME = 'origin.json'  # JSON file that already exists
OUTPUT_JSON_NAME = 'output.json'  # JSON file obtained after filtering
TRAIN_VAL_DIVIDE_JSON_NAME = 'via_region_data.json'  # JSON file produced when splitting train/test sets
MENU_EXCEL_NAME = 'menu.xlsx'  # menu workbook; must be kept up to date
MODIFY_FILE_PATH = './not_rename_file/'  # folder holding the images whose file names still have to be changed
MOVE_FILE_PATH = './json_file/'  # folder holding the images that have already been processed
OUTPUT_FILE_PATH = './output_file/'  # folder holding the images kept after filtering

INDEX_START = 955  # starting index for newly added images
MORE_THAN_NUM = 70  # occurrence threshold for selecting dishes (Operate.visualize compares with >=)


class Modify(object):
    """Rename image files and rewrite keys / labels of an annotation JSON.

    Methods:
        rename_file_name(): batch-rename the files of a folder.
        rename_key(): rename the JSON keys and their ``filename`` attribute.
        remark_label(): replace one label value by another.

    Every optional argument left at ``None`` falls back to the matching
    module-level setting when the method is called.
    """

    @staticmethod
    def rename_file_name(file_path=None, index_start=None):
        """Rename every file of a folder to ``IMG_<4-digit index>.jpg``.

        Files are numbered in sorted-name order so the result is reproducible
        (``os.listdir`` gives no ordering guarantee). Sub-directories are left
        untouched.

        :param file_path: folder to process; defaults to MODIFY_FILE_PATH.
        :param index_start: index of the first file; defaults to INDEX_START.
        """
        if file_path is None:
            file_path = MODIFY_FILE_PATH
        if index_start is None:
            index_start = INDEX_START

        old_names = sorted(
            name for name in os.listdir(file_path)
            if os.path.isfile(os.path.join(file_path, name))
        )

        # Pass 1: park every file under a temporary name. Renaming straight to
        # the final name can hit a file that has not been processed yet (for
        # example when the folder already contains IMG_xxxx.jpg files), which
        # os.rename silently replaces on POSIX and rejects on Windows.
        staged = []
        for offset, old_name in enumerate(old_names):
            tmp_name = '__renaming_{}_{}__'.format(os.getpid(), offset)
            os.rename(os.path.join(file_path, old_name),
                      os.path.join(file_path, tmp_name))
            staged.append((old_name, tmp_name))

        # Pass 2: give every parked file its final, numbered name.
        for offset, (old_name, tmp_name) in enumerate(staged):
            new_name = 'IMG_' + str(index_start + offset).zfill(4) + '.jpg'
            os.rename(os.path.join(file_path, tmp_name),
                      os.path.join(file_path, new_name))
            print(old_name, "========>", new_name)
        print("Successful to rename filename!")

    @staticmethod
    def rename_key(json_path=None, index_start=None):
        """Rename every top-level JSON key to ``IMG_<4-digit index>.jpg``.

        The ``filename`` attribute of each entry is set to the same value and
        the file is rewritten in place. Entries keep their original order.

        :param json_path: JSON file to rewrite; defaults to MODIFY_JSON_NAME.
        :param index_start: index of the first entry; defaults to INDEX_START.
        """
        if json_path is None:
            json_path = MODIFY_JSON_NAME
        if index_start is None:
            index_start = INDEX_START

        with open(json_path, 'r', encoding='utf-8') as f:
            json_dict = json.load(f)

        re_dict = {}
        for offset, entry in enumerate(json_dict.values()):
            rename = 'IMG_' + str(index_start + offset).zfill(4) + '.jpg'
            entry['filename'] = rename
            re_dict[rename] = entry

        with open(json_path, 'w', encoding='utf-8') as f:
            f.write(json.dumps(re_dict))

    @staticmethod
    def remark_label(old_label='1315', new_label='0812', json_path=None):
        """Replace a label value in every region of an annotation file.

        Prints characters 4..7 of the key of each entry that is changed and
        rewrites the file in place.

        :param old_label: label value to look for.
        :param new_label: value written instead of ``old_label``.
        :param json_path: JSON file to rewrite; defaults to ORIGIN_JSON_NAME.
        """
        if json_path is None:
            json_path = ORIGIN_JSON_NAME

        with open(json_path, 'r', encoding='utf-8') as f:
            json_dict = json.load(f)
        print("修改前的字典顺序:\n", json_dict.keys())

        for k, val in json_dict.items():
            # Walk the region dict itself instead of assuming its keys are
            # exactly "0".."n-1"; a gap in the numbering would raise KeyError.
            for region in val['regions'].values():
                attributes = region['region_attributes']
                if attributes['label'] == old_label:
                    attributes['label'] = new_label
                    print(k[4: 8])

        with open(json_path, 'w', encoding='utf-8') as f:
            print("修改后的字典顺序:\n", json_dict.keys())
            f.write(json.dumps(json_dict))


class Check(object):
    """Validate a new annotation batch and merge it into the master JSON.

    Methods:
        move_file(): move the checked images into the main image folder.
        check_label(): validate every label of MODIFY_JSON_NAME and, when all
            are valid, merge the batch into ORIGIN_JSON_NAME.
        look_index(idx): list the images containing a given label.
    """

    @staticmethod
    def move_file(src_dir=None, dst_dir=None):
        """Move every entry of ``src_dir`` into ``dst_dir``.

        :param src_dir: source folder; defaults to MODIFY_FILE_PATH.
        :param dst_dir: destination folder, created when missing; defaults to
            MOVE_FILE_PATH.
        """
        if src_dir is None:
            src_dir = MODIFY_FILE_PATH
        if dst_dir is None:
            dst_dir = MOVE_FILE_PATH

        # Without an existing destination folder shutil.move cannot place the
        # files *inside* it.
        os.makedirs(dst_dir, exist_ok=True)
        for name in os.listdir(src_dir):
            shutil.move(os.path.join(src_dir, name), dst_dir)

    @staticmethod
    def _is_valid_label(label, max_label):
        """Return True for a 3-character string whose int value is in 1..max_label."""
        if not isinstance(label, str) or len(label) != 3:
            return False
        try:
            return 1 <= int(label) <= max_label
        except ValueError:
            # Three characters that are not a number, e.g. 'abc'.
            return False

    @staticmethod
    def check_label():
        """Check that every label is 3 characters long and within the menu range.

        The upper bound is the largest ``Id`` of MENU_EXCEL_NAME. When every
        region of MODIFY_JSON_NAME passes, its entries are merged into
        ORIGIN_JSON_NAME and the images are moved with ``move_file()``;
        otherwise the offending regions are printed and nothing is modified.

        Reported items are ``<key[4:8]><region id>`` for an invalid label and
        ``<key[4:8]>_<region id>`` for a region without a label.
        """
        data = pd.read_excel(MENU_EXCEL_NAME, skiprows=1)
        max_label = max(data['Id'])

        with open(MODIFY_JSON_NAME, 'r', encoding='utf-8') as f:
            json_modify = json.load(f)

        modifies = []
        for k, val in json_modify.items():
            for region_id, region in val['regions'].items():
                try:
                    label = region['region_attributes']['label']
                except KeyError:
                    modifies.append(k[4: 8] + '_' + region_id)
                    continue
                if not Check._is_valid_label(label, max_label):
                    modifies.append(k[4: 8] + region_id)

        if modifies:
            print("需要修正的图片序号为:\n", modifies)
            return

        print("所有图片均符合要求!")
        with open(ORIGIN_JSON_NAME, 'r', encoding='utf-8') as f:
            json_origin = json.load(f)
        # Entries of the new batch overwrite entries with the same key.
        json_origin.update(json_modify)
        with open(ORIGIN_JSON_NAME, 'w', encoding='utf-8') as f:
            f.write(json.dumps(json_origin))
        Check.move_file()

    @staticmethod
    def look_index(idx, json_path=None):
        """Find the images that contain a given label.

        :param idx: label value to search for.
        :param json_path: annotation file to search; defaults to
            ORIGIN_JSON_NAME.
        :return: list with characters 4..7 of the key of every matching
            region (an image appears once per matching region). The list is
            also printed.
        """
        if json_path is None:
            json_path = ORIGIN_JSON_NAME

        with open(json_path, 'r', encoding='utf-8') as f:
            json_dict = json.load(f)

        indexs = [
            k[4: 8]
            for k, val in json_dict.items()
            for region in val['regions'].values()
            if region['region_attributes']['label'] == idx
        ]

        print("包含标签值为{}的图片编号为:".format(idx))
        print(indexs)
        return indexs


class Operate(object):
    """Dataset-level operations: splitting, statistics, plotting, filtering.

    Methods:
        shuffler_data(): split the master annotations and images 8:1:1 into
            train / val / test.
        feature_map(): return the {label: name} and {label: price} dicts.
        counts(name_dict): count the occurrences of every label per split.
        visualize(...): plot the counts and return the selected labels.
        filter_data(json_path, filter_list, types): keep only given labels.
        get_label(lists): run filter_data on the three splits.
        delete_photo(file_path, file_name_list): delete the listed images.
        choose_photo(file_name_list): copy the images that are not listed.
        divide_train_and_val(start, end): extract a range of entries.
        print_name(l, name_dict): print the names matching some labels.
        turn_chinese(write_path): replace label values by dish names.
    """

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _collect_labels(json_path):
        """Return the label of every region in ``json_path``, in file order."""
        with open(json_path, 'r', encoding='utf-8') as f:
            json_dict = json.load(f, object_pairs_hook=OrderedDict)
        return [
            region['region_attributes']['label']
            for entry in json_dict.values()
            for region in entry['regions'].values()
        ]

    @staticmethod
    def _count_labels(labels):
        """Return {label: occurrences}, keyed in first-appearance order."""
        occurrences = {}
        for label in labels:
            occurrences[label] = occurrences.get(label, 0) + 1
        return occurrences

    @staticmethod
    def _file_stems(file_name_list):
        """Return the names of ``file_name_list`` with their extension removed."""
        return {os.path.splitext(name)[0] for name in file_name_list}

    @staticmethod
    def _plot_distribution(num_per_food, name_dict, title):
        """Draw, save and show one horizontal bar chart of label counts."""
        keys = [name_dict[label] for label, _ in num_per_food]
        values = [count for _, count in num_per_food]
        today = str(datetime.date.today())

        plt.figure(figsize=(12, 12))
        plt.barh(keys, values, color='steelblue', alpha=0.8)
        plt.yticks(fontsize=5)
        plt.title(today + title)
        plt.ylabel(u'菜品种类')
        plt.xlabel(u'菜品数量')
        plt.tight_layout()
        # Write the count next to the end of each bar.
        for x, y in enumerate(values):
            plt.text(y + 0.3, x - 0.3, '%s' % y, fontsize=8)
        plt.savefig('./statistics_graph/' + today + title + '.png')
        plt.show()

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------

    @staticmethod
    def shuffler_data():
        """Split ORIGIN_JSON_NAME and its images into train / val / test.

        Out of every ten entries the 3rd goes to ``./val``, the 7th to
        ``./test`` and the rest to ``./train``. Each folder receives a
        ``via_region_data.json`` and a copy of the image of every entry it
        holds, so images and annotations always land in the same split.
        An entry whose image cannot be found in MOVE_FILE_PATH is reported
        and skipped.
        """
        with open(ORIGIN_JSON_NAME, 'r', encoding='utf-8') as f:
            json_dict = json.load(f)

        splits = {'./train/': {}, './val/': {}, './test/': {}}
        for cnt, (k, val) in enumerate(json_dict.items(), 1):
            if cnt % 10 == 3:
                target = './val/'
            elif cnt % 10 == 7:
                target = './test/'
            else:
                target = './train/'
            splits[target][k] = val

        for target, subset in splits.items():
            os.makedirs(target, exist_ok=True)
            with open(target + 'via_region_data.json', 'w', encoding='utf-8') as f:
                f.write(json.dumps(subset, ensure_ascii=False))

            # Copy the image that belongs to each annotation rather than
            # counting through os.listdir(), whose order is unrelated to the
            # order of the JSON entries.
            for k, val in subset.items():
                candidates = [val.get('filename', k), k]
                for image_name in candidates:
                    source = os.path.join(MOVE_FILE_PATH, image_name)
                    if os.path.isfile(source):
                        shutil.copy(source, target)
                        break
                else:
                    print("Warning: no image found in {} for entry {}".format(MOVE_FILE_PATH, k))

    @staticmethod
    def feature_map():
        """Read the menu workbook and build the label look-up tables.

        :return: ``(na_dict, pri_map)``: dish name per label and price per
            label; labels are the ``Id`` column zero-padded to 3 characters.
        """
        data = pd.read_excel(MENU_EXCEL_NAME, skiprows=1)
        labels = [str(label).zfill(3) for label in data['Id']]  # label values
        names = list(data['Name'])  # dish names
        prices = list(data['Price'])  # prices
        print("总标签值:\n", labels)
        print("总菜品名:\n", names)
        print("价格:\n", prices)
        print("-" * 200)

        na_dict = dict(zip(labels, names))
        pri_map = dict(zip(labels, prices))
        return na_dict, pri_map

    @staticmethod
    def counts(name_dict):
        """Count how often every label occurs in the train / val / test splits.

        Reads ``via_region_data.json`` from ``./train``, ``./val`` and
        ``./test`` and prints the statistics of each split.

        :param name_dict: {label: dish name}, as returned by feature_map().
        :return: ``(num_per_food_train, num_per_food_val, num_per_food_test,
            names_val, names_test)``: the first three are lists of
            ``(label, count)`` sorted by ascending count, the last two are the
            dish names present in the validation and test splits.
        """
        value_cut_train = Operate._count_labels(
            Operate._collect_labels('./train/via_region_data.json'))
        value_cut_val = Operate._count_labels(
            Operate._collect_labels('./val/via_region_data.json'))
        value_cut_test = Operate._count_labels(
            Operate._collect_labels('./test/via_region_data.json'))

        num_per_food_train = sorted(value_cut_train.items(), key=lambda v: v[1])
        num_per_food_val = sorted(value_cut_val.items(), key=lambda v: v[1])
        num_per_food_test = sorted(value_cut_test.items(), key=lambda v: v[1])

        print('num_per_food_train:\n', num_per_food_train)  # counts per dish, training split
        print('num_per_food_val:\n', num_per_food_val)  # counts per dish, validation split
        print('num_per_food_test:\n', num_per_food_test)  # counts per dish, test split
        print('-' * 100)

        exist_labels_train = list(value_cut_train)
        exist_labels_val = list(value_cut_val)
        exist_labels_test = list(value_cut_test)

        print("exist_labels_train:\n", exist_labels_train)
        print("exist_labels_val:\n", exist_labels_val)
        print("exist_labels_test:\n", exist_labels_test)
        print('-' * 100)

        names_train = ['背景'] + [name_dict[label] for label in exist_labels_train]
        print("训练数据集中已有的菜品种类(含背景):\n", names_train)
        print("训练数据集中总的菜品数量为: ", len(exist_labels_train))
        print('-' * 100)

        names_val = [name_dict[label] for label in exist_labels_val]
        print("验证数据集中已有的菜品种类:\n", names_val)
        print("验证数据集中总的菜品数量为: ", len(exist_labels_val))

        names_test = [name_dict[label] for label in exist_labels_test]
        print("测试数据集中已有的菜品种类:\n", names_test)
        print("测试数据集中总的菜品数量为: ", len(exist_labels_test))

        return num_per_food_train, num_per_food_val, num_per_food_test, names_val, names_test

    @staticmethod
    def visualize(train_dict, val_dict, test_dict, name_dict, nv, nt):
        """Plot the label distribution of each split and select frequent labels.

        One bar chart per split is saved under ``./statistics_graph/`` (the
        folder is created when missing) and shown.

        :param train_dict: ``(label, count)`` pairs of the training split.
        :param val_dict: ``(label, count)`` pairs of the validation split.
        :param test_dict: ``(label, count)`` pairs of the test split.
        :param name_dict: {label: dish name}, as returned by feature_map().
        :param nv: dish names present in the validation split.
        :param nt: dish names present in the test split.
        :return: labels whose training count is at least MORE_THAN_NUM.
        """
        labels = [label for label, count in train_dict if count >= MORE_THAN_NUM]
        names = ['背景'] + [name_dict[label] for label in labels]
        print('训练数据集中样本数超过' + str(MORE_THAN_NUM) + '的标签值有{}种,即\n{}'.format(len(labels), labels))
        print("对应的中文标签为:", names)

        # Selected dishes that never show up in the validation / test split.
        list_nv = [n for n in names if n != '背景' and n not in nv]
        list_nt = [n for n in names if n != '背景' and n not in nt]
        if list_nv:
            print("验证集中缺少目标菜品有:", list_nv)
        else:
            print("验证集中覆盖了所有的目标")
        if list_nt:
            print("测试集中缺少目标菜品有:", list_nt)
        else:
            print("测试集中覆盖了所有的目标")

        os.makedirs('./statistics_graph/', exist_ok=True)
        Operate._plot_distribution(train_dict, name_dict, u"训练数据菜品数量分布图")
        Operate._plot_distribution(val_dict, name_dict, u"验证数据菜品数量分布图")
        Operate._plot_distribution(test_dict, name_dict, u"测试数据菜品数量分布图")

        return labels

    @staticmethod
    def filter_data(json_path, filter_list, types):
        """Keep only the regions whose label is in ``filter_list``.

        The file is rewritten in place. Entries left without any region are
        dropped, and the surviving regions of each entry are renumbered
        ``"0".."n-1"`` so the file can be processed again by the other
        methods of this module.

        :param json_path: annotation file to filter.
        :param filter_list: labels to keep.
        :param types: split name, only used in the printed summary.
        :return: keys of the entries that were dropped.
        """
        with open(json_path, 'r', encoding='utf-8') as f:
            json_dict = json.load(f)

        choice_dict = {}
        idx = []
        for k, val in json_dict.items():
            kept = [
                region for region in val["regions"].values()
                if region["region_attributes"]["label"] in filter_list
            ]
            if kept:
                val["regions"] = {str(i): region for i, region in enumerate(kept)}
                choice_dict[k] = val
            else:
                idx.append(k)

        print("共有{}数据{}张".format(types, (len(json_dict) - len(idx))))
        print("需要删除的图片序号:\n", idx)
        print("-" * 100)

        with open(json_path, 'w', encoding='utf-8') as f:
            f.write(json.dumps(choice_dict))

        return idx

    @staticmethod
    def get_label(lists):
        """Filter the three splits down to the given labels.

        :param lists: labels to keep, as returned by visualize().
        :return: ``(train, val, test)`` lists of dropped entry keys, i.e. the
            images that can be deleted from each split.
        """
        idx_train = Operate.filter_data(json_path='./train/via_region_data.json', filter_list=lists, types="训练")
        idx_val = Operate.filter_data(json_path='./val/via_region_data.json', filter_list=lists, types="验证")
        idx_test = Operate.filter_data(json_path='./test/via_region_data.json', filter_list=lists, types="测试")

        return idx_train, idx_val, idx_test

    @staticmethod
    def delete_photo(file_path, file_name_list):
        """Delete the listed images from a folder.

        A file is deleted when its name without extension equals a list item
        without extension, so both ``IMG_0001`` and ``IMG_0001.jpg`` match
        the file ``IMG_0001.jpg``.

        :param file_path: folder to clean, with or without a trailing slash.
        :param file_name_list: names to delete, as returned by get_label().
        """
        targets = Operate._file_stems(file_name_list)
        for f in os.listdir(file_path):
            if os.path.splitext(f)[0] in targets:
                os.remove(os.path.join(file_path, f))
                print("Success to delete picture {} !".format(f))

    @staticmethod
    def choose_photo(file_name_list):
        """Copy the images of ``./train`` that are NOT listed to OUTPUT_FILE_PATH.

        Images are copied from MOVE_FILE_PATH; names found in ``./train`` that
        do not exist there (for instance the annotation JSON) are skipped.

        :param file_name_list: names to leave out, as returned by get_label().
        """
        excluded = Operate._file_stems(file_name_list)
        os.makedirs(OUTPUT_FILE_PATH, exist_ok=True)
        for f in os.listdir('./train'):
            source = os.path.join(MOVE_FILE_PATH, f)
            if os.path.splitext(f)[0] in excluded or not os.path.isfile(source):
                continue
            shutil.copyfile(source, os.path.join(OUTPUT_FILE_PATH, f))
            print("Success to copy picture {} to output_file directory!".format(f[0: 8]))

    @staticmethod
    def divide_train_and_val(start, end):
        """Write the entries numbered ``start..end`` of OUTPUT_JSON_NAME.

        Entries are numbered from 1 in file order and both bounds are
        inclusive; the selection is written to TRAIN_VAL_DIVIDE_JSON_NAME.

        :param start: first entry number to keep.
        :param end: last entry number to keep.
        """
        with open(OUTPUT_JSON_NAME, 'r', encoding='utf-8') as f:
            json_dict = json.load(f)

        via = {
            k: val
            for cnt, (k, val) in enumerate(json_dict.items(), 1)
            if start <= cnt <= end
        }

        with open(TRAIN_VAL_DIVIDE_JSON_NAME, 'w', encoding='utf-8') as f:
            f.write(json.dumps(via, ensure_ascii=False))

    @staticmethod
    def print_name(l, name_dict):
        """Print the dish names matching the labels in ``l``."""
        ln = [name_dict[label] for label in l]
        print("**30**:\n", ln)

    @staticmethod
    def turn_chinese(write_path):
        """Replace every label value of OUTPUT_JSON_NAME by its dish name.

        The menu columns are looked up as ``序号`` / ``菜名`` first and as
        ``Id`` / ``Name`` (the columns read by feature_map) otherwise.

        :param write_path: file receiving the converted annotations.
        """
        data = pd.read_excel(MENU_EXCEL_NAME, skiprows=1)
        id_column = '序号' if '序号' in data.columns else 'Id'
        name_column = '菜名' if '菜名' in data.columns else 'Name'
        name_dict = {
            str(label).zfill(3): name
            for label, name in zip(data[id_column], data[name_column])
        }

        with open(OUTPUT_JSON_NAME, 'r', encoding='utf-8') as f:
            json_dict = json.load(f)
        for val in json_dict.values():
            for region in val['regions'].values():
                attributes = region['region_attributes']
                attributes['label'] = name_dict[attributes['label']]

        with open(write_path, 'w', encoding="utf-8") as f:
            print(json_dict.keys())
            f.write(json.dumps(json_dict, ensure_ascii=False))


def main():
    """Script entry point.

    Uncomment the group of steps below that matches the stage of the
    workflow, run the script, then comment the steps out again.
    """
    # --- Stage 1: import a new batch of annotated images ---
    # modify = Modify()
    # modify.rename_key()
    # modify.rename_file_name()
    # Check().check_label()

    # --- Stage 2: split, analyse and filter the whole data set ---
    # operate = Operate()
    # operate.shuffler_data()
    # label_name, _ = operate.feature_map()
    # number_per_food_train, number_per_food_val, number_per_food_test, nv, nt = operate.counts(label_name)
    # choose_label = \
    #     operate.visualize(number_per_food_train, number_per_food_val, number_per_food_test, label_name, nv, nt)
    # train_list, val_list, test_list = operate.get_label(choose_label)
    # operate.delete_photo('./train', train_list)
    # operate.delete_photo('./val', val_list)
    # operate.delete_photo('./test', test_list)

    done_message = 'Success Running!'
    print(done_message)


if __name__ == '__main__':
    main()

有问题欢迎添加微信号:cv_huber或扫描关注以下二维码,备注“CSDN”,了解每日AI最新资讯。
在这里插入图片描述

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值