# 以下是项目过程中用到的一些数据清洗、预处理和统计等常规操作,有需要的可以自行借鉴相关模块的代码,转载或引用请注明出处。
###################################################################################
# -.- coding:utf-8 -.-
# __author__ = 'Jack'
# __date__ = 2019/07/09
# 目录下所需文件:
# 1- menu.xlsx: 存放菜单的Id、Name、Price三个属性
# 2- origin.json: 存放原始标注图片的标注信息
# 3- modify.json: 待检查的json文件
# 操作文档说明:
# 1- 拿到文件先抽样检查,确保整体无误,并更新 menu.xlsx
# 2- 将新增图片放入到not_rename_file文件夹下
# 3- 将相应的json文件命名为modify.json放到与data_process.py文件同级的目录下
# 4- 修改相应的超参数设置(INDEX_START),运行以下程序:
# modify = Modify()
# modify.rename_key() # 修改json文件相应属性名
# modify.rename_file_name() # 修改图片文件名
# # modify.remark_label() # 如需修改标签值则调用此函数
# Check().check_label()
# 5- 运行结束后检查origin.json是否有新增标注数据信息,若出现错误,则根据提示进行修正再次重新运行
# 6- 重复步骤1-5,直至将所有批次的新增数据处理完毕,跳转到7-
# 7- 在主程序中将第4-调用到的所有程序注释掉,确保不被运行到
# 8- 修改相应的超参数(MORE_THAN_NUM),并运行以下程序:
# operate = Operate()
# operate.shuffler_data()
# label_name, _ = operate.feature_map()
# number_per_food_train, number_per_food_val, number_per_food_test, nv, nt = operate.counts(label_name)
# choose_label = \
# operate.visualize(number_per_food_train, number_per_food_val, number_per_food_test, label_name, nv, nt)
# train_list, val_list, test_list = operate.get_label(choose_label)
# operate.delete_photo('./train', train_list)
# operate.delete_photo('./val', val_list)
# operate.delete_photo('./test', test_list)
# 执行完以上程序需记录“样本数超过?的标签值有?种,即[...]”这两个数据(外加中文标签),并把train/val/test更新
# 训练程序配置
# (1)在caipinshibie.py文件搜索changes修改相应参数
# (2)在mrcnn文件夹下的model.py文件定位到1711行修改相应的augmentation为None或True
# (3)运行训练文件
###################################################################################
import json
import xmltodict
import pandas as pd
from collections import OrderedDict
import matplotlib.pyplot as plt
import os
import shutil
import datetime
import warnings
# Global runtime settings
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],  # SimHei so Chinese text in figures is not garbled
    'axes.unicode_minus': False,    # render the minus sign correctly
})
warnings.filterwarnings("ignore")  # silence all warnings

# Hyper-parameters and paths
MODIFY_JSON_NAME = 'modify.json'  # JSON annotation file waiting to be checked
ORIGIN_JSON_NAME = 'origin.json'  # JSON annotation file that already exists
OUTPUT_JSON_NAME = 'output.json'  # JSON file obtained after filtering
TRAIN_VAL_DIVIDE_JSON_NAME = 'via_region_data.json'  # JSON file produced by the train/test split
MENU_EXCEL_NAME = 'menu.xlsx'  # the menu; must be kept up to date
MODIFY_FILE_PATH = './not_rename_file/'  # folder of images whose file names still need renaming
MOVE_FILE_PATH = './json_file/'  # folder holding the images that are already processed
OUTPUT_FILE_PATH = './output_file/'  # folder holding the images kept after filtering
INDEX_START = 955  # first sequence number used for newly added images
MORE_THAN_NUM = 70  # keep dishes whose sample count reaches this threshold
class Modify(object):
    """Rename newly added images and the matching annotation entries.

    Methods:
        rename_file_name(): rename every file in a folder to ``IMG_NNNN.jpg``.
        rename_key(): rename the keys and the ``filename`` attribute of every
            entry in a JSON annotation file to ``IMG_NNNN.jpg``.
        remark_label(): replace one label value with another.

    ``rename_file_name`` and ``rename_key`` both number their items in
    ascending order of the *original* file name, so that image N and
    annotation N refer to the same picture.
    """

    @staticmethod
    def rename_file_name(file_path=None, index_start=None):
        """Rename every file in ``file_path`` to ``IMG_NNNN.jpg``.

        :param file_path: folder to process; defaults to ``MODIFY_FILE_PATH``.
        :param index_start: number given to the first file; defaults to
            ``INDEX_START``.
        """
        if file_path is None:
            file_path = MODIFY_FILE_PATH
        if index_start is None:
            index_start = INDEX_START
        # os.listdir() returns entries in arbitrary order; sort so that the
        # numbering is reproducible and matches the order used by rename_key().
        for cnt, old_name in enumerate(sorted(os.listdir(file_path)), start=index_start):
            new_name = 'IMG_' + str(cnt).zfill(4) + '.jpg'
            os.rename(os.path.join(file_path, old_name), os.path.join(file_path, new_name))
            print(old_name, "========>", new_name)
        print("Successful to rename filename!")

    @staticmethod
    def rename_key(json_path=None, index_start=None):
        """Rename the keys of a JSON annotation file to ``IMG_NNNN.jpg``.

        Each entry's ``filename`` attribute is set to the same new name and
        the file is rewritten in place.

        :param json_path: annotation file to rewrite; defaults to
            ``MODIFY_JSON_NAME``.
        :param index_start: number given to the first entry; defaults to
            ``INDEX_START``.
        """
        if json_path is None:
            json_path = MODIFY_JSON_NAME
        if index_start is None:
            index_start = INDEX_START
        with open(json_path, 'r', encoding='utf-8') as f:
            json_dict = json.load(f)
        # Order entries by their original file name (falling back to the key)
        # so the numbering lines up with the sorted folder listing used by
        # rename_file_name().
        ordered = sorted(json_dict.items(), key=lambda item: item[1].get('filename', item[0]))
        re_dict = {}
        for cnt, (_, entry) in enumerate(ordered, start=index_start):
            rename = "IMG_" + str(cnt).zfill(4) + '.jpg'
            entry['filename'] = rename
            re_dict[rename] = entry
        # Serialise before opening for writing so a failure cannot leave a
        # truncated annotation file behind.
        payload = json.dumps(re_dict)
        with open(json_path, 'w', encoding='utf-8') as f:
            f.write(payload)

    @staticmethod
    def remark_label(old_label='1315', new_label='0812', json_path=None):
        """Replace every region label equal to ``old_label`` by ``new_label``.

        The four characters ``key[4:8]`` of each affected entry are printed
        and the file is rewritten in place.

        :param old_label: label value to look for.
        :param new_label: label value written instead.
        :param json_path: annotation file to rewrite; defaults to
            ``ORIGIN_JSON_NAME``.
        """
        if json_path is None:
            json_path = ORIGIN_JSON_NAME
        with open(json_path, 'r', encoding='utf-8') as f:
            json_dict = json.load(f)
        print("修改前的字典顺序:\n", json_dict.keys())
        for k, entry in json_dict.items():
            # Iterate the region values directly so gaps in the "0", "1", ...
            # keys do not raise KeyError.
            for region in entry['regions'].values():
                attributes = region['region_attributes']
                if attributes['label'] == old_label:
                    attributes['label'] = new_label
                    print(k[4: 8])
        print("修改后的字典顺序:\n", json_dict.keys())
        payload = json.dumps(json_dict)
        with open(json_path, 'w', encoding='utf-8') as f:
            f.write(payload)
class Check(object):
    """Validate new annotations and merge them into the main annotation file.

    Methods:
        move_file(): move the images from ``MODIFY_FILE_PATH`` into
            ``MOVE_FILE_PATH``.
        check_label(): check that every label is a three-digit number within
            the menu's range; when all are valid, merge ``MODIFY_JSON_NAME``
            into ``ORIGIN_JSON_NAME`` and move the images.
        look_index(idx): list the images that contain a given label.
    """

    @staticmethod
    def move_file():
        """Move every file in ``MODIFY_FILE_PATH`` into ``MOVE_FILE_PATH``."""
        # Create the destination first; otherwise the move fails or renames
        # the file to the destination path itself.
        os.makedirs(MOVE_FILE_PATH, exist_ok=True)
        for file in os.listdir(MODIFY_FILE_PATH):
            shutil.move(os.path.join(MODIFY_FILE_PATH, file), MOVE_FILE_PATH)

    @staticmethod
    def _is_valid_label(label, max_label):
        """Return True if ``label`` is a 3-character decimal string in ``1..max_label``."""
        if not isinstance(label, str) or len(label) != 3 or not label.isdecimal():
            return False
        return 1 <= int(label) <= max_label

    @staticmethod
    def _find_invalid_labels(json_dict, max_label):
        """Collect the regions whose label is missing or malformed.

        :param json_dict: annotation mapping ``{key: {"regions": {...}}}``.
        :param max_label: largest label number that is allowed.
        :return: list of ``"<key[4:8]>_<region key>"`` strings, one per
            offending region; an entry without ``regions`` is reported as
            ``"<key[4:8]>"`` alone.
        """
        invalid = []
        for k, entry in json_dict.items():
            image_no = k[4: 8]
            try:
                regions = entry['regions']
            except KeyError:
                invalid.append(image_no)
                continue
            for index, region in regions.items():
                try:
                    label = region['region_attributes']['label']
                except KeyError:
                    # Region without a label attribute.
                    invalid.append(image_no + '_' + str(index))
                    continue
                if not Check._is_valid_label(label, max_label):
                    invalid.append(image_no + '_' + str(index))
        return invalid

    @staticmethod
    def check_label():
        """Check the labels in ``MODIFY_JSON_NAME`` and merge when all are valid.

        A label is valid when it is a three-digit decimal string whose value
        lies between 1 and the largest ``Id`` in the menu workbook. If every
        label is valid, the entries are merged into ``ORIGIN_JSON_NAME``
        (same keys are overwritten) and the images are moved with
        ``move_file()``; otherwise the offending regions are printed and
        nothing is modified.
        """
        data = pd.read_excel(MENU_EXCEL_NAME, skiprows=1)
        max_label = max(data['Id'])
        with open(MODIFY_JSON_NAME, 'r', encoding='utf-8') as f:
            json_modify = json.load(f)
        modifies = Check._find_invalid_labels(json_modify, max_label)
        if modifies:
            print("需要修正的图片序号为:\n", modifies)
            return
        print("所有图片均符合要求!")
        with open(ORIGIN_JSON_NAME, 'r', encoding='utf-8') as o:
            json_origin = json.load(o)
        json_origin.update(json_modify)
        # Serialise before opening for writing so a failure cannot truncate
        # the main annotation file.
        payload = json.dumps(json_origin)
        with open(ORIGIN_JSON_NAME, 'w', encoding='utf-8') as f:
            f.write(payload)
        Check.move_file()

    @staticmethod
    def look_index(idx, json_path=None):
        """Find the images that contain a region labelled ``idx``.

        :param idx: label value to search for.
        :param json_path: annotation file to search; defaults to
            ``ORIGIN_JSON_NAME``.
        :return: list of ``key[4:8]`` strings, one per matching region (an
            image appears once for each matching region it contains).
        """
        if json_path is None:
            json_path = ORIGIN_JSON_NAME
        with open(json_path, 'r', encoding='utf-8') as f:
            json_dict = json.load(f)
        indexs = [
            k[4: 8]
            for k, entry in json_dict.items()
            for region in entry["regions"].values()
            if region['region_attributes']['label'] == idx
        ]
        print("包含标签值为{}的图片编号为:".format(idx))
        print(indexs)
        return indexs
class Operate(object):
    """Split, count, visualise and filter the annotated data set.

    Methods:
        shuffler_data(): split ``ORIGIN_JSON_NAME`` and its images 8:1:1 into
            ``./train``, ``./val`` and ``./test``.
        feature_map(): return ``{label: name}`` and ``{label: price}``.
        counts(name_dict): count how often each label occurs in each split.
        visualize(...): plot the counts; return the labels kept for training.
        filter_data(json_path, filter_list, types): keep only chosen labels.
        get_label(lists): run ``filter_data`` on all three splits.
        delete_photo(file_path, file_name_list): delete the listed images.
        choose_photo(file_name_list): copy the images that are NOT listed.
        divide_train_and_val(start, end): copy a positional slice of
            ``OUTPUT_JSON_NAME`` into ``TRAIN_VAL_DIVIDE_JSON_NAME``.
        print_name(l, name_dict): print the names belonging to labels.
        turn_chinese(write_path): replace label values by dish names.
    """

    @staticmethod
    def _split_dir(cnt):
        """Return the split folder for the ``cnt``-th (1-based) annotation."""
        if cnt % 10 == 3:
            return './val/'
        if cnt % 10 == 7:
            return './test/'
        return './train/'

    @staticmethod
    def shuffler_data():
        """Split annotations and images into train / val / test (8:1:1).

        Entry number ``cnt`` (1-based, in file order) of ``ORIGIN_JSON_NAME``
        goes to ``./val`` when ``cnt % 10 == 3``, to ``./test`` when
        ``cnt % 10 == 7`` and to ``./train`` otherwise. Each split receives a
        ``via_region_data.json`` plus the images of its own entries.
        """
        with open(ORIGIN_JSON_NAME, 'r', encoding='utf-8') as f:
            json_dict = json.load(f)
        splits = {'./train/': {}, './val/': {}, './test/': {}}
        target_of = {}  # image file name -> split folder
        for cnt, (k, val) in enumerate(json_dict.items(), start=1):
            target = Operate._split_dir(cnt)
            splits[target][k] = val
            filename = val.get('filename', k) if isinstance(val, dict) else k
            target_of[filename] = target
        for target, data in splits.items():
            os.makedirs(target, exist_ok=True)
            with open(os.path.join(target, 'via_region_data.json'), 'w', encoding='utf-8') as f:
                f.write(json.dumps(data, ensure_ascii=False))
        # Route each image by the split of its own annotation rather than by
        # its position in the folder listing, so an image can never end up in
        # a different split than its annotation.
        available = sorted(os.listdir(MOVE_FILE_PATH))
        unmatched = [file for file in available if file not in target_of]
        for file in available:
            if file in target_of:
                shutil.copy(os.path.join(MOVE_FILE_PATH, file), target_of[file])
        missing = sorted(set(target_of) - set(available))
        if unmatched:
            print("Images without annotation (not copied):\n", unmatched)
        if missing:
            print("Annotated images missing from {}:\n".format(MOVE_FILE_PATH), missing)

    @staticmethod
    def feature_map():
        """Read the menu workbook and build the label look-up tables.

        Labels are the ``Id`` column converted to strings and left-padded
        with zeros to three characters.

        :return: ``(na_dict, pri_map)`` - ``{label: Name}`` and
            ``{label: Price}``.
        """
        data = pd.read_excel(MENU_EXCEL_NAME, skiprows=1)
        labels = [str(label).zfill(3) for label in data['Id']]  # label values
        names = list(data['Name'])  # dish names
        prices = list(data['Price'])  # prices
        print("总标签值:\n", labels)
        print("总菜品名:\n", names)
        print("价格:\n", prices)
        print("-" * 200)
        na_dict = dict(zip(labels, names))  # dish name per label
        pri_map = dict(zip(labels, prices))  # dish price per label
        return na_dict, pri_map

    @staticmethod
    def _count_labels(json_path):
        """Return ``{label: occurrences}`` for one annotation file.

        Keys appear in the order in which the labels are first seen.
        """
        # The split files are written as UTF-8 by shuffler_data(); read them
        # back with the same encoding instead of the locale default.
        with open(json_path, 'r', encoding='utf-8') as f:
            json_dict = json.load(f, object_pairs_hook=OrderedDict)
        occurrences = {}
        for entry in json_dict.values():
            for region in entry['regions'].values():
                label = region['region_attributes']['label']
                occurrences[label] = occurrences.get(label, 0) + 1
        return occurrences

    @staticmethod
    def counts(name_dict):
        """Count the dishes present in the train / val / test splits.

        :param name_dict: ``{label: name}`` mapping from ``feature_map()``.
        :return: ``(num_per_food_train, num_per_food_val, num_per_food_test,
            names_val, names_test)`` - for each split a list of
            ``(label, count)`` tuples sorted by ascending count, followed by
            the dish names found in the validation and the test split.
        :raises KeyError: if a label in the data is missing from ``name_dict``.
        """
        value_cut_train = Operate._count_labels('./train/via_region_data.json')
        value_cut_val = Operate._count_labels('./val/via_region_data.json')
        value_cut_test = Operate._count_labels('./test/via_region_data.json')

        num_per_food_train = sorted(value_cut_train.items(), key=lambda v: v[1])
        num_per_food_val = sorted(value_cut_val.items(), key=lambda v: v[1])
        num_per_food_test = sorted(value_cut_test.items(), key=lambda v: v[1])
        print('num_per_food_train:\n', num_per_food_train)  # count per dish, training split
        print('num_per_food_val:\n', num_per_food_val)  # count per dish, validation split
        print('num_per_food_test:\n', num_per_food_test)  # count per dish, test split
        print('-' * 100)

        exist_labels_train = list(value_cut_train)
        exist_labels_val = list(value_cut_val)
        exist_labels_test = list(value_cut_test)
        print("exist_labels_train:\n", exist_labels_train)
        print("exist_labels_val:\n", exist_labels_val)
        print("exist_labels_test:\n", exist_labels_test)
        print('-' * 100)

        # The training list is shown with a leading background class.
        names_train = ['背景'] + [name_dict[label] for label in exist_labels_train]
        print("训练数据集中已有的菜品种类(含背景):\n", names_train)
        print("训练数据集中总的菜品数量为: ", len(exist_labels_train))
        print('-' * 100)

        names_val = [name_dict[label] for label in exist_labels_val]
        print("验证数据集中已有的菜品种类:\n", names_val)
        print("验证数据集中总的菜品数量为: ", len(exist_labels_val))

        names_test = [name_dict[label] for label in exist_labels_test]
        print("测试数据集中已有的菜品种类:\n", names_test)
        print("测试数据集中总的菜品数量为: ", len(exist_labels_test))
        return num_per_food_train, num_per_food_val, num_per_food_test, names_val, names_test

    @staticmethod
    def _plot_distribution(keys, values, title):
        """Draw one horizontal bar chart, save it as PNG and show it.

        The figure is saved to ``./statistics_graph/<today><title>.png``.
        """
        full_title = str(datetime.date.today()) + title
        plt.figure(figsize=(12, 12))
        plt.barh(keys, values, color='steelblue', alpha=0.8)
        plt.yticks(fontsize=5)
        plt.title(full_title)
        plt.ylabel(u'菜品种类')
        plt.xlabel(u'菜品数量')
        plt.tight_layout()
        # Write each count just to the right of its bar.
        for x, y in enumerate(values):
            plt.text(y + 0.3, x - 0.3, '%s' % y, fontsize=8)
        os.makedirs('./statistics_graph/', exist_ok=True)
        plt.savefig('./statistics_graph/' + full_title + '.png')
        plt.show()

    @staticmethod
    def visualize(train_dict, val_dict, test_dict, name_dict, nv, nt):
        """Plot the per-dish counts and select the labels kept for training.

        :param train_dict: ``(label, count)`` tuples of the training split.
        :param val_dict: ``(label, count)`` tuples of the validation split.
        :param test_dict: ``(label, count)`` tuples of the test split.
        :param name_dict: ``{label: name}`` mapping from ``feature_map()``.
        :param nv: dish names present in the validation split.
        :param nt: dish names present in the test split.
        :return: training labels whose count is at least ``MORE_THAN_NUM``.
        """
        # train: only dishes reaching the threshold are selected and plotted
        keys, values, labels, names = [], [], [], ['背景']
        for label, count in train_dict:
            if count >= MORE_THAN_NUM:
                dish = name_dict[label]
                labels.append(label)
                names.append(dish)
                keys.append(dish)
                values.append(count)
        print('训练数据集中样本数超过' + str(MORE_THAN_NUM) + '的标签值有{}种,即\n{}'.format(len(labels), labels))
        print("对应的中文标签为:", names)

        # Report selected dishes that never occur in val / test.
        list_nv = [n for n in names if n != '背景' and n not in nv]
        list_nt = [n for n in names if n != '背景' and n not in nt]
        if list_nv:
            print("验证集中缺少目标菜品有:", list_nv)
        else:
            print("验证集中覆盖了所有的目标")
        if list_nt:
            print("测试集中缺少目标菜品有:", list_nt)
        else:
            print("测试集中覆盖了所有的目标")

        Operate._plot_distribution(keys, values, u"训练数据菜品数量分布图")
        # val / test: every dish is plotted, without threshold
        Operate._plot_distribution(
            [name_dict[label] for label, _ in val_dict],
            [count for _, count in val_dict],
            u"验证数据菜品数量分布图")
        Operate._plot_distribution(
            [name_dict[label] for label, _ in test_dict],
            [count for _, count in test_dict],
            u"测试数据菜品数量分布图")
        return labels

    @staticmethod
    def filter_data(json_path, filter_list, types):
        """Keep only the regions whose label is in ``filter_list``.

        Entries left without any region are dropped. The file at
        ``json_path`` is rewritten in place.

        :param json_path: annotation file to filter.
        :param filter_list: label values to keep.
        :param types: split name, used only in the printed summary.
        :return: keys of the entries that were dropped.
        """
        with open(json_path, 'r', encoding='utf-8') as f:
            json_dict = json.load(f)
        choice_dict = {}
        idx = []
        for k, entry in json_dict.items():
            kept = [region for region in entry["regions"].values()
                    if region["region_attributes"]["label"] in filter_list]
            if kept:
                # Re-number the survivors "0", "1", ... so that code indexing
                # regions by str(position) keeps working after filtering.
                entry["regions"] = {str(i): region for i, region in enumerate(kept)}
                choice_dict[k] = entry
            else:
                idx.append(k)
        print("共有{}数据{}张".format(types, (len(json_dict) - len(idx))))
        print("需要删除的图片序号:\n", idx)
        print("-" * 100)
        payload = json.dumps(choice_dict)
        with open(json_path, 'w', encoding='utf-8') as f:
            f.write(payload)
        return idx

    @staticmethod
    def get_label(lists):
        """Filter all three splits down to the given labels.

        :param lists: labels to keep, as returned by ``visualize()``.
        :return: ``(train, val, test)`` lists of dropped entry keys.
        """
        idx_train = Operate.filter_data(json_path='./train/via_region_data.json', filter_list=lists, types="训练")
        idx_val = Operate.filter_data(json_path='./val/via_region_data.json', filter_list=lists, types="验证")
        idx_test = Operate.filter_data(json_path='./test/via_region_data.json', filter_list=lists, types="测试")
        return idx_train, idx_val, idx_test

    @staticmethod
    def _name_targets(file_name_list):
        """Return the given names plus their extension-less stems as a set."""
        targets = set(file_name_list)
        targets.update(os.path.splitext(name)[0] for name in file_name_list)
        return targets

    @staticmethod
    def delete_photo(file_path, file_name_list):
        """Delete the listed images from ``file_path``.

        :param file_path: folder to clean, with or without trailing slash.
        :param file_name_list: names to delete, as returned by
            ``get_label()``; entries may be given with or without extension.
        """
        targets = Operate._name_targets(file_name_list)
        for f in os.listdir(file_path):
            if f in targets or os.path.splitext(f)[0] in targets:
                # os.path.join: './train' + name would give './trainIMG_...'.
                os.remove(os.path.join(file_path, f))
                print("Success to delete picture {} !".format(f))

    @staticmethod
    def choose_photo(file_name_list):
        """Copy the images that are NOT listed into ``OUTPUT_FILE_PATH``.

        Candidate names come from ``./train``; the files themselves are
        copied from ``MOVE_FILE_PATH``.

        :param file_name_list: names to leave out, as returned by
            ``get_label()``; entries may be given with or without extension.
        """
        targets = Operate._name_targets(file_name_list)
        os.makedirs(OUTPUT_FILE_PATH, exist_ok=True)
        for f in os.listdir('./train'):
            stem = os.path.splitext(f)[0]
            if f in targets or stem in targets:
                continue
            source = os.path.join(MOVE_FILE_PATH, f)
            if not os.path.isfile(source):
                # e.g. via_region_data.json lives in ./train only
                continue
            shutil.copyfile(source, os.path.join(OUTPUT_FILE_PATH, f))
            print("Success to copy picture {} to output_file directory!".format(stem))

    @staticmethod
    def divide_train_and_val(start, end):
        """Copy a positional slice of ``OUTPUT_JSON_NAME`` to a new file.

        Entries are numbered from 1 in file order; those with
        ``start <= number <= end`` are written to
        ``TRAIN_VAL_DIVIDE_JSON_NAME``.

        :param start: first sequence number to keep (inclusive).
        :param end: last sequence number to keep (inclusive).
        """
        with open(OUTPUT_JSON_NAME, 'r', encoding='utf-8') as f:
            json_dict = json.load(f)
        via = {
            k: val
            for cnt, (k, val) in enumerate(json_dict.items(), start=1)
            if start <= cnt <= end
        }
        with open(TRAIN_VAL_DIVIDE_JSON_NAME, 'w', encoding='utf-8') as f:
            f.write(json.dumps(via, ensure_ascii=False))

    @staticmethod
    def print_name(l, name_dict):
        """Print the names that ``name_dict`` maps the labels in ``l`` to."""
        ln = [name_dict[label] for label in l]
        print("**30**:\n", ln)

    @staticmethod
    def turn_chinese(write_path):
        """Replace every label value by its dish name.

        Reads ``OUTPUT_JSON_NAME``, maps each region label through the menu
        workbook and writes the result to ``write_path``.

        :param write_path: destination JSON file.
        :raises KeyError: if a label is not present in the menu.
        """
        data = pd.read_excel(MENU_EXCEL_NAME, skiprows=1)
        # Same columns as feature_map(): the workbook stores Id / Name / Price.
        labels = [str(label).zfill(3) for label in data['Id']]
        name_dict = dict(zip(labels, data['Name']))
        with open(OUTPUT_JSON_NAME, 'r', encoding='utf-8') as f:
            json_dict = json.load(f)
        for entry in json_dict.values():
            for region in entry['regions'].values():
                attributes = region['region_attributes']
                attributes['label'] = name_dict[attributes['label']]
        print(json_dict.keys())
        with open(write_path, 'w', encoding="utf-8") as f:
            f.write(json.dumps(json_dict, ensure_ascii=False))
def main():
    """Run the pipeline stages that are currently enabled.

    Every stage is commented out by default; uncomment the calls for the
    stage being run. Stage 1 is repeated for each new batch of images and
    must be commented out again before stage 2 is run.
    """
    # Stage 1 - rename and validate one new batch of images.
    # modify = Modify()
    # modify.rename_key()
    # modify.rename_file_name()
    # Check().check_label()

    # Stage 2 - split, count, visualise and filter the whole data set.
    # operate = Operate()
    # operate.shuffler_data()
    # label_name, _ = operate.feature_map()
    # number_per_food_train, number_per_food_val, number_per_food_test, nv, nt = operate.counts(label_name)
    # choose_label = \
    #     operate.visualize(number_per_food_train, number_per_food_val, number_per_food_test, label_name, nv, nt)
    # train_list, val_list, test_list = operate.get_label(choose_label)
    # operate.delete_photo('./train', train_list)
    # operate.delete_photo('./val', val_list)
    # operate.delete_photo('./test', test_list)
    print('Success Running!')


if __name__ == '__main__':
    main()
# 有问题欢迎添加微信号:cv_huber,或扫描下方二维码关注,备注“CSDN”,了解每日AI最新资讯。