【数据集分类】使用python按照JSON标签对数据集进行分类

最新推荐文章于 2024-04-20 15:12:20 发布

霜狼氏族2000

最新推荐文章于 2024-04-20 15:12:20 发布

阅读量1.5k

点赞数 1

分类专栏： python 深度学习文章标签： python json 分类

本文链接：https://blog.csdn.net/qq_46119062/article/details/129069951

版权

python 同时被 2 个专栏收录

3 篇文章 0 订阅

订阅专栏

深度学习

3 篇文章 0 订阅

订阅专栏

我们所使用的数据集为IStego100K，训练集中包含100000张图像，其中有33404张是用nsf5算法嵌密的图片，需要根据数据集提供的标签数据将这些图片提取出来。

标签大致如下：

parameters={
    "000001.jpg":{ # parameters for stego-file    
        "quality": 95,  # quality factor
        "rate": 0.4, # embedding rate (payload)
        "steg_algorithm": "nsf5" # steganographic algorithm
     },
     "000002.jpg":{ # parameters for cover-file
       "quality": 90 # quality factor
     }
}

首先我们需要借助工具将json数据转化为csv文件，代码如下：

import sys
import json
import csv
import xlsxwriter

# Parse the command line into three module-level settings:
#   fileDir - path of the JSON file to convert
#   outFile - base name (no extension) of the .csv/.xlsx files to create
#   verbose - True when a verbose flag was given
import os

out = []

# Only these exact tokens count as the verbose flag. A substring test such as
# `"-v" in arg` would also match file names like "data-v2.json".
_VERBOSE_FLAGS = ("-v", "--verbose")

_USAGE_LINES = (
    "Correct usage is : python jsontoexcel.py <Path to json> <Path and output files' base name(optional)>",
    "Example : python jsontoexcel.py myfile.json",
    "will output two files: myfile.csv myfile.xlsx\n",
    "Whereas : python jsontoexcel.py myfile.json output",
    "will output two files: output.csv output.xlsx",
    "you can use the -v flag for verbose output",
)


def _print_usage_and_exit():
    """Print the usage text and terminate the script with exit status -1."""
    for usage_line in _USAGE_LINES:
        print(usage_line)
    sys.exit(-1)


#parse input parameters
_cli_args = sys.argv[1:]
verbose = any(cli_arg in _VERBOSE_FLAGS for cli_arg in _cli_args)
_positional = [cli_arg for cli_arg in _cli_args if cli_arg not in _VERBOSE_FLAGS]
if verbose:
    # Strip the flag from sys.argv as before, but without mutating the list
    # while it is being iterated.
    sys.argv[1:] = _positional

if len(_positional) == 1:
    fileDir = _positional[0]
    # Default output base: the input path minus its final extension.
    # splitext keeps leading "./" or "../" components intact, whereas
    # split(".")[0] collapsed such paths to an empty base name.
    outFile = os.path.splitext(fileDir)[0]
elif len(_positional) == 2:
    fileDir, outFile = _positional
else:
    # Always stop here: continuing without fileDir/outFile would only fail
    # later with a NameError.
    _print_usage_and_exit()

#flattens a tree object consisted of dictionaries and lists
def flatten_json(y):
    """Flatten every top-level entry of a parsed JSON tree into one table row.

    If *y* is a dict, its values are the entries (the keys themselves are not
    emitted); if it is a list, its items are the entries. Each entry is passed
    to ``flatten`` and its leaf values become one row.

    Returns a list of rows. The first row is the header: the longest label
    list produced by any entry. The remaining rows hold the entries' leaf
    values in traversal order; they are not re-aligned against the header.
    When there are no entries (empty dict/list, or a scalar root) the result
    is ``[[]]`` - an empty header and no data rows - instead of an error.
    """
    print("flattening json file recursivelly")
    # Retained from the original implementation: resets the module-level
    # counter. Nothing in this function reads it.
    global count
    count = 0

    # Pick the top-level entries once so dicts and lists share a single loop.
    if type(y) is dict:
        entries = y.values()
    elif isinstance(y, list):
        entries = y
    else:
        entries = ()

    rows = []
    labels = []
    #flatten each row of the root list
    for entry in entries:
        values, entry_labels, _leaf_count = flatten(entry, ' ')
        if verbose:
            print("Sub tree:" + str(values))
        labels.append(entry_labels)
        rows.append(values)

    #find max path in json tree
    # default=[] keeps max() from raising ValueError when there are no entries.
    header = max(labels, key=len, default=[])
    if verbose:
        print("labels:" + str([header]))
    table = [header] + rows
    if verbose:
        print(table)
    return table

labels = []


#explore a tree with recursion and flatten to list
def flatten(x, name):
    """Walk a nested dict/list structure depth-first and collect its leaves.

    x    -- the current node: a plain dict, a list, or any other value
            (treated as a leaf).
    name -- the path prefix accumulated so far; every dict key or list index
            on the way down is appended followed by '/'.

    Returns a tuple (values, paths, leaf_count): the leaf values in traversal
    order, the path string of each leaf, and the number of leaves found.
    """
    # Work out the (path segment, child) pairs of a container node; anything
    # that is neither a plain dict nor a list is a leaf.
    if type(x) is dict:
        children = [(key, x[key]) for key in x]
    elif isinstance(x, list):
        children = [(str(position), item) for position, item in enumerate(x)]
    else:
        return [x], [name], 1

    values = []
    paths = []
    leaf_count = 0
    for segment, child in children:
        sub_values, sub_paths, sub_count = flatten(child, name + segment + '/')
        values.extend(sub_values)
        paths.extend(sub_paths)
        leaf_count += sub_count
    return values, paths, leaf_count

#open json file
print("Loading json file")
with open(fileDir, encoding='utf-8', newline='') as file:
    # Parse straight from the file object; stripping newlines first is
    # unnecessary because JSON treats them as insignificant whitespace.
    all_data = json.load(file)
# Dumping the whole data set is debugging output, so it is only shown in
# verbose mode, like the other diagnostic prints.
if verbose:
    print(all_data)

#flatten data
flat = flatten_json(all_data)

#create csv with flattened data
print("Saving data as "+outFile+".csv")
# The with-block guarantees the CSV is flushed and closed even if a row
# fails to serialise.
with open(outFile+".csv", 'w', newline='') as data_csv:
    # "SEP=," is written as the first line of the file, ahead of the rows
    # (presumably a separator hint for spreadsheet software - confirm).
    data_csv.write('SEP=,\n')
    csv.writer(data_csv).writerows(flat)


#save data as xlsx
print("Saving data as "+outFile +'.xlsx')
workbook = xlsxwriter.Workbook(outFile +'.xlsx',)
worksheet = workbook.add_worksheet()

bold = workbook.add_format({'bold': True})

for r, row in enumerate(flat):
    for c, col in enumerate(row):
        if r==0:
            # First row is the header produced by flatten_json.
            worksheet.write(r, c, col, bold)
        else:
            # Data cells are stored as text, whatever their JSON type was.
            worksheet.write_string(r, c, str(col))

workbook.close()

print("Successfully created files:"+outFile +'.xlsx , ' + outFile+".csv" )

运行命令：python JsonToExcel.py <Path to json> <Path and output files names(optional)>

此时生成如下csv文件
在这里插入图片描述

再通过关键字匹配找到steg_algorithm=“nsf5”的图像id，将其对应图像提取出来，代码如下：

import os
import shutil
import pandas as pd
import random

# Move every image labelled with the "nsf5" steganography algorithm into a
# dedicated sub-directory of the data set folder.

CSV_PATH = "C:/Users/hp/PycharmProjects/pythonProject2/train.csv"
IMAGE_DIR = 'D:/实验资源/IStego100K/'
TARGET_DIR = IMAGE_DIR + '1'

# Let pandas open and close the CSV itself; this also avoids shadowing the
# builtin `list` with the loaded table.
labels_df = pd.read_csv(CSV_PATH)

# "steg_algorithm" is the label column; "id" is the column holding the
# numeric image id.
nsf5_rows = labels_df[labels_df["steg_algorithm"] == "nsf5"]
image_ids = nsf5_rows["id"].tolist()

# The target directory must exist before moving. Otherwise the first
# shutil.move renames an image to a plain file called "1" and later moves
# either fail or overwrite it.
os.makedirs(TARGET_DIR, exist_ok=True)

for image_id in image_ids:
    # Zero-pad the id to six digits so it matches the image file names.
    file_stem = '{:06d}'.format(image_id)
    print(file_stem)
    shutil.move(IMAGE_DIR + file_stem + '.jpg', TARGET_DIR)
print("完成")