# 说明:读取图像数据的文件名,根据文件名对图像进行分类;分类时需要在csv文件中查找图像名对应的记录。用到的有glob、os,还有处理数据的pandas工具。
import glob
import os
import pandas as pd
import shutil
import os.path as op
#不动原来的文件,避免数据混乱
#只需要在原来的文件中创建三个文件
#失败的
#封面
#不是封面
#分别放进去就可以了
def main():
    """Sort the images under ``mnt`` into per-directory category folders.

    Collects every ``*.jpg`` located exactly nine levels below ``mnt``,
    loads ``templates.csv`` (read without a header row, columns named
    ``pathid, imageid, bookid, group, isdel``) and hands both to
    ``split_file``.

    Classification rules, as recorded in the original notes for this script:

    * success (the image id is listed in the csv):
      ``group == 1`` is a cover, any other group number is an inside page.
    * failure (the image id is not listed in the csv):
      an id of ``-1`` means the image was detected as "not a book";
      any other id is system-problem data that belongs to no category.
    """
    file_path = 'mnt'
    image_paths = glob.glob(os.path.join(file_path, '*/*/*/*/*/*/*/*/*.jpg'))

    # One-off setup step, kept disabled because the category folders are
    # expected to exist already:
    #   generate_file(glob.glob(os.path.join(file_path, '*/*/*/*/*/*/*/*')))
    # It creates the cover / inside_pages / nobooks folders inside every
    # image directory.

    # read_csv already returns a DataFrame, so no extra wrapping is needed.
    match_df = pd.read_csv(
        'templates.csv',
        names=['pathid', 'imageid', 'bookid', 'group', 'isdel'],
    )
    split_file(image_paths, match_df)
def generate_file(image_paths):
    """Create the three category folders inside every given directory.

    Args:
        image_paths: iterable of directory paths; each one receives the
            sub-folders ``cover``, ``inside_pages`` and ``nobooks``.

    The call is idempotent: folders that already exist are left untouched,
    so the function can safely be run again on a partially prepared tree.

    Raises:
        FileExistsError: if one of the folder names is already taken by a
            regular file.
    """
    names = ['cover', 'inside_pages', 'nobooks']
    for image_file in image_paths:
        for n in names:
            # exist_ok avoids the FileExistsError a plain mkdir raises on re-runs.
            os.makedirs(os.path.join(image_file, n), exist_ok=True)
def _group_by_image_id(match):
    """Map each unambiguous image id (as a string) to its ``group`` value."""
    known = match[match['imageid'].notna()]
    # Ids taken from file names are strings, while pandas may have parsed the
    # csv column as integers; compare on the string form so both can match.
    ids = known['imageid'].astype(str)
    # An id listed on several rows is ambiguous and is treated as unknown.
    unique = (~ids.duplicated(keep=False)).to_numpy()
    return dict(zip(ids[unique], known['group'][unique]))


def split_file(paths, match):
    """Move every image into the category folder next to it.

    The image id is the fourth ``_``-separated field of the path. An id of
    ``-1`` sends the image to ``nobooks``. Any other id is looked up in the
    ``imageid`` column of ``match``: ``group == 1`` sends the image to
    ``cover``, every other group value to ``inside_pages``. Images whose id
    is missing from ``match`` (or listed more than once) stay where they are,
    as do images whose parent folder already carries the target name.

    Args:
        paths: iterable of image file paths.
        match: DataFrame with at least the columns ``imageid`` and ``group``.

    The category folder is created on demand. Without it, ``shutil.move``
    would rename the image to a file called e.g. ``cover`` and every further
    image of that directory would overwrite it.

    Paths that cannot be processed (no id field in the name, or the file has
    vanished) are printed and skipped.
    """
    names = ['cover', 'inside_pages', 'nobooks']
    # Build the id -> group table once instead of filtering the frame per image.
    group_of = _group_by_image_id(match)

    for image_name in paths:
        # NOTE(review): the split runs over the whole path, so an underscore
        # in a directory name shifts the field index - confirm the layout.
        try:
            imageid_i = image_name.split('_')[3]
        except IndexError:
            print(image_name)
            continue

        if imageid_i == '-1':  # detected as "not a book"
            category = names[2]
        elif imageid_i in group_of:
            # group 1 marks a cover, everything else is an inside page
            category = names[0] if group_of[imageid_i] == 1 else names[1]
        else:
            continue  # unknown id: leave the image untouched

        folder = os.path.dirname(image_name)
        if os.path.basename(folder) == category:
            continue  # already sorted

        target_dir = os.path.join(folder, category)
        try:
            if not os.path.isdir(target_dir):
                os.mkdir(target_dir)
            # Moving into the directory keeps the original file name and makes
            # shutil refuse to overwrite an existing file of the same name.
            shutil.move(image_name, target_dir)
        except FileNotFoundError:
            print(image_name)
# Run the sorting job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()