MHT文件图片提取 Python实现

烧火工～

已于 2023-08-15 11:45:36 修改

阅读量591

点赞数 2

文章标签： python

于 2023-08-15 11:36:58 首次发布

本文链接：https://blog.csdn.net/qq_46165837/article/details/132295585

版权

不多说，直接上代码：脚本会扫描当前目录下的所有 .mht 文件，把每个文件中以 base64 编码的图片提取到与该文件同名的文件夹中（base64 行数不超过阈值的小图会被忽略），然后自动将该文件夹打包为同名的 zip 压缩文件。

import base64
import os
import zipfile

# 1: reuse an already existing output folder without asking;
# 0: prompt on stdin before writing into it.
FILE_EXIST_AUTO_REWRITE = 1 
# 1: name saved images with a zero-padded running index (0001, 0002, ...);
# 0: derive the image name from the part's Content-Location value.
IMG_NAME_REWRITE = 1
# Images whose base64 payload spans this many lines or fewer are treated as
# too small and are not saved.
BASE64_LINE_THRESHOLD = 500


def _read_text_line(file):
    """Read the next line from a binary file object and return it as text."""
    # errors='replace' keeps a part stored in another charset (e.g. 8-bit GBK
    # HTML) from aborting the whole scan with UnicodeDecodeError; the base64
    # and header lines this script cares about are plain ASCII either way.
    return file.readline().decode('utf-8', errors='replace')


def _split_header(line):
    """Tokenize a MIME header line: drop ':' and treat '/' as whitespace."""
    return line.replace(':', '').replace('/', ' ').split()


def saveImg(file_path, folder_path):
    """Extract base64-encoded images from an MHT file into ``folder_path``.

    The file is scanned line by line. Every line that starts with '-' and
    contains 'Boundary' is taken as the start of a MIME part, and the three
    lines that follow are read, in this order, as Content-Type,
    Content-Transfer-Encoding and Content-Location. Parts whose type is
    ``image/*`` and whose encoding is ``base64`` are decoded and written to
    ``folder_path``; images with at most ``BASE64_LINE_THRESHOLD`` payload
    lines are skipped. File naming is controlled by ``IMG_NAME_REWRITE``.

    Args:
        file_path: Path of the MHT file to read.
        folder_path: Existing directory that receives the image files.

    Returns:
        None. The number of saved images is printed to stdout.
    """
    img_dic = dict()      # key: img name,  value: base64 line count
    skipped_dic = dict()  # key: file name, value: transfer encoding
    sifted_dic = dict()   # key: img name,  value: base64 line count
    with open(file_path, 'rb') as file:
        line = _read_text_line(file)

        while line != '':
            # seek to the next part boundary
            line = _read_text_line(file)
            if not ('Boundary' in line and line[0] == '-'):
                continue

            content_type = _split_header(_read_text_line(file))      # Content-Type
            content_encode = _split_header(_read_text_line(file))    # Content-Transfer-Encoding
            content_location = _split_header(_read_text_line(file))  # Content-Location
            if not content_type or not content_encode or not content_location:
                continue
            file_type = content_type[-1]
            file_encode = content_encode[-1]
            file_name = content_location[-1].split('?')[0]
            img_name = ''

            # Guard the index: a header line with a single token must not
            # raise IndexError, it is simply not an image part.
            is_image = len(content_type) > 1 and content_type[1] == 'image'
            if not (is_image and file_encode == 'base64'):
                skipped_dic[file_name] = file_encode
                continue

            file.readline()  # blank line separating the headers from the payload
            # Collect payload pieces in a list and join once instead of
            # growing a string with += on every line.
            chunks = [_read_text_line(file).replace('\n', '')]  # first base64 line
            lines = 0
            while True:
                line = _read_text_line(file)
                lines += 1
                chunks.append(line.replace('\n', ''))
                # A line of at most 2 characters is the blank line ('\r\n')
                # after the payload, or '' at end of file.
                if len(line) <= 2:
                    break

            # img is too small
            if lines <= BASE64_LINE_THRESHOLD:
                sifted_dic[file_name] = lines
                continue

            # img fits the threshold
            if IMG_NAME_REWRITE == 1:
                img_name = str(len(img_dic) + 1).zfill(4) + '.' + file_type  # zero-padded index
            elif IMG_NAME_REWRITE == 0:
                img_name = file_name.split('.')[0] + '.' + file_type
            img_dic[img_name] = lines
            # b64decode ignores the leftover '\r' characters by default.
            base64_decode = base64.b64decode(''.join(chunks))
            with open(os.path.join(folder_path, img_name), "wb") as img:
                img.write(base64_decode)
    # reached the end of file
    print('[Saved Img] %d'%(len(img_dic)))
    # print('[Saved Img] [name:lines]: \n', img_dic)
    # print('[Sifted Img] [name:lines]: \n', sifted_dic)
    # print('Skipped [name:encode]: ', skipped_dic)


def getDirSize(dir):
    """Return the combined size in bytes of every file below ``dir``.

    The directory tree is walked recursively with ``os.walk``; only regular
    directory entries reported as files contribute to the total.
    """
    total_bytes = 0
    for parent, _subdirs, filenames in os.walk(dir):
        for filename in filenames:
            total_bytes += os.path.getsize(os.path.join(parent, filename))
    return total_bytes


def zipImg(folder_path, folder_name):
    """Pack the entries of ``folder_path`` into ``<folder_name>.zip``.

    The archive is created relative to the current working directory and
    uses DEFLATE compression. Each directory entry is stored under its bare
    name, without the folder prefix. Afterwards the raw folder size and the
    archive size are printed in MB.

    Args:
        folder_path: Directory whose entries are added to the archive.
        folder_name: Base name of the archive; '.zip' is appended.
    """
    zip_name = folder_name + '.zip'
    # The context manager closes the archive even if adding an entry fails,
    # so a partially written zip never keeps its file handle open.
    with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as archive:
        for entry in os.listdir(folder_path):
            archive.write(os.path.join(folder_path, entry), entry)
    dir_size = getDirSize(folder_path)
    zip_size = os.path.getsize(zip_name)
    print(r'[Raw/Compressed] %.2f / %.2f MB'%(dir_size/1024/1024, zip_size/1024/1024))


def extractFile(FILE_NAME):
    """Extract and zip the images of one MHT file in the working directory.

    The output folder is named after the file with its last extension
    removed and is created when missing. If it already exists and
    ``FILE_EXIST_AUTO_REWRITE`` is 0, the user is asked for confirmation and
    the function returns without doing anything unless the answer is yes.

    Args:
        FILE_NAME: Name of the MHT file, relative to the current directory.
    """
    CUR_PATH = os.getcwd()
    cur_file = os.path.join(CUR_PATH, FILE_NAME)
    # file does not exist, nothing to do
    if not os.path.isfile(cur_file):
        print("No This File!")
        return

    print("[File Name]", FILE_NAME)
    print("[File Path]", CUR_PATH)
    folder_name = '.'.join(FILE_NAME.split('.')[0:-1])
    folder_path = os.path.join(CUR_PATH, folder_name)

    if os.path.exists(folder_path):
        # With auto-rewrite enabled the existing folder is reused silently.
        if FILE_EXIST_AUTO_REWRITE == 0:
            print("Folder Exist:", folder_name, "Rewrite It? [y/n]", end=" ")
            confirm = input().strip().lower()
            # Only an explicit yes continues; "N", "no", an empty answer or a
            # typo must not be treated as permission to write into the folder.
            if confirm not in ('y', 'yes'):
                return
    else:
        # folder does not exist, create it
        os.makedirs(folder_path)
        print("[New Folder]", folder_name)

    saveImg(cur_file, folder_path)
    zipImg(folder_path, folder_name)


def getTargetFile():
    """List the MHT files in the current working directory.

    An entry qualifies when its extension, as reported by
    ``os.path.splitext``, equals '.mht' ignoring case. A name without a stem
    or without an extension (such as 'mht' or '.mht') is therefore not
    selected, while 'page.MHT' is. Every match and the total count are
    printed.

    Returns:
        The matching entry names, sorted so the processing order is stable.
    """
    target_file_list = sorted(
        name for name in os.listdir(os.getcwd())
        if os.path.splitext(name)[1].lower() == '.mht'
    )
    for name in target_file_list:
        print('[Find File]', name)
    print('[File Nums]', len(target_file_list))
    return target_file_list


# Script entry point: process every MHT file found in the working directory,
# printing a progress banner before each one.
if __name__ == '__main__':
    target_file_list = getTargetFile()
    total = len(target_file_list)
    for position, mht_name in enumerate(target_file_list, start=1):
        banner = "\n------------------------------------------PROCESS [{}/{}]-----------------------------------".format(position, total)
        print(banner)
        extractFile(mht_name)