Windows 系统下提取 doc、docx、ppt、pptx 中的图片，并将 ppt 转为 pptx、doc 转为 docx

最新推荐文章于 2024-08-30 11:37:19 发布

To无忧

最新推荐文章于 2024-08-30 11:37:19 发布

阅读量188

点赞数

文章标签： powerpoint python 开发语言

本文链接：https://blog.csdn.net/qq_46274532/article/details/131742499

版权

import argparse
import json
import os
from docx import Document
from pptx import Presentation
from win32com import client


class Filter:
    """Extract the embedded images of a Word or PowerPoint file.

    Images are written to a directory named after the target file (its path
    without the extension) as ``1.<ext>``, ``2.<ext>``, ... and the list of
    written paths is dumped to a sibling ``<name>.json`` file.

    Legacy ``.doc`` / ``.ppt`` files are first converted to ``.docx`` /
    ``.pptx`` by driving the installed Office application through COM, so
    those two formats only work on Windows with Office available.
    """

    # File extensions (lower-case, with the dot) that can be processed.
    SUPPORTED_SUFFIXES = ('.doc', '.docx', '.ppt', '.pptx')
    # Image extensions (lower-case, without the dot) exported from .docx.
    IMAGE_SUFFIXES = ('png', 'jpg', 'gif', 'jpeg')
    # Word WdSaveFormat code for the default document format (.docx).
    WD_FORMAT_DOCX = 16
    # PowerPoint PpSaveAsFileType code for an Open XML presentation (.pptx).
    PP_FORMAT_PPTX = 24

    def __init__(self, target_file: str) -> None:
        """Remember *target_file* and prepare the output locations."""
        self.image_index = 0       # number of the last image file name used
        self.target_file = target_file
        self.output_dir = ''       # directory receiving the image files
        self.output_json = ''      # path of the JSON listing
        self.output_data = []      # paths of the images actually written
        self.suffix = ''           # lower-cased extension of the target
        self.set_output()

    def set_output(self) -> None:
        """Derive the suffix, output directory and JSON path from the target.

        ``os.path.splitext`` is used so that dots in directory names and
        files without any extension are handled correctly. The suffix is
        lower-cased so that e.g. ``.DOCX`` is recognised as well.
        """
        root, ext = os.path.splitext(self.target_file)
        self.suffix = ext.lower()

        self.output_dir = root
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        self.output_json = root + '.json'

    @staticmethod
    def _converted_path(path: str, new_suffix: str) -> str:
        """Return *path* with only its extension replaced by *new_suffix*."""
        return os.path.splitext(path)[0] + new_suffix

    def _save_image(self, blob: bytes, suffix: str) -> None:
        """Write *blob* to the next numbered file and record its path.

        The counter is advanced before writing, so a failed write leaves a
        gap in the numbering rather than reusing the number.
        """
        self.image_index += 1
        file_path = os.path.join(self.output_dir, str(self.image_index) + suffix)
        try:
            with open(file_path, 'wb') as f:
                f.write(blob)
        except OSError as e:
            print(e)
            return
        self.output_data.append(file_path)

    def filter_image(self) -> None:
        """Extract all images from the target and write the JSON listing.

        Unsupported file types are reported on stdout and skipped. The JSON
        file is only written when at least one image was saved.
        """
        if self.suffix not in self.SUPPORTED_SUFFIXES:
            print('该文件不可提取图片！文件名:' + self.target_file + '。\n')
            return

        # The legacy formats convert first and then extract by themselves.
        handlers = {
            '.doc': self.convertDoc,
            '.ppt': self.convertPpt,
            '.docx': self.get_image_docx,
            '.pptx': self.get_image_pptx,
        }
        handlers[self.suffix]()

        if self.output_data:
            self.write_json()

    def convertDoc(self) -> None:
        """Convert the ``.doc`` target to ``.docx`` via Word, then extract.

        Any failure is printed instead of raised. The Word document and
        application are always closed, even when opening or saving fails,
        so no Word process is left behind.
        """
        # The Office process has its own working directory, so hand it an
        # absolute path.
        source = os.path.abspath(self.target_file)
        new_docx_file = self._converted_path(source, '.docx')
        try:
            word_app = client.Dispatch('Word.Application')
            try:
                word_app.DisplayAlerts = False
                doc = word_app.Documents.Open(source)
                try:
                    doc.SaveAs(new_docx_file, self.WD_FORMAT_DOCX)
                finally:
                    doc.Close()
            finally:
                word_app.Quit()
            # Conversion succeeded: extract from the new file.
            self.target_file = new_docx_file
            self.get_image_docx()
        except Exception as e:
            print(e)

    def convertPpt(self) -> None:
        """Convert the ``.ppt`` target to ``.pptx`` via PowerPoint, then extract.

        Any failure is printed instead of raised. The presentation and the
        PowerPoint application are always closed, even when opening or
        saving fails.
        """
        source = os.path.abspath(self.target_file)
        new_pptx_file = self._converted_path(source, '.pptx')
        try:
            ppt_app = client.Dispatch('Powerpoint.Application')
            try:
                # NOTE(review): presumably needed because PowerPoint may
                # refuse to open presentations while hidden - confirm.
                ppt_app.Visible = 1
                ppt = ppt_app.Presentations.Open(source)
                try:
                    ppt.SaveAs(new_pptx_file, self.PP_FORMAT_PPTX)
                finally:
                    ppt.Close()
            finally:
                ppt_app.Quit()
            # Conversion succeeded: extract from the new file.
            self.target_file = new_pptx_file
            self.get_image_pptx()
        except Exception as e:
            print(e)

    def get_image_docx(self) -> None:
        """Save every png/jpg/jpeg/gif part referenced by the ``.docx`` target.

        Only relationships whose target lies under ``media`` or
        ``embeddings`` are considered.
        """
        doc = Document(self.target_file)
        for rel in doc.part.rels.values():
            target = str(rel.target_ref)
            if not target.startswith(('media', 'embeddings')):
                continue

            # Text after the last dot (the whole name when there is none).
            file_suffix = target.rpartition('.')[2]
            if file_suffix.lower() not in self.IMAGE_SUFFIXES:
                continue

            self._save_image(rel.target_part.blob, '.' + file_suffix)

    def get_image_pptx(self) -> None:
        """Save the image of every top-level picture shape on every slide.

        The file extension is the subtype of the image content type, e.g.
        ``image/png`` gives ``.png``.
        """
        prs = Presentation(self.target_file)
        for slide in prs.slides:
            for shape in slide.shapes:
                try:
                    image = shape.image
                    content_type = image.content_type
                    blob = image.blob
                except AttributeError:
                    # Shapes that are not pictures expose no image; skip.
                    continue
                except Exception as e:
                    # A picture whose image cannot be read: report and go on.
                    print(e)
                    continue

                suffix = '.' + content_type[content_type.find('/') + 1:]
                self._save_image(blob, suffix)

    def write_json(self) -> None:
        """Dump the list of written image paths to the JSON file (UTF-8)."""
        with open(self.output_json, 'w', encoding='utf-8') as f:
            json.dump(self.output_data, ensure_ascii=False, indent=4, fp=f)


if __name__ == '__main__':
    # Command-line entry point: read the mandatory --file option and run the
    # image extraction on it, or report that the path does not exist.
    cli = argparse.ArgumentParser(description='set target file')
    cli.add_argument('--file', type=str, required=True, help='文件绝对路径')
    target = cli.parse_args().file

    if os.path.exists(target):
        Filter(target).filter_image()
    else:
        print('文件不存在!')