import argparse
import json
import os
from docx import Document
from pptx import Presentation
from win32com import client
class Filter:
    """Extract embedded images from a Word or PowerPoint file.

    For a target such as ``dir/report.docx`` the images are written into the
    directory ``dir/report`` as ``1.png``, ``2.jpg``, ... and the list of
    written paths is dumped to ``dir/report.json``.  Legacy ``.doc`` / ``.ppt``
    files are first converted to ``.docx`` / ``.pptx`` through COM automation
    (``win32com``) and then processed like the modern formats.

    Attributes:
        image_index: Number of images numbered so far (also the last file stem).
        target_file: Path of the file being processed; replaced by the
            converted file's path after a ``.doc`` / ``.ppt`` conversion.
        output_dir: Directory that receives the extracted images.
        output_json: Path of the JSON file listing the extracted images.
        output_data: Paths of the images written successfully.
        suffix: Lower-cased extension of the original target (with the dot).
    """

    # Image extensions exported from a .docx package (compared lower-cased).
    _DOCX_IMAGE_SUFFIXES = ('.png', '.jpg', '.gif', '.jpeg')

    def __init__(self, target_file: str) -> None:
        """Remember *target_file* and derive the output locations from it."""
        self.image_index = 0
        self.target_file = target_file
        self.output_dir = ''
        self.output_json = ''
        self.output_data = []
        self.suffix = ''
        self.set_output()

    @staticmethod
    def _replace_suffix(path: str, new_suffix: str) -> str:
        """Return *path* with only its final extension replaced by *new_suffix*."""
        return os.path.splitext(path)[0] + new_suffix

    def set_output(self) -> None:
        """Derive the suffix, output directory and JSON path from the target.

        The output directory is created if nothing exists at that path yet.
        """
        # splitext only looks at the last path component, so a dot inside a
        # directory name (or a file with no extension) cannot shift the split.
        stem, extension = os.path.splitext(self.target_file)
        # Lower-cased so that e.g. ``REPORT.DOCX`` is recognised as well.
        self.suffix = extension.lower()
        self.output_dir = stem
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir, exist_ok=True)
        self.output_json = stem + '.json'

    def filter_image(self) -> None:
        """Extract the images of the target file and write the JSON index.

        Unsupported file types only print a message.  The JSON file is written
        only when at least one image was saved.
        """
        handlers = {
            '.doc': self.convertDoc,
            '.ppt': self.convertPpt,
            '.docx': self.get_image_docx,
            '.pptx': self.get_image_pptx,
        }
        handler = handlers.get(self.suffix)
        if handler is None:
            print('该文件不可提取图片!文件名:' + self.target_file + '。\n')
            return
        handler()
        if self.output_data:
            self.write_json()

    def convertDoc(self) -> None:
        """Convert a ``.doc`` target to ``.docx`` via Word, then extract images.

        Any failure (COM error, unreadable document, ...) is printed rather
        than raised, so the conversion stays best-effort.
        """
        try:
            # Replace only the extension; a plain str.replace('doc', 'docx')
            # would also rewrite "doc" inside directory or file names.
            new_docx_file = self._replace_suffix(self.target_file, '.docx')
            # NOTE(review): presumably requires a local Microsoft Word install.
            word_app = client.Dispatch('Word.Application')
            try:
                word_app.DisplayAlerts = False
                # Absolute paths: Word would otherwise resolve relative paths
                # against its own working directory, not this script's.
                doc = word_app.Documents.Open(os.path.abspath(self.target_file))
                try:
                    # 16 is the Word save-format code for DOCX.
                    doc.SaveAs(os.path.abspath(new_docx_file), 16)
                finally:
                    doc.Close()
            finally:
                # Always quit, otherwise a failed conversion leaves an
                # orphaned Word process behind.
                word_app.Quit()
            # Conversion finished: extract from the new file.
            self.target_file = new_docx_file
            self.get_image_docx()
        except Exception as e:
            print(e)

    def convertPpt(self) -> None:
        """Convert a ``.ppt`` target to ``.pptx`` via PowerPoint, then extract.

        Any failure (COM error, unreadable presentation, ...) is printed
        rather than raised, so the conversion stays best-effort.
        """
        try:
            # Replace only the extension; a plain str.replace('ppt', 'pptx')
            # would also rewrite "ppt" inside directory or file names.
            new_pptx_file = self._replace_suffix(self.target_file, '.pptx')
            # NOTE(review): presumably requires a local PowerPoint install.
            ppt_app = client.Dispatch('Powerpoint.Application')
            try:
                ppt_app.Visible = 1
                # Absolute paths: PowerPoint would otherwise resolve relative
                # paths against its own working directory.
                ppt = ppt_app.Presentations.Open(os.path.abspath(self.target_file))
                try:
                    # 24 is the PowerPoint save-format code for PPTX.
                    ppt.SaveAs(os.path.abspath(new_pptx_file), 24)
                finally:
                    ppt.Close()
            finally:
                # Always quit, otherwise a failed conversion leaves an
                # orphaned PowerPoint process behind.
                ppt_app.Quit()
            # Conversion finished: extract from the new file.
            self.target_file = new_pptx_file
            self.get_image_pptx()
        except Exception as e:
            print(e)

    def _save_image(self, blob: bytes, suffix: str) -> None:
        """Write *blob* to the next numbered file in the output directory.

        The counter advances even when the write fails; only successfully
        written paths are recorded in ``output_data``.
        """
        self.image_index += 1
        file_path = os.path.join(self.output_dir, str(self.image_index) + suffix)
        try:
            with open(file_path, "wb") as f:
                f.write(blob)
        except OSError as e:
            print(e)
            return
        self.output_data.append(file_path)

    def get_image_docx(self) -> None:
        """Save every png/jpg/jpeg/gif part referenced by the Word document."""
        doc = Document(self.target_file)
        # Walk the relationships of the main document part.
        for rel in doc.part.rels.values():
            target = str(rel.target_ref)
            # Only parts stored under media/ or embeddings/ are of interest.
            if not target.startswith(('media', 'embeddings')):
                continue
            # Skip anything that is not one of the wanted image types.
            file_suffix = os.path.splitext(target)[1]
            if file_suffix.lower() not in self._DOCX_IMAGE_SUFFIXES:
                continue
            try:
                blob = rel.target_part.blob
            except Exception as e:
                # Best effort: report a relationship whose part cannot be
                # read and carry on with the remaining images.
                print(e)
                continue
            self._save_image(blob, file_suffix)

    def get_image_pptx(self) -> None:
        """Save the image of every top-level shape that carries one."""
        prs = Presentation(self.target_file)
        for slide in prs.slides:
            for obj in slide.shapes:
                try:
                    image = obj.image
                    imdata = image.blob
                    imagetype = image.content_type
                except Exception:
                    # Shapes without a usable embedded image are skipped.
                    # ``Exception`` instead of a bare ``except`` so that
                    # KeyboardInterrupt / SystemExit still propagate.
                    continue
                # File extension is the subtype of the MIME content type,
                # e.g. "image/png" -> ".png".
                imtype = '.' + imagetype.split('/', 1)[-1]
                self._save_image(imdata, imtype)

    def write_json(self) -> None:
        """Dump the list of extracted image paths to ``output_json`` (UTF-8)."""
        with open(self.output_json, "w", encoding='utf-8') as f:
            json.dump(self.output_data, ensure_ascii=False, indent=4, fp=f)
if __name__ == '__main__':
    # Command-line entry point: --file names the document to process.
    arg_parser = argparse.ArgumentParser(description='set target file')
    arg_parser.add_argument('--file', type=str, required=True, help='文件绝对路径')
    target_path = arg_parser.parse_args().file
    if os.path.exists(target_path):
        # Build the extractor for the given file and run it.
        Filter(target_path).filter_image()
    else:
        # Nothing to do when the path does not exist; just report it.
        print('文件不存在!')
# Dependencies: python-docx, python-pptx, pywin32 (provides win32com)