python3 word表格中提取图片（w:tr,w:tc）

海哥0998

已于 2022-09-26 18:35:52 修改

阅读量578

点赞数 2

文章标签： python

于 2022-09-26 18:33:55 首次发布

@hayratjan

本文链接：https://blog.csdn.net/hayratjanmmmm/article/details/127058328

版权

python3 word表格中提取图片（w:tr,w:tc）

用户需求

图片展示：
在这里插入图片描述
另外一个是：

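直接上代码（思路：docx 文件本质上是一个 zip 包；先从 word/_rels/document.xml.rels 中读取关系 Id（rId）与图片文件的对应关系，再遍历 word/document.xml 中的每个 w:tc 单元格，把单元格内 a:blip 引用的图片复制到以单元格序号命名的文件夹中）：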

# 日期：2022-8-16
# 时间：14:38
# 用户：海依拉提江
# 项目名称：8-11
# 文件名称：csdn放入不同的文件夹
# encoding=utf-8
import os
import zipfile
import xml
from xml.dom.minidom import parse
import shutil  # 引入os（文件及目录操作）、zipfile（zip文件操作）、shutil(拷贝文件)库

import docx
from win32com import client


def _parse_zip_xml(archive, member):
    """Parse the XML part *member* of an open ZipFile and return its root element."""
    if member not in archive.namelist():
        # Keep the exception type callers saw when the part was read from disk.
        raise FileNotFoundError(member)
    with archive.open(member) as part:
        return parse(part).documentElement


def pic2dir(doc_dir, file):
    """Copy the images embedded in each table cell of a .docx into per-cell folders.

    The .docx is read directly as a zip archive: it is never renamed and
    nothing is extracted to a temporary folder. Every ``w:tc`` element in
    ``word/document.xml`` is numbered in document order (starting at 0); the
    images referenced by ``a:blip`` elements inside cell *i* are written to
    ``<doc_dir>/<i>/<image file name>``. Folders are only created for cells
    that actually yield an image.

    Args:
        doc_dir: Directory under which the numbered output folders are created.
        file: Path of the .docx file to read.

    Returns:
        Always 1.

    Raises:
        FileNotFoundError: If *file*, or one of the two required XML parts
            inside it, does not exist.
        zipfile.BadZipFile: If *file* is not a zip archive.
    """
    with zipfile.ZipFile(file, 'r') as archive:
        # Relationship Id (e.g. "rId5") -> Target (e.g. "media/image1.png").
        rels_root = _parse_zip_xml(archive, 'word/_rels/document.xml.rels')
        targets = {}
        for rel in rels_root.getElementsByTagName('Relationship'):
            targets[rel.getAttribute('Id')] = rel.getAttribute('Target')

        doc_root = _parse_zip_xml(archive, 'word/document.xml')

        for i, cell in enumerate(doc_root.getElementsByTagName('w:tc')):
            for blip in cell.getElementsByTagName('a:blip'):
                embed = blip.getAttribute('r:embed')
                try:
                    target = targets[embed]
                    # Targets are relative to the "word/" folder unless they
                    # start with "/", in which case they are package-absolute.
                    if target.startswith('/'):
                        member = target.lstrip('/')
                    else:
                        member = 'word/' + target
                    pic_path = os.path.join(doc_dir, str(i))
                    # Open the source first so that a dangling reference does
                    # not leave an empty output folder behind.
                    with archive.open(member) as src:
                        os.makedirs(pic_path, exist_ok=True)
                        out_name = os.path.join(
                            pic_path, member.rsplit('/', 1)[-1])
                        with open(out_name, 'wb') as dst:
                            shutil.copyfileobj(src, dst)
                except (KeyError, OSError) as err:
                    # Best effort: report the broken reference and carry on
                    # with the remaining images of this and later cells.
                    print(err)
                    continue
                print(embed, i)

    return 1


def doc2docx(doc_name):
    """Convert a .doc file to .docx through Word automation (best effort).

    The output path is *doc_name* with its extension replaced by ``.docx``.
    Conversion errors are printed rather than raised, so the returned path is
    not guaranteed to exist afterwards.

    Args:
        doc_name: Path of the document to convert.

    Returns:
        The path the .docx file is saved to.
    """
    # Compute the result up front so it is defined even if automation fails.
    docx_name = os.path.splitext(doc_name)[0] + '.docx'
    word = None
    doc = None
    try:
        word = client.Dispatch("Word.Application")
        doc = word.Documents.Open(doc_name)
        # File format 16 makes Word save the document as .docx.
        doc.SaveAs(docx_name, 16)
    except Exception as err:
        # Keep the original best-effort contract, but do not hide the error.
        print(err)
    finally:
        # Always release the document and the Word process, even on failure,
        # so no background Word instance is left running.
        if doc is not None:
            try:
                doc.Close()
            except Exception as err:
                print(err)
        if word is not None:
            try:
                word.Quit()
            except Exception as err:
                print(err)
    return docx_name


# Example run: split the images of test.docx into one folder per table cell.
# doc_dir must end with a path separator because the file path is built by
# plain string concatenation.
doc_dir = 'D:/桌面/word图片分组/docx_file/'
file = doc_dir + 'test.docx'
count = pic2dir(doc_dir=doc_dir, file=file)