Python 3:从 Word 表格中按单元格提取图片(解析 w:tr、w:tc)
用户需求
图片展示:
另外一个是:
直接上代码
# 日期:2022-8-16
# 时间:14:38
# 用户:海依拉提江
# 项目名称:8-11
# 文件名称:csdn放入不同的文件夹
# encoding=utf-8
import os
import zipfile
import xml
from xml.dom.minidom import parse
import shutil # 引入os(文件及目录操作)、zipfile(zip文件操作)、shutil(拷贝文件)库
import docx
from win32com import client
def pic2dir(doc_dir, file):
    """Copy the images embedded in each table cell of a .docx into per-cell folders.

    The ``word/`` part of the document archive is unpacked into
    ``<doc_dir>/tmp``.  ``word/_rels/document.xml.rels`` supplies the mapping
    from relationship id to media file name, and ``word/document.xml`` is
    scanned for ``w:tc`` (table cell) elements.  Every ``a:blip`` inside the
    cell at position ``i`` (document order of ``w:tc`` elements) that carries
    an ``r:embed`` id has its image copied to ``<doc_dir>/<i>/<image name>``.

    Args:
        doc_dir: Directory that receives the temporary ``tmp`` folder and the
            numbered output folders.
        file: Path of the .docx file to read.  It is opened read-only and is
            never renamed or modified.

    Returns:
        Always ``1``.

    Raises:
        zipfile.BadZipFile, OSError, xml.parsers.expat.ExpatError: if the
            document cannot be opened, unpacked or parsed.
    """
    tmp_path = os.path.join(doc_dir, 'tmp')
    try:
        # A .docx is already a zip archive, so it is opened in place.  The
        # old rename-to-.ZIP round trip could leave the user's file renamed
        # whenever opening the archive failed.
        with zipfile.ZipFile(file, 'r') as archive:
            for member in archive.namelist():
                # Only the word/ part (document, rels, media) is needed.
                if member.startswith('word/'):
                    archive.extract(member, tmp_path)

        # Relationship id -> media file name (the "media/" prefix is removed).
        rels_dom = parse(
            os.path.join(tmp_path, 'word', '_rels', 'document.xml.rels'))
        image_names = {}
        for rel in rels_dom.documentElement.getElementsByTagName(
                'Relationship'):
            rel_id = rel.getAttribute('Id')
            image_names[rel_id] = rel.getAttribute('Target').replace(
                'media/', '')

        document_dom = parse(os.path.join(tmp_path, 'word', 'document.xml'))
        cells = document_dom.documentElement.getElementsByTagName('w:tc')
        media_dir = os.path.join(tmp_path, 'word', 'media')

        for index, cell in enumerate(cells):
            for blip in cell.getElementsByTagName('a:blip'):
                embed = blip.getAttribute('r:embed')
                name = image_names.get(embed)
                source = os.path.join(media_dir, name) if name else None
                # Skip only this picture (e.g. a linked image without
                # r:embed, or a dangling relationship) instead of giving up
                # on the rest of the cell.
                if source is None or not os.path.isfile(source):
                    print('skip image %r in cell %d: no embedded media file'
                          % (embed, index))
                    continue
                # The folder is created only once there is something to put
                # in it, so cells without usable images leave no empty dirs.
                pic_path = os.path.join(doc_dir, str(index))
                os.makedirs(pic_path, exist_ok=True)
                shutil.copy(source, os.path.join(pic_path, name))
                print(embed, index)
    finally:
        # Remove the unpacked folders even when parsing or copying failed.
        if os.path.isdir(tmp_path):
            for entry in os.listdir(tmp_path):
                entry_path = os.path.join(tmp_path, entry)
                if os.path.isdir(entry_path):
                    shutil.rmtree(entry_path)
    return 1
def doc2docx(doc_name):
    """Convert a .doc file to .docx by automating Word through COM.

    Args:
        doc_name: Path of the document to convert.

    Returns:
        The target path: ``doc_name`` with its extension replaced by
        ``.docx``.  The conversion is best-effort: if Word automation fails
        the error is printed and the target path is still returned, so the
        caller should check that the file exists before using it.
    """
    # Computed before touching COM so that the return value is defined even
    # when Word cannot be started (the old code raised UnboundLocalError).
    docx_name = os.path.splitext(doc_name)[0] + '.docx'
    word = None
    doc = None
    try:
        word = client.Dispatch("Word.Application")
        doc = word.Documents.Open(doc_name)
        # File format 16 makes Word save the document as .docx.
        doc.SaveAs(docx_name, 16)
    except Exception as err:
        # Keep the best-effort contract, but do not hide the failure.
        print(err)
    finally:
        # Always release the document and the Word process; otherwise a
        # failed conversion leaves Word running in the background.
        if doc is not None:
            try:
                doc.Close()
            except Exception as err:
                print(err)
        if word is not None:
            try:
                word.Quit()
            except Exception as err:
                print(err)
    return docx_name
# Word document whose table images are to be extracted.
file = 'D:/桌面/word图片分组/docx_file/test.docx'
# Folder handed to pic2dir as the output root; the trailing slash is part of
# the value.
doc_dir = 'D:/桌面/word图片分组/docx_file/'
# Run the extraction and keep pic2dir's return value.
count = pic2dir(doc_dir=doc_dir, file=file)