python docx 提取图片_使用Python从Word和Excel中提取图片

1586010002-jmsa.png

I was searching for a way to strip out pictures from these file types and this is the solution I came up with. It iterates through a given directory structure, copies any files with the proper extension, and renames the copy to filename.zip. Then it navigates through the zip structure and extracts all picture type files with the proper extension, and renames them to the original file name, with a number for uniqueness. Finally, it deletes the extracted directory trees it created.

Extracting pictures from text documents is part of my job, so this will actually save my company thousands of hours in the long run.

All of the code is below, and what I'm really asking is: Is there a better way? Is there something more efficient? Can it be scaled to include other formats? Could the text also be extracted into a .txt file, since Notepad loads much faster than Word?

This solution works on my Linux machine, and I can extract the pictures, but I've yet to test on a Windows system.

#!/usr/bin/python3

import shutil

import os

import zipfile

def zipDoc(aFile, dirPath):
    """Copy a document to a sibling ``.zip`` file and open that copy.

    Args:
        aFile: File name (not a full path) of the document inside ``dirPath``.
        dirPath: Directory that contains ``aFile``. A trailing path separator
            is optional.

    Returns:
        An open ``zipfile.ZipFile`` on the copy. Its ``filename`` attribute is
        the path of the copy, so the caller can delete it when finished.
    """
    # splitext strips only the final extension, so "my.report.docx" becomes
    # "my.report.zip"; cutting at the first dot would produce "my.zip".
    shortFN = os.path.splitext(aFile)[0]
    zipName = os.path.join(dirPath, shortFN + ".zip")

    # copy2 duplicates the file contents (and metadata) under the .zip name
    shutil.copy2(os.path.join(dirPath, aFile), zipName)

    return zipfile.ZipFile(zipName)

def hasPicExtension(aFile):
    """Return True if ``aFile`` ends with a known picture extension.

    The check is case-insensitive, so ``.JPG``, ``.Jpeg`` and ``.png`` are all
    recognised.

    Args:
        aFile: File name or archive member name to test.

    Returns:
        True for names ending in .jpeg, .jpg, .png or .bmp, otherwise False.
    """
    # str.endswith accepts a tuple of candidate suffixes. Lower-casing the
    # name once replaces a hand-maintained list of upper-case duplicates.
    picEndings = (".jpeg", ".jpg", ".png", ".bmp")
    return aFile.lower().endswith(picEndings)

def delDOCXEvidence(somePath):
    """Remove the empty ``word/media`` and ``word`` directories under ``somePath``.

    These directories are left behind after pictures have been extracted from
    a .docx archive and moved elsewhere. A directory that does not exist is
    skipped; a directory that still has contents makes ``os.rmdir`` raise
    ``OSError``.

    Args:
        somePath: Directory into which the archive members were extracted.
    """
    # os.path.join inserts the platform's separator, so one code path serves
    # both Linux and Windows.
    wordDir = os.path.join(somePath, "word")

    # Innermost directory first: os.rmdir only removes empty directories.
    for leftover in (os.path.join(wordDir, "media"), wordDir):
        try:
            os.rmdir(leftover)
        except FileNotFoundError:
            # Nothing was extracted here, e.g. the document had no pictures.
            pass

def delXLSXEvidence(somePath):
    """Remove the empty ``xl/media`` and ``xl`` directories under ``somePath``.

    These directories are left behind after pictures have been extracted from
    an .xlsx archive and moved elsewhere. A directory that does not exist is
    skipped; a directory that still has contents makes ``os.rmdir`` raise
    ``OSError``.

    Args:
        somePath: Directory into which the archive members were extracted.
    """
    # os.path.join inserts the platform's separator, so one code path serves
    # both Linux and Windows.
    xlDir = os.path.join(somePath, "xl")

    # Innermost directory first: os.rmdir only removes empty directories.
    for leftover in (os.path.join(xlDir, "media"), xlDir):
        try:
            os.rmdir(leftover)
        except FileNotFoundError:
            # Nothing was extracted here, e.g. the workbook had no pictures.
            pass

def extractPicsFromDir(dirPath=""):
    """Extract the pictures embedded in every .docx and .xlsx file of a directory.

    Only the files directly inside ``dirPath`` are examined (no recursion).
    Each picture is written into ``dirPath`` as ``"<document stem> - <n>"``,
    where ``n`` counts the pictures of that document starting at 1. The
    picture's own extension is not part of the output name.

    Pictures are streamed straight out of the archive, so no temporary .zip
    copy or extracted directory tree is created, and the source documents are
    left untouched.

    Args:
        dirPath: Directory to scan. A trailing path separator is optional.

    Raises:
        SystemExit: With status 1, after printing a message, when ``dirPath``
            is not a directory.
    """
    if not os.path.isdir(dirPath):
        print("Not a directory path!")
        raise SystemExit(1)

    for dirFileName in os.listdir(dirPath):
        if not dirFileName.endswith((".docx", ".xlsx")):
            continue

        # Strip only the final extension so that "a.b.docx" and "a.c.docx"
        # do not both map to the stem "a" and overwrite each other's output.
        stem = os.path.splitext(dirFileName)[0]

        try:
            archive = zipfile.ZipFile(os.path.join(dirPath, dirFileName))
        except zipfile.BadZipFile:
            # e.g. an Office lock file ("~$name.docx") or a corrupt document;
            # skip it instead of aborting the whole directory scan.
            print("Skipping " + dirFileName + ": not a valid Office archive")
            continue

        # The with-block closes the handle deterministically; an open handle
        # keeps the file locked on Windows.
        with archive:
            picNum = 1  # running number of the next picture of this document
            for zippedFile in archive.namelist():
                if not hasPicExtension(zippedFile):
                    continue
                target = os.path.join(dirPath, stem + " - " + str(picNum))
                with archive.open(zippedFile) as src, open(target, "wb") as dst:
                    shutil.copyfileobj(src, dst)
                picNum += 1

# Ask the user which directory to scan, then extract the pictures found in it.
uDir = input("Enter your directory: ")
extractPicsFromDir(dirPath=uDir)

解决方案

Excel (.xlsx) and Word (.docx) files are ZIP archives, so it is easy to extract the images they contain:

import zipfile

from PIL import Image, ImageFilter

import io

# Gaussian blur filter with radius 40; redact_images below does not apply it.
blur = ImageFilter.GaussianBlur(radius=40)

def redact_images(filename, FilePath):
    """Save the images embedded in an Office file and write a re-encoded copy.

    Every ``.png``, ``.jpeg`` or ``.gif`` member of the ZIP archive
    ``filename`` is decoded with Pillow, saved into ``FilePath`` under its
    base name, and re-encoded in the same format. All members (re-encoded
    images and everything else) are written to a new archive next to the
    input, named ``<stem>_redacted<ext>``.

    NOTE(review): despite the name, no blur or other filter is applied to the
    images in this function.

    Args:
        filename: Path of the input file.
        FilePath: Directory in which the extracted images are saved. A
            trailing path separator is optional.
    """
    import os  # imported locally so the function does not rely on an earlier import

    # Derive the output name from the real extension. str.replace(".xlsx", ...)
    # returns the input unchanged for any other extension, which would make
    # the function open its own input file for writing.
    root, ext = os.path.splitext(filename)
    outfile = root + "_redacted" + ext

    with zipfile.ZipFile(filename) as inzip, zipfile.ZipFile(outfile, "w") as outzip:
        for info in inzip.infolist():
            name = info.filename
            content = inzip.read(info)

            if name.endswith((".png", ".jpeg", ".gif")):
                fmt = name.split(".")[-1]  # extension doubles as the Pillow format name
                img = Image.open(io.BytesIO(content))

                # Save the image under its base name in the output directory.
                img.save(os.path.join(FilePath, name.split("/")[-1]))

                # Re-encode into memory so the copy carries the new bytes.
                outb = io.BytesIO()
                img.save(outb, fmt)
                content = outb.getvalue()

            # writestr() records the size and CRC of the data it is given, so
            # the ZipInfo taken from the input needs no manual patching.
            outzip.writestr(info, content)

filename: path of the input Excel file

FilePath: directory in which the extracted images are saved

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值