python之调用pdf2docx
pdf2docx
支持Windows和Linux平台,要求Python版本>=3.6
下载
pip install pdf2docx
使用
可以使用类 Converter 和方法 parse()。
使用类Converter
Sample One
from pdf2docx import Converter
pdf_file = '/path/to/sample.pdf'
docx_file = 'path/to/sample.docx'

# Convert the PDF to DOCX. The try/finally makes sure the converter is
# closed even when the conversion itself raises.
cv = Converter(pdf_file)
try:
    cv.convert(docx_file)  # all pages by default
finally:
    cv.close()
Sample Two
import os
from pdf2docx import Converter
def ConverterByFolder(sourcePath,targetPath):
    """Convert each ``.pdf`` entry found directly in ``sourcePath`` to ``.docx``.

    Entries are listed with ``os.listdir`` and split into (stem, extension)
    with ``os.path.splitext`` (for example ``('fun', '.png')``). Only entries
    whose extension is exactly ``.pdf`` are converted; the result is written
    to ``targetPath`` under the same stem with a ``.docx`` extension. A
    progress line is printed after each converted file.

    Args:
        sourcePath: Directory whose entries are scanned for ``.pdf`` files.
        targetPath: Directory that receives the generated ``.docx`` files.

    Errors raised by ``os.listdir`` or by the converter are not caught here
    and propagate to the caller.
    """
    docx_suffix = '.docx'
    for entry in os.listdir(sourcePath):
        stem, extension = os.path.splitext(entry)
        if extension != ".pdf":
            continue
        # os.path.join picks the separator of the running platform, so the
        # paths are valid on Linux as well as on Windows.
        pdf_file = os.path.join(sourcePath, entry)
        docx_file = os.path.join(targetPath, stem + docx_suffix)
        cv = Converter(pdf_file)
        try:
            cv.convert(docx_file)  # all pages by default
        finally:
            # Close the converter even if this file failed to convert.
            cv.close()
        print(entry,"=="*10,'>',"done")
if __name__ == '__main__':
    # Raw strings keep the Windows backslash literal; in a normal string
    # "\D" is an invalid escape sequence and triggers a warning.
    ConverterByFolder(r"F:\Desktop", r"F:\Desktop")
使用方法parse()
Sample One
from pdf2docx import parse
# Output DOCX path and input PDF path for the one-call conversion below.
docx_file = 'path/to/sample.docx'
pdf_file = '/path/to/sample.pdf'

# parse() takes the PDF path first and the DOCX path second.
parse(pdf_file, docx_file)
Sample Two
import os
from pdf2docx import parse
def ConverterByFolder_parse(sourcePath,targetPath):
    """Convert each ``.pdf`` entry found directly in ``sourcePath`` using ``parse``.

    Entries are listed with ``os.listdir`` and split into (stem, extension)
    with ``os.path.splitext`` (for example ``('fun', '.png')``). Only entries
    whose extension is exactly ``.pdf`` are converted; the result is written
    to ``targetPath`` under the same stem with a ``.docx`` extension. A
    progress line is printed after each converted file.

    Args:
        sourcePath: Directory whose entries are scanned for ``.pdf`` files.
        targetPath: Directory that receives the generated ``.docx`` files.

    Errors raised by ``os.listdir`` or by ``parse`` are not caught here and
    propagate to the caller.
    """
    docx_suffix = '.docx'
    for entry in os.listdir(sourcePath):
        stem, extension = os.path.splitext(entry)
        if extension != ".pdf":
            continue
        # os.path.join picks the separator of the running platform, so the
        # paths are valid on Linux as well as on Windows.
        pdf_file = os.path.join(sourcePath, entry)
        docx_file = os.path.join(targetPath, stem + docx_suffix)
        parse(pdf_file,docx_file)
        print(entry,"=="*10,'>',"done")
if __name__ == '__main__':
    # Raw strings keep the Windows backslash literal; in a normal string
    # "\D" is an invalid escape sequence and triggers a warning.
    ConverterByFolder_parse(r"F:\Desktop", r"F:\Desktop")