一、使用的主要模块
mammoth、markdownify、time、pathlib、os
二、代码
2.1 将word转变为HTML
def wordToHtml(filename):
    """Convert ``<filename>.docx`` to ``<filename>.html`` using mammoth.

    Args:
        filename: Path of the Word document without the ``.docx`` extension.
            The HTML output is written to the same path with ``.html``
            appended instead.
    """
    with open(filename + ".docx", "rb") as docx_file:
        result = mammoth.convert_to_html(docx_file)
    # Write UTF-8 explicitly: the platform default encoding (e.g. a legacy
    # code page on Windows) can fail on, or garble, non-ASCII document text.
    with open(filename + ".html", "w", encoding="utf-8") as html_file:
        html_file.write(result.value)
2.2 将word转变为MD
def wordToMd(filename):
    """Convert ``<filename>.docx`` to ``<filename>.md`` using mammoth.

    Args:
        filename: Path of the Word document without the ``.docx`` extension.
            The Markdown output is written to the same path with ``.md``
            appended instead.
    """
    # NOTE(review): mammoth's documentation describes its built-in Markdown
    # output as deprecated in favour of HTML + a separate converter -- confirm
    # against the installed mammoth version before relying on this path.
    with open(filename + ".docx", "rb") as docx_file:
        result = mammoth.convert_to_markdown(docx_file)
    # Write UTF-8 explicitly: the platform default encoding (e.g. a legacy
    # code page on Windows) can fail on, or garble, non-ASCII document text.
    with open(filename + ".md", "w", encoding="utf-8") as markdown_file:
        markdown_file.write(result.value)
2.3 将含有图片的 Word 文档转为 MD
def MD(filename, output_path="./docx_to_md.md"):
    """Convert ``<filename>.docx`` to a Markdown file, exporting its images.

    The document is first converted to HTML by mammoth, with ``convert_img``
    supplied as the image converter via ``mammoth.images.img_element``. The
    resulting HTML is then converted to Markdown by ``markdownify`` using
    ATX-style headings and written out as UTF-8.

    Args:
        filename: Path of the Word document without the ``.docx`` extension.
        output_path: Where to write the Markdown file. Defaults to
            ``"./docx_to_md.md"``, the location that used to be hard-coded.

    Returns:
        ``result.messages`` from the mammoth conversion, so callers can
        inspect what mammoth reported instead of it being discarded.
    """
    with open(filename + ".docx", "rb") as docx_file:
        result = mammoth.convert_to_html(
            docx_file,
            convert_image=mammoth.images.img_element(convert_img),
        )

    md = markdownify(result.value, heading_style="ATX")
    with open(output_path, "w", encoding="utf-8") as md_file:
        md_file.write(md)

    return result.messages
def convert_img(image):
    """Save one embedded image under ``./img`` and return its ``src``.

    Args:
        image: Image object exposing ``content_type`` (a ``type/subtype``
            string) and ``open()`` returning a context-managed binary
            stream, as used below.

    Returns:
        A dict ``{"src": path}`` where ``path`` is the relative path of the
        file that was written.
    """
    # The part after the slash of the content type is used as the extension.
    file_suffix = image.content_type.split("/")[1]

    # exist_ok avoids the separate "is it there yet?" check and its race.
    os.makedirs("img", exist_ok=True)

    with image.open() as image_bytes:
        data = image_bytes.read()

    stamp = str(time.time())
    path_file = "./img/{}.{}".format(stamp, file_suffix)
    attempt = 0
    while True:
        try:
            # "x" mode refuses to overwrite: two images saved within the same
            # clock tick would otherwise share a name and clobber each other.
            with open(path_file, "xb") as f:
                f.write(data)
            break
        except FileExistsError:
            attempt += 1
            path_file = "./img/{}_{}.{}".format(stamp, attempt, file_suffix)

    return {"src": path_file}