python把文本转换为html_使用Python3将Markdown(.md)文本转换成 html、pdf

import getopt
import os
import re
import sys
from enum import Enum
from functools import reduce
from subprocess import call

from docopt import docopt

__version__ = '1.0'

# Three enum classes describe the parser's state machines.
# Table-parsing state.
class TABLE(Enum):
    """States of the table recogniser driven by ``test_state``."""

    # No table is in progress.
    Init = 1
    # A pipe-separated line has been buffered as a possible header row;
    # the next line decides whether it really starts a table.
    Format = 2
    # The header separator was seen; following pipe rows are table rows.
    Table = 3

# Ordered-list state.
class ORDERLIST(Enum):
    """States of the ordered-list recogniser driven by ``test_state``."""

    # Not inside an ordered list.
    Init = 1
    # Inside an ordered list; the list is closed by the first line that is
    # not a numbered item.
    List = 2

# Fenced-block state.
class BLOCK(Enum):
    """States of the fenced-block recogniser driven by ``test_state``."""

    # Not inside a fenced block.
    Init = 1
    # Inside a fenced block (plain or language-tagged).
    Block = 2
    # Declared but not referenced anywhere in the visible code.
    CodeBlock = 3

# Module-wide parser state; every state machine starts in its initial state.
table_state = TABLE.Init
orderList_state = ORDERLIST.Init
block_state = BLOCK.Init

# Set while the currently open fenced block was opened with a language tag.
is_code = False
# Reset to True by parse() for each line; parse() only appends a line break
# to the converted line while this is still True.
is_normal = True

# Cells of a buffered line that may turn out to be a table header.
temp_table_first_line = []

# Raw text of the buffered table-header candidate.
temp_table_first_line_str = ""
# Set to True once a line containing a $...$ span has been seen.
need_mathjax = False


def test_state(input):
    """Run one Markdown line through the block, list and table state machines.

    The line is rewritten to HTML when one of the three machines recognises
    it, and the module-level state variables are updated accordingly.

    NOTE(review): the HTML tag literals of this function were missing from
    the source text (only empty strings were left behind).  The tags used
    below are a reconstruction; confirm them against the intended output.

    Args:
        input: One line of the Markdown document, normally newline-terminated.

    Returns:
        The HTML for the line, or the unchanged line when no machine claimed
        it.  An empty string is returned while a possible table header row
        is being buffered.
    """
    # is_normal must be declared global here: without it the assignments
    # below only create a local and parse() never sees the flag change.
    global table_state, orderList_state, block_state, is_code, is_normal
    global temp_table_first_line, temp_table_first_line_str

    result = input

    # ---- Fenced blocks ---------------------------------------------------
    bare_fence = re.compile(r'```(\s)*\n').match(input)
    tagged_fence = (
        len(input) > 4
        and input[0:3] == '```'
        and (input[3:9] == "python" or input[3:6] == "c++" or input[3:4] == "c")
    )
    if bare_fence and block_state == BLOCK.Init:
        # Fence without a language tag opens a plain block.
        result = "<blockquote>"
        block_state = BLOCK.Block
        is_normal = False
    elif tagged_fence and block_state == BLOCK.Init:
        # Fence followed by a recognised language tag opens a code block.
        block_state = BLOCK.Block
        result = "<code><br>"
        is_code = True
        is_normal = False
    elif block_state == BLOCK.Block and input == '```\n':
        # Closing fence: emit the end tag matching how the block was opened.
        result = "</code>" if is_code else "</blockquote>"
        block_state = BLOCK.Init
        is_code = False
        is_normal = False
    elif block_state == BLOCK.Block:
        # Line inside a block: keep its whitespace visible in the HTML.
        result = re.compile(r'[\n\r\v\f\ ]').sub("&nbsp", result)
        result = re.compile(r'\t').sub("&nbsp" * 4, result)
        result = "<span>" + result + "</span><br>"
        is_normal = False

    # ---- Ordered lists ---------------------------------------------------
    # Only a single digit followed by '.' is recognised as an item marker.
    numbered = len(input) > 2 and input[0].isdigit() and input[1] == '.'
    if numbered and orderList_state == ORDERLIST.Init:
        orderList_state = ORDERLIST.List
        result = "<ol><li>" + input[2:] + "</li>"
        is_normal = False
    elif numbered and orderList_state == ORDERLIST.List:
        result = "<li>" + input[2:] + "</li>"
        is_normal = False
    elif orderList_state == ORDERLIST.List:
        # First non-item line after a list closes the list.
        result = "</ol>" + input
        orderList_state = ORDERLIST.Init

    # ---- Tables ----------------------------------------------------------
    row = re.compile(r'^((.+)\|)+((.+))$').match(input)
    if row:
        cells = input.split('|')
        # Drop only the line terminator; slicing off the last character
        # would eat real data on a final line that has no newline.
        cells[-1] = cells[-1].rstrip('\n')
        # Leading / trailing pipes produce empty edge cells; discard them.
        if cells[0] == '':
            cells.pop(0)
        if cells[-1] == '':
            cells.pop(-1)

        if table_state == TABLE.Init:
            # Possible header row: buffer it until the next line is seen.
            table_state = TABLE.Format
            temp_table_first_line = cells
            temp_table_first_line_str = input
            result = ""
        elif table_state == TABLE.Format:
            if all(all_same(cell, '-') for cell in cells):
                # Separator row confirms the buffered line as the header.
                table_state = TABLE.Table
                result = "<table><thead><tr>"
                for header in temp_table_first_line:
                    result += "<th>" + header + "</th>"
                result += "</tr>"
                result += "</thead><tbody>"
                is_normal = False
            else:
                # Not a table after all: emit the buffered line and this one.
                result = temp_table_first_line_str + "<br>" + input
                table_state = TABLE.Init
        elif table_state == TABLE.Table:
            result = "<tr>"
            for cell in cells:
                result += "<td>" + cell + "</td>"
            result += "</tr>"
    elif table_state == TABLE.Table:
        # First non-row line after the table body closes the table.
        table_state = TABLE.Init
        result = "</tbody></table>" + result
    elif table_state == TABLE.Format:
        # NOTE(review): a buffered header candidate followed by a non-table
        # line is never emitted and the state stays Format -- the buffered
        # text looks lost.  Behaviour kept as in the original; confirm intent.
        pass

    return result


# Check whether lst consists solely of the character sym

def all_same(lst, sym):
    """Return True when ``lst`` is empty or is ``sym`` repeated ``len(lst)`` times.

    Args:
        lst: The string to inspect.
        sym: The single-character string every position must equal.
    """
    return not lst or sym * len(lst) == lst


# Handle headings

def handleTitle(s, n):
    """Wrap a Markdown heading line in an HTML heading element.

    NOTE(review): the tag literals were missing from the source text (only
    empty strings remained); ``<hN>`` is a reconstruction -- confirm.

    Args:
        s: The heading line, starting with ``n`` ``#`` characters.
        n: The heading level; the first ``n`` characters of ``s`` are dropped.

    Returns:
        The text after the ``#`` markers wrapped in ``<hN>...</hN>``.
    """
    return "<h{0}>{1}</h{0}>".format(n, s[n:])


# Handle unordered lists

def handleUnorderd(s):
    """Turn an unordered-list line into a one-item HTML list.

    NOTE(review): the tag literals were missing from the source text;
    ``<ul><li>`` is a reconstruction -- confirm.

    Args:
        s: The line, whose first character is the list marker.

    Returns:
        The text after the marker wrapped in ``<ul><li>...</li></ul>``.
    """
    return "<ul><li>" + s[1:] + "</li></ul>"


def tokenTemplate(s, match):
    """Return the regex source that captures text enclosed by ``match``.

    Args:
        s: Unused; kept so existing callers keep working.
        match: One of ``'*'``, ``'~~'`` or ``'**'``.

    Returns:
        A regex whose group 1 is the enclosed text, or ``""`` for any other
        marker.
    """
    # Raw strings: the same patterns as before, without invalid-escape warnings.
    templates = {
        '*': r"\*([^\*]*)\*",
        '~~': r"\~\~([^\~\~]*)\~\~",
        '**': r"\*\*([^\*\*]*)\*\*",
    }
    return templates.get(match, "")


# Handle inline markers such as **, * and ~~

def tokenHandler(s):
    """Replace ``**x**``, ``*x*`` and ``~~x~~`` with ``<b>``, ``<i>``, ``<S>``.

    Also sets the module-level ``need_mathjax`` flag when the line contains
    a ``$...$`` span.

    NOTE(review): the opening-tag literal was missing from the source text;
    only the trailing ``l[j] + ">"`` fragment survived.  The single-letter
    tags agree with the original splice offsets (3 and 5 characters).

    Args:
        s: One line of partially converted text.

    Returns:
        The line with the inline markers converted to HTML tags.
    """
    global need_mathjax

    # '**' must be handled before '*' so bold markers are not read as italics.
    for marker, tag in zip(('**', '*', '~~'), ('b', 'i', 'S')):
        pattern = re.compile(tokenTemplate(s, marker))
        # sub() performs the same left-to-right, non-overlapping replacement
        # as the original manual slicing with running offsets.
        s = pattern.sub("<{0}>\\1</{0}>".format(tag), s)

    # The substitutions never add or remove '$', so one check is enough.
    if re.search(r'\$([^\$]*)\$', s):
        need_mathjax = True
    return s


# Handle links

def link_image(s):
    """Convert Markdown images, hyperlinks and ``x^[n]`` superscripts to HTML.

    NOTE(review): the tag literals were missing from the source text; the
    ``<a>`` / ``<img>`` markup is a reconstruction -- confirm.  ``<sup>`` is
    consistent with the 8-character splice offset of the original.

    Args:
        s: One line of partially converted text.

    Returns:
        The line with images, links and superscripts rewritten.
    """
    # Images first, so the link pattern cannot claim the "[alt](url)" part of
    # "![alt](url)".  Negated character classes instead of greedy ".*" keep
    # several links on one line separate.
    s = re.sub(r'!\[([^\]]*)\]\(([^)]*)\)', r'<img src="\2" alt="\1">', s)
    # The source pattern required a literal backslash before "[", which no
    # ordinary Markdown link has.
    s = re.sub(r'\[([^\]]*)\]\(([^)]*)\)', r'<a href="\2">\1</a>', s)
    # Superscript: the character before "^[...]" is kept, the bracket raised.
    s = re.sub(r'(.)\^\[([^\]]*)\]', r'\1<sup>\2</sup>', s)
    return s


def parse(input):
    """Convert one Markdown line to HTML.

    NOTE(review): the tag literals were missing from the source text; the
    tags below are a reconstruction -- confirm.  The original block-quote
    branch had two further literals around the quoted text whose content
    is lost.

    Args:
        input: One line of the Markdown document.

    Returns:
        The HTML produced for the line.
    """
    global block_state, is_normal
    is_normal = True

    # Let the state machines look at the line first.
    result = test_state(input)
    if block_state == BLOCK.Block:
        # Inside a fenced block nothing else is interpreted.
        return result

    # Headings: count the leading '#', capped at level 6.
    hashes = len(input) - len(input.lstrip('#'))
    if hashes:
        return handleTitle(input, min(hashes, 6))

    # Horizontal rule: a line made only of '-' (at least two).
    if len(input) > 2 and all_same(input[:-1], '-') and input[-1] == '\n':
        return "<hr>"

    # Unordered list item.
    if result[:1] in ('+', '-'):
        result = handleUnorderd(result)
        is_normal = False

    # Block quote: one nesting level per leading '>'.  Counting with lstrip
    # avoids the IndexError the original loop hit on empty or all-'>' input.
    depth = len(input) - len(input.lstrip('>'))
    if depth:
        result = "<blockquote>" * depth + input[depth:] + "</blockquote>" * depth
        is_normal = False

    # Inline markers, then links / images / superscripts.
    result = tokenHandler(result)
    result = link_image(result)

    # Ordinary non-blank lines get an explicit line break.
    blank = re.match(r'^(\s)*$', input)
    if input.endswith("\n") and is_normal and not blank:
        result += "<br>"
    return result


def run(source_file, dest_file, dest_pdf_file, only_pdf):
    """Convert a Markdown file to HTML and optionally to PDF.

    NOTE(review): the HTML scaffold and the MathJax ``<script>`` tags were
    missing from the source text; only the CSS rules and the
    ``MathJax.Hub.Config`` call survived.  The markup written here, and the
    MathJax URL in particular, are a reconstruction -- confirm.

    Args:
        source_file: Path of the Markdown file to read.
        dest_file: Path of the HTML file to write.
        dest_pdf_file: Path of the PDF file; ``""`` means no PDF unless
            ``only_pdf`` is set.
        only_pdf: When true, the HTML goes to a temporary file that is
            removed after the PDF has been produced.

    Raises:
        SystemExit: With code 1 when ``source_file`` has no Markdown suffix.
    """
    _, suffix = os.path.splitext(source_file)
    # ".mkd" was listed without its dot, so such files were always rejected.
    if suffix not in (".md", ".markdown", ".mdown", ".mkd"):
        print('Error: the file should be in markdown format')
        sys.exit(1)

    dest_name = ".~temp~.html" if only_pdf else dest_file

    # Context managers close both files even if parsing raises.  UTF-8 is
    # explicit because the page declares that charset.
    with open(source_file, "r", encoding="utf-8") as src, \
            open(dest_name, "w", encoding="utf-8") as out:
        # Page scaffold: three columns, content in the middle one.
        out.write(
            '<meta charset="utf-8"/>'
            '<style type="text/css">'
            '#wrapper { width: 100%;height:100%; margin: 0; padding: 0;}'
            '#left { float:left; width: 10%; height: 100%; }'
            '#second { float:left; width: 80%;height: 100%; }'
            '#right {float:left; width: 10%; height: 100%; }'
            '</style>'
            '<div id="wrapper"><div id="left"></div><div id="second">'
        )

        # Convert the Markdown file line by line.
        for eachline in src:
            result = parse(eachline)
            if result != "":
                out.write(result)

        out.write('</div><div id="right"></div></div>')

        # Formula support, only when a $...$ span was seen while parsing.
        if need_mathjax:
            # Raw string so the emitted JavaScript really contains '\\('.
            out.write(
                r"""<script type="text/x-mathjax-config">"""
                r"""MathJax.Hub.Config({tex2jax: {inlineMath: [['$','$'], ['\\(','\\)']]}});"""
                r"""</script>"""
                r"""<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.7/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>"""
            )

    # Use the external wkhtmltopdf tool to turn the HTML into a PDF.
    if dest_pdf_file != "" or only_pdf:
        call(["wkhtmltopdf", dest_name, dest_pdf_file])

    # Remove the intermediate HTML file when only the PDF was requested.
    if only_pdf:
        os.remove(dest_name)


# Main entry point

def main():
    """Parse the command line with docopt and run the conversion.

    NOTE(review): the positional-argument keys were emptied in the source
    text (all three read ``args['']``).  ``<sourcefile>`` and
    ``<outputfile>`` are a reconstruction and must match the placeholders
    in the module's docopt usage docstring -- confirm.  docopt also needs
    that usage text in ``__doc__``; no module docstring is visible here.
    """
    args = docopt(__doc__, version=__version__)

    # HTML target: the given output file with --output, else a default name.
    if args['--output']:
        dest_file = args['<outputfile>']
    else:
        dest_file = "translation_result.html"

    # PDF target: only when --print or --Print was given.
    if args['--print'] or args['--Print']:
        dest_pdf_file = args['<outputfile>']
    else:
        dest_pdf_file = ""

    run(args['<sourcefile>'], dest_file, dest_pdf_file, args['--Print'])


if __name__ == "__main__":
    main()

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值