LaTeX因其公式排版整洁,受到各类研究者的广泛使用。然而,由于LaTeX排版并非实时可见,常会出现写论文时用Word排版、提交论文时再转成LaTeX排版的情况。又由于Word和LaTeX的排版方式不完全一致,这种转换往往会对研究者的投稿进度造成一定的延误。基于个人经验,Word转LaTeX的一大难题就是公式排版的转化,本文拟借助Python来实现这一方面的自动转化。
第一步:生成测试文档
新建一个txt文件。将包含公式的测试文本从word复制到txt中。测试文本如下:
Let L(q,Q)=1-\bar{L}(q,Q), where \bar{L}(q,Q)=\bar{F}(q) \bar{G}(q/Q).
For any given Q, let l_q (q)=\partial L(q,Q)/\partial q=f(q) \bar{G}(q/Q)+(\bar{F}(q))/Q.
According to assumptions of f(∙) and g(∙), we know f(q)=\bar{F}(q)h(q) and g(q/Q)=\bar{G}(q/Q)k(q/Q).
Further, its hazard function M(q)=(ql_q (q))/(\bar{L}(q,Q) )=H(q)+K(q/Q), and M(q)=H(q)+(K (q/Q))/Q>0. Thus, M(q) is increasing in q≥0.
For any given q, let l_Q (Q)=\partial L(q,Q)/\partial Q= \bar{F}(q)g(q/Q)=-q/Q \bar{F}(q)k(q/Q) \bar{G}(q/Q)= \bar{L}(q,Q)K(q/Q).
Further, its hazard function N(Q)=(Ql_Q (Q))/(\bar{L}(q,Q) )=-K (q/Q), which is increasing in Q≥0.〖
同时,导入可能用到的包,并定义一些后期用到的列表。
import string #关于字符串的处理函数
import os #与操作系统/文件交互的一个接口
# Characters whose presence marks a word as (part of) a formula; consulted by
# delete_empty, add_notation and fractile.
mathnota=["_", "^","+","-","≤","≥","<",">","=","/","∙","¯",")","(","$"]
# Parentheses: add_notation does not count these as formula markers on their own.
brac_nota=[")","("]
# Whole words that are treated as formulas even though they contain no character from mathnota.
extra_nota=["w","q","Q","k","yQ"]
# Trailing punctuation that add_notation keeps outside the closing "$".
punc_nota=[",","."]
第二步:文件清洗
在将文本从Word复制到txt的过程中,有时会出现部分字符异常,例如出现"〖"、"〗"等字符,因此需要事先删除。
def clean(file):  # file is the txt content: an open text file or a list of lines
    """Strip Word-export artefact characters from every line of *file*.

    Args:
        file: An iterable of text lines, e.g. an open text file or a list of
            strings.

    Returns:
        A list with one string per input line.  For each line the trailing
        newline is removed, the line is split on whitespace, the characters
        "〖", "〗" and "■" are deleted from every word, and the words are
        re-joined with single spaces.
    """
    clean_set = ("〖", "〗", "■")  # artefact characters to delete
    new_file = []  # cleaned paragraphs, in input order
    # Iterate the argument itself rather than a global file handle, so the
    # function works on whatever the caller passes in.
    for paragraph in file:
        words = paragraph.strip("\n").split()
        cleaned_words = [
            "".join(ch for ch in word if ch not in clean_set) for word in words
        ]
        new_file.append(" ".join(cleaned_words))
    return new_file
第三步:删除空格造成的公式分割
同样地,在将文本从Word复制到txt的过程中,字词之间可能会出现多余的空格,进而影响到后期的公式识别。因此,为了保证公式识别的准确性,需事先对异常空格进行识别并删除。
def delete_empty(file):
    """Remove the spaces that split one formula into several words.

    Every paragraph is split on whitespace and each word is flagged as "math"
    when it contains a character from ``mathnota`` or is itself listed in
    ``extra_nota``.  A math word that directly follows another math word is
    glued onto it without a separator; every other word is emitted with one
    leading space.

    Args:
        file: Iterable of paragraph strings.

    Returns:
        A list with one string per input paragraph.  A non-empty paragraph
        always starts with a single space, because its first word has no
        predecessor it could be glued to.
    """
    new_file = []
    for paragraph in file:
        words = paragraph.split()
        pieces = []
        # The first word has no predecessor.  Tracking the previous flag
        # explicitly avoids indexing position -1, which would wrap around and
        # compare the first word with the LAST word of the paragraph.
        previous_is_math = False
        for word in words:
            word_is_math = word in extra_nota or any(
                ch in mathnota for ch in word
            )
            if previous_is_math and word_is_math:
                pieces.append(word)  # math after math: glue together
            else:
                pieces.append(" " + word)
            previous_is_math = word_is_math
        new_file.append("".join(pieces))
    return new_file
第四步: 识别公式,并在其前后增添$符号
def add_notation(file):
    """Wrap every formula-like word of every paragraph in ``$...$``.

    A word is wrapped when it is listed in ``extra_nota``, or when it contains
    at least one character that is in ``mathnota`` but not in ``brac_nota``.
    In the second case a final character found in ``punc_nota`` is kept
    outside the closing ``$``.  All other words are copied unchanged.

    Args:
        file: Iterable of paragraph strings.

    Returns:
        A list with one string per input paragraph, its words re-joined with
        single spaces.
    """
    result = []
    for paragraph in file:
        marked = []
        for word in paragraph.split():
            if word in extra_nota:
                marked.append("$" + word + "$")
                continue
            has_operator = any(
                ch in mathnota and ch not in brac_nota for ch in word
            )
            if not has_operator:
                marked.append(word)  # plain text word
            elif word[-1] in punc_nota:
                # keep the sentence punctuation outside the formula
                marked.append("$" + word[0:-1] + "$" + word[-1])
            else:
                marked.append("$" + word + "$")
        result.append(' '.join(marked))
    return result
第五步: 对分数形式进行识别,并调整代码
def select_value(index_left, index_right, place_slash, judge):
    """Walk two lists of parenthesis positions and return the selected one.

    The two lists are stepped through alternately while the current ")"
    position is greater than the current "(" position.

    Args:
        index_left: Positions of "(" characters; must be non-empty.
        index_right: Positions of ")" characters; must be non-empty.
        place_slash: Position of the "/" being processed.  Not used by the
            search; kept so that existing calls keep working.
        judge: ``2`` walks both lists from their last element towards the
            first and returns a value taken from ``index_left``.  Any other
            value walks from the first element towards the last and returns
            a value taken from ``index_right``.

    Returns:
        The selected position (an element of ``index_left`` when
        ``judge == 2``, otherwise an element of ``index_right``).

    Raises:
        IndexError: If ``index_left`` or ``index_right`` is empty.
    """
    if judge == 2:  # backward walk, e.g. the ")" sits right before the slash
        right_position = -1
        left_position = -1
        right_value = index_right[right_position]
        left_value = index_left[left_position]
        while right_value > left_value:
            if right_position == left_position:
                if abs(right_position) == len(index_right):
                    # already at the first ")": nothing further to step to
                    right_value = index_right[0]
                    break
                right_position -= 1
                right_value = index_right[right_position]
            else:
                if abs(left_position) == len(index_left):
                    # already at the first "(": nothing further to step to
                    left_value = index_left[0]
                    break
                left_position -= 1
                left_value = index_left[left_position]
        return left_value

    # forward walk
    right_position = 0
    left_position = 0
    right_value = index_right[right_position]
    left_value = index_left[left_position]
    while right_value > left_value:
        if right_position == left_position:
            # Stop on the last valid index.  Comparing the index against
            # len() itself can never match before the increment, which made
            # the walk run off the end of the list.
            if left_position >= len(index_left) - 1:
                left_value = index_left[-1]
                break
            left_position += 1
            left_value = index_left[left_position]
        else:
            if right_position >= len(index_right) - 1:
                right_value = index_right[-1]
                break
            right_position += 1
            right_value = index_right[right_position]
    return right_value
def fractile(file):
# Purpose: for every whitespace-separated word of every paragraph in `file`,
# rewrite "/" divisions by splicing "\frac{", "}" and "{" fragments into the
# word's character list (new_word), then rebuild the words and paragraphs.
# Parameter: file -- iterable of paragraph strings (each one is .split()).
# Returns: new_file, the list of re-joined paragraph strings.
# NOTE(review): the block indentation of this listing has been lost, so the exact
# nesting of the branches below cannot be confirmed from the text alone; restore
# it from the original script before running.
# NOTE(review): the print(...) calls look like leftover debug tracing.
# NOTE(review): num_left_parenthesis, num_right_parenthesis, num_left, num_right,
# state_right_paranthesis, state_left_paranthesis, index_math and num_slash are
# assigned but never read inside this function.
#
# In principle one would first have to decide whether parentheses are redundant, e.g. q/Q vs q/(Q).
# However, when Word formulas are pasted into txt, no parentheses are added automatically
# if the parts above/below the "/" contain no operator symbol,
# so the parentheses found in the txt can be assumed to be non-redundant.
#math_index.append(0) # flag per word: 0 = not a formula, 1 = formula
new_file=[]
for index_paragraphs, content_paragraphs in enumerate(file):
content_paragraphs = content_paragraphs.split() # split the paragraph into words, using whitespace as the separator
new_paragraph=[]
for index_word, content_word in enumerate(content_paragraphs): # process each word on its own
num_left_parenthesis=[] # meant for handling division
num_right_parenthesis=[]
num_left=0
num_right=0
state_right_paranthesis=0 # no change
state_left_paranthesis=0 # for the second half only; 0 = no change
#divide=0 # whether a fraction form occurs
new_word=[]
index_left_parenthesis=[]
index_right_parenthesis=[]
index_slash=[]
index_math=[]
num_slash=1
for index_charc,word_charc in enumerate(content_word): # inspect every character of the word
new_word.append(word_charc)
if word_charc=="(":
index_left_parenthesis.append(index_charc)
elif word_charc==")":
index_right_parenthesis.append(index_charc)
elif word_charc=="/":
index_slash.append(index_charc)
num_slash=num_slash+1
elif word_charc in mathnota: # positions of all operator symbols except the ones handled above
index_math.append(index_charc)
if index_slash:# a non-empty list is truthy, i.e. the word contains at least one "/"
#index_left_parenthesis=np.array(index_left_parenthesis).reshape(len(index_slash),int((len(index_left_parenthesis))/2))
#index_right_parenthesis=np.array(index_right_parenthesis).reshape(len(index_slash),int((len(index_right_parenthesis))/2))
print(index_left_parenthesis)
print(index_right_parenthesis)
print(index_slash)
#print(new_word)
for item,item_slash in enumerate(index_slash):
print ("60=",item_slash)
left_upper=0
left_lower=left_upper
print("1aa")
print(index_left_parenthesis)
print(left_upper)
# advance left_upper while the "(" it points at lies before the slash (stops at the last "(")
while index_left_parenthesis[left_upper]<item_slash:
if left_upper<len(index_left_parenthesis)-1:
left_upper=left_upper+1
#print(left_upper)
#print("1bb")
else:
break
print(left_upper)
#print("cc")
right_lower=left_lower
right_upper=left_upper
if new_word[item_slash-1]==")": # a ")" sits right before the slash; need to work out what kind it is
#print(index_left_parenthesis)
#print(left_upper)
#print(index_right_parenthesis)
# case: no math symbol inside the parentheses
count_move=1
count_math=0
count_right=0
# scan backwards from the slash until a "(" is reached, counting math symbols and ")" on the way
while new_word[item_slash-count_move]!="(":
#print(new_word[item_slash-count_move])
count_move=count_move+1
if new_word[item_slash-count_move] in mathnota and new_word[item_slash-count_move]!="(":
count_math=count_math+1
if new_word[item_slash-count_move]==")":
count_right=count_right+1
if count_right>=2:
count_math=count_math+1
break
#print(new_word[item_slash-count_move])
print(count_math)
print(count_move)
print(count_right)
if count_math==0:# no math symbol inside the parentheses
#print(item_slash)
#print("abc")
count_move=0# reset
# scan backwards for "=" (or the start of the word) and open the fraction there
while new_word[item_slash-count_move]!="=":
count_move=count_move+1
if (item_slash-count_move)<=0:
break
#print(new_word[item_slash-count_move])
left_insert=item_slash-count_move
print("zzz")
#print(new_word[left_insert])
#print(left_insert)
#print(item_slash)
#print(count_move)
new_word[left_insert]="=\\frac{"
new_word[item_slash]="}"
print(new_word)
else:# the parentheses do contain a math symbol
print("bcd")
print(len(index_left_parenthesis))
if len(index_left_parenthesis)==1:
left_insert=index_left_parenthesis[0]
print("1111")
else:
print("2222")
print(index_left_parenthesis[left_lower:left_upper])
print(left_lower)
print(left_upper)
print(index_right_parenthesis[right_lower:right_upper])
left_insert=select_value(index_left_parenthesis[left_lower:left_upper],index_right_parenthesis[right_lower:right_upper],item_slash,2)
print(left_insert)
new_word[left_insert]="\\frac{"
#print(new_word[item_slash-1])
new_word[item_slash-1]="}"
#print(new_word[item_slash])
new_word[item_slash]=""
print (new_word)
# second half: the part after the slash
if new_word[item_slash+1]=="(":
new_word[item_slash+1]="{"
#print(index_left_parenthesis)
#print(left_upper)
#print(len(index_left_parenthesis))
if len(index_left_parenthesis)==1:
right_insert=index_right_parenthesis[0]
else:
if left_upper==len(index_left_parenthesis)-1:
right_insert=index_right_parenthesis[right_upper]
else:
right_insert=select_value(index_left_parenthesis[left_upper:len(index_left_parenthesis)],index_right_parenthesis[right_upper:len(index_right_parenthesis)],item_slash,4)
#print (right_insert)
new_word[right_insert]=")}"
else: # no "(" right after the slash
print("det")
print(new_word)
print(new_word[item_slash])
print(item_slash)
new_word[item_slash]=new_word[item_slash]+"{"
count_move=0
# scan forwards until an entry that is in mathnota (or the end of the word) and close the brace there
while new_word[item_slash+count_move] not in mathnota:
print(new_word[item_slash+count_move])
if (item_slash+count_move)>=len(new_word)-1:
break
count_move=count_move+1
#print("fff")
#print(new_word[item_slash+count_move])
#print(item_slash+count_move)
#print(count_move)
new_word[item_slash+count_move]="}"+new_word[item_slash+count_move]
else: # no ")" right before the slash, e.g. F(q/(Q)), 3+q/Q, q/Q
# only the position of the nearest "(" needs to be determined
# (the loop below stops at the nearest entry that is in mathnota)
count_move=1
while new_word[item_slash-count_move] not in mathnota:
#print(new_word[item_slash-count_move])
count_move=count_move+1
left_insert=item_slash-count_move
print("ddef")
new_word[left_insert]="(\\frac{"
new_word[item_slash]="}"
#print("ee")
print(new_word)
# second half: the part after the slash
if new_word[item_slash+1]=="(":
print("xxx")
new_word[item_slash+1]="{"
#print(index_left_parenthesis)
#print(left_upper)
#print(len(index_left_parenthesis))
if len(index_left_parenthesis)==1:
right_insert=index_right_parenthesis[0]
else:
if left_upper==len(index_left_parenthesis)-1:
right_insert=index_right_parenthesis[right_upper]
else:
right_insert=select_value(index_left_parenthesis[left_upper:len(index_left_parenthesis)],index_right_parenthesis[right_upper:len(index_right_parenthesis)],item_slash,4)
#print (right_insert)
new_word[right_insert]=")}"
else: # no "(" right after the slash
print("yyy")
new_word[item_slash]=new_word[item_slash]+"{"
count_move=0
# NOTE(review): unlike the similar loop in the branch above, this loop has no end-of-word guard
while new_word[item_slash+count_move] not in mathnota:
count_move=count_move+1
#if new_word[item_slash+count_move]=="Q":
#break
print("fff")
print(new_word[item_slash])
print(new_word[item_slash+count_move])
print(item_slash+count_move)
print(count_move)
new_word[item_slash+count_move]="}"+new_word[item_slash+count_move]
# q=f(q) \bar{G}(q/Q)+(\bar{F}(q))/Q g(q/Q)
## no else branch
# rebuild the word from its (possibly rewritten) character list
new_word=''.join(new_word)
new_paragraph.append(new_word)
new_paragraph=' '.join(new_paragraph)
#print(new_paragraph)
new_file.append(new_paragraph)
return new_file
第六步:对特定符号进行代码修正
def replace(file):  # file is a list of paragraph strings
    """Replace Unicode math symbols with their LaTeX macros.

    Each paragraph is split on whitespace, the substitutions are applied to
    every word, and the words are re-joined with single spaces.

    Args:
        file: Iterable of paragraph strings.

    Returns:
        A list with one converted string per input paragraph.
    """
    # Raw strings keep the backslashes literal: in a normal string "\a" and
    # "\b" are the BEL and backspace escapes, which would corrupt \alpha and
    # \beta.  Every macro ends with a space so that it cannot fuse with the
    # characters that follow it.
    # The bare "\partial" rule runs first, so the "\partial " produced for
    # "∂" is not scanned again and does not pick up a second space.
    substitutions = (
        (r"\partial", r"\partial "),
        ("∈", r"\in "),
        ("≥", r"\geq "),
        ("≤", r"\leq "),
        ("∂", r"\partial "),
        ("∞", r"\infty "),
        ("α", r"\alpha "),
        ("β", r"\beta "),
    )
    new_file = []
    for paragraph in file:
        new_paragraph = []
        for word in paragraph.split():
            for before, after in substitutions:
                word = word.replace(before, after)
            new_paragraph.append(word)
        new_file.append(" ".join(new_paragraph))
    return new_file
第七步:生成主程序
# Main program: read before.txt, run the conversion steps in order and write
# the result to after.txt.
# f1 is the original text to convert, bs is the generated output document.
with open('before.txt', 'r', encoding="utf-8") as f1, open('after.txt', 'wb') as bs:
    f2 = clean(f1)  # strip the artefact characters
    f3 = delete_empty(f2)  # remove the spaces inside formulas
    f4 = add_notation(f3)  # add the $ delimiters
    f5 = fractile(f4)  # rewrite a/b divisions as \frac
    f6 = replace(f5)  # replace the math symbols
    # clean() strips the newline of every paragraph, so the paragraphs have to
    # be joined with "\n"; joining with "" would run them all into one line.
    f7 = "\n".join(f6).encode("utf-8")
    bs.write(f7)
    # No explicit close(): the with-statement closes both files on exit.
题外话:由于并未大规模测试代码,所以其适用性存在问题(百分之百有问题)。当然,还有两种更简单的排版策略:其一,直接花钱买MathType插件,该插件具备一键转换TeX功能;其二,让学生帮忙改,别问我怎么知道的。