主要目的
从政府报告等文档中直接提取标题复制到xmind思维导图中
修改 check 函数中的 list(例如改为 [1,2])即可勾选需要提取的标题格式
遇到的问题
exec 内的赋值是局部变量,不会影响外层函数中的同名变量(因此代码里用 global 变量 tmp 传递匹配结果)
中文标点不在 [\u4e00-\u9fa5] 范围内,需要单独用 Unicode 转义(如 \uff0c 即“,”)加入正则才能匹配
源代码
import re
# First-level heading: Chinese numeral(s), the enumeration comma, then a
# greedy run of CJK characters, selected full-width punctuation and \w.
_HEADING_LEVEL1_RE = re.compile(
    r"[一二三四五六七八九十]+、[\u4e00-\u9fa5\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\w]+"
)


def pattern1(string):
    """Match a first-level heading such as "一、..." at the start of *string*.

    The match is anchored at the beginning of the string and runs greedily
    until the first character that is not part of the allowed heading body.

    Args:
        string: One line of text, normally already stripped by the caller.

    Returns:
        The matched heading text, or ``None`` when the line does not start
        with a first-level heading.

    Side effects:
        Stores the same value in the module-level global ``tmp``; ``check``
        reads the result from there.
    """
    global tmp
    found = _HEADING_LEVEL1_RE.match(string)
    tmp = found.group(0) if found is not None else None
    return tmp
# Second-level heading: a bracketed Chinese numeral followed by the shortest
# run of heading characters that ends with the full stop \u3002.
# The bracket classes accept both ASCII "(" / ")" and the full-width forms
# \uff08 / \uff09; the original classes listed the ASCII bracket twice.
_HEADING_LEVEL2_RE = re.compile(
    r"[(\uff08][一二三四五六七八九十]+[)\uff09]"
    r"[\u4e00-\u9fa5\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\w]+?\u3002"
)


def pattern2(string):
    """Match a second-level heading such as "(一)...。" at the start of *string*.

    The heading must be terminated by a Chinese full stop; the match is lazy,
    so only the first sentence is captured.

    Args:
        string: One line of text, normally already stripped by the caller.

    Returns:
        The matched heading text including the trailing full stop, or
        ``None`` when the line does not start with such a heading.

    Side effects:
        Stores the same value in the module-level global ``tmp``; ``check``
        reads the result from there.
    """
    global tmp
    found = _HEADING_LEVEL2_RE.match(string)
    tmp = found.group(0) if found is not None else None
    return tmp
# Third-level heading: digits, then an enumeration comma or a dot, then the
# shortest run of heading characters that ends with the full stop \u3002.
_HEADING_LEVEL3_RE = re.compile(
    r"\d+[、.][\u4e00-\u9fa5\uff1b\uff0c\uff1a\u201c\u201d\uff08\uff09\u3001\uff1f\u300a\u300b\w]+?\u3002"
)


def pattern3(string):
    """Match a numbered heading such as "1、...。" or "1. ...。" at the start of *string*.

    The heading must be terminated by a Chinese full stop; the match is lazy,
    so only the first sentence is captured.

    Args:
        string: One line of text, normally already stripped by the caller.

    Returns:
        The matched heading text including the trailing full stop, or
        ``None`` when the line does not start with such a heading.

    Side effects:
        Stores the same value in the module-level global ``tmp``; ``check``
        reads the result from there.
    """
    global tmp
    found = _HEADING_LEVEL3_RE.match(string)
    tmp = found.group(0) if found is not None else None
    return tmp
def check(string):
    """Return the first heading found in *string*, trying each enabled format.

    Each enabled ``patternN`` function is called in order. Those functions
    publish their result through the module-level global ``tmp``, so the
    result is read from there after every call.

    Args:
        string: One line of text to test.

    Returns:
        The matched heading text from the first format that matches, or
        ``None`` when no enabled format matches.

    Side effects:
        Prints the heading to stdout when one is found, and leaves the last
        result in the global ``tmp``.
    """
    global tmp
    # Direct dispatch table instead of exec() on a generated source string.
    matchers = {1: pattern1, 2: pattern2, 3: pattern3}
    # Edit this list to choose which heading formats are extracted.
    enabled = [1, 2, 3]
    for number in enabled:
        matchers[number](string)
        if tmp is not None:
            print(tmp)
            return tmp
    return None
# Input: text pasted from the source document. Output: one heading per line.
path = r'E:\huang\Desktop\AAA粘贴处理.txt'
output = r'E:\huang\Desktop\BBB标题输出.txt'

# Both files are managed by `with`, so the output file is closed even if
# reading or matching raises.
# NOTE(review): 'ansi' is an alias of the Windows-only mbcs codec, and the
# input file is read with the platform default encoding, so this script is
# tied to the Windows locale it was written on - confirm before porting.
with open(output, 'w', encoding='ansi') as file, open(path, 'r') as f:
    for line in f:
        title = check(line.strip())
        if title is not None:
            file.write(title.strip() + '\n')
1285

被折叠的 条评论
为什么被折叠?



