通过正则表达式来识别文章中的标题:
以“参考文献(References)”所在行为截止,遇到该行即停止识别。
import re
# Heading detection: print the numbered heading lines of the article body.

# Scanning stops at the first line that contains this marker.
_REFERENCES_MARKER = '参考文献(References)'
# A candidate heading starts with a digit, optionally preceded by "(",
# e.g. "1 ...", "2.1 ..." or "(3) ...".  Note that any line starting this
# way is reported; no further structure is checked.
_HEADING_RE = re.compile(r'\(?\d')
# Lines shorter than this are ignored (this also excludes empty lines and
# a bare form-feed character left over from page breaks).
_MIN_HEADING_LEN = 3


def _iter_headings(lines):
    """Yield candidate heading lines from *lines* until the references marker.

    Args:
        lines: An iterable of text lines (trailing newlines allowed).

    Yields:
        Each line, with surrounding newline characters removed, that is at
        least ``_MIN_HEADING_LEN`` characters long and starts with a digit
        or with "(" followed by a digit.
    """
    for raw in lines:
        if _REFERENCES_MARKER in raw:
            # Everything from the references section onwards is skipped.
            return
        line = raw.strip('\n')
        if len(line) >= _MIN_HEADING_LEN and _HEADING_RE.match(line):
            yield line


def main_read_txt(url="txt\\zhengwen.txt"):
    """Print the numbered headings found in a UTF-8 text file.

    The file is read line by line; every line that looks like a heading
    (see ``_iter_headings``) is printed, and reading stops at the first line
    containing the references marker.

    Args:
        url: Path of the text file to scan.  The default is the original
            hard-coded relative path; it uses a backslash separator, so it
            only refers to a file inside a ``txt`` directory on Windows.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Iterate over the file object directly so that nothing after the
    # references marker has to be read into memory.
    with open(url, "r", encoding='utf-8') as f:
        for heading in _iter_headings(f):
            print(heading)
# Script entry point: scan the default input file only when this module is
# executed directly, not when it is imported.
if __name__ == "__main__":
    main_read_txt()