from docx import Document
import re
# Extract chapter/section headings (e.g. "第一章 总则") from a Word document.
#
# The script loads the .docx file, collects the text of every paragraph, and
# keeps the paragraphs that consist solely of a "第…章" / "第…节" heading.

# Path of the Word file to process.
doc = Document('D:\\lawText\\text.docx')
extracted_titles = []

# Regex approach: a heading is "第" + one or more digits / Chinese numerals +
# "章" or "节", optionally followed by CJK characters and whitespace up to the
# end of the paragraph.
# NOTE(review): the numeral class does not contain "零", so a heading such as
# "第一百零一章" is not matched -- confirm whether that matters for the input.
pattern = re.compile(r'^第[\d一二三四五六七八九十十一百千万亿]+[章节][\u4e00-\u9fa5 \s]*$')

# --- Read the file ---
# Collect the text of every paragraph so the content can be inspected.
full_text = [para.text for para in doc.paragraphs]
# Bare expression: only displays a value in an interactive session (when it is
# the last statement of a cell); it has no effect when run as a script.
full_text

# --- Extract the chapter/section titles ---
for para in doc.paragraphs:
    # pattern.match anchors at the start of the paragraph text.
    match = pattern.match(para.text)
    if match:
        # The paragraph is a heading: record the matched text.
        extracted_titles.append(match.group())

# Print one title per line, then the whole list.
for title in extracted_titles:
    print(title)
print(extracted_titles)