import docx
import os
from io import StringIO
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
#from pdfminer.pdfpage import PDFPage
def preprocess_text(text):
    """Normalize whitespace in *text*, then remove unwanted characters.

    Runs of whitespace (including leading and trailing whitespace) are first
    collapsed to single spaces; the result is then handed to
    ``join_name_tag`` for character-level cleanup.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    collapsed = ' '.join(text.split())
    return join_name_tag(collapsed)
def join_name_tag(text):
    """Return *text* with a fixed set of characters deleted.

    The deleted characters are: EM SPACE (U+2003), the space character,
    the private-use code point U+E5E5, REGISTERED SIGN (U+00AE), the "fi"
    ligature (U+FB01), BULLET (U+2022) and MINUS SIGN (U+2212).

    Args:
        text: The string to clean.

    Returns:
        A copy of *text* without any of the characters listed above.
    """
    unwanted = ('\u2003', ' ', '\ue5e5', '\xae', '\ufb01', '\u2022', '\u2212')
    # Mapping a code point to None in a translation table deletes it.
    deletion_table = dict.fromkeys(map(ord, unwanted))
    return text.translate(deletion_table)
# Read paragraphs and tables
def docx_to_text(file_path):
    """Extract the text of a .docx file as a list of cleaned strings.

    Entries from ``doc.paragraphs`` come first, followed by one entry per
    table row in ``doc.tables`` (the row's cell texts joined with spaces).
    Every entry is cleaned with ``preprocess_text``; entries that are empty
    after cleaning are dropped.

    Args:
        file_path: Location of the document, passed to ``docx.Document``.

    Returns:
        A list of non-empty cleaned strings.
    """
    doc = docx.Document(file_path)
    result = []

    for paragraph in doc.paragraphs:
        # preprocess_text already trims and collapses whitespace, so the
        # emptiness check is done on the cleaned value; this keeps entries
        # that consist only of stripped characters out of the result.
        cleaned = preprocess_text(paragraph.text)
        if cleaned:
            result.append(cleaned)

    for table in doc.tables:
        column_count = len(table.columns)
        for row in table.rows:
            try:
                # Fetch the cells once per row instead of once per access.
                # Slicing (rather than indexing) tolerates rows that hold
                # fewer cells than the table has columns.
                cells = row.cells[:column_count]
            except Exception:
                # Best-effort extraction: skip a row whose cells cannot be
                # read, but let KeyboardInterrupt/SystemExit propagate.
                continue

            row_text = ''
            for cell in cells:
                try:
                    cell_text = cell.text
                except Exception:
                    # Best-effort: ignore a single unreadable cell.
                    continue
                # Skip text already present in this row's output.
                # NOTE(review): presumably this collapses merged cells that
                # are reported once per spanned column; being a substring
                # test, it also drops a cell whose text is merely contained
                # in the text gathered so far - confirm that is acceptable.
                if cell_text != '' and cell_text not in row_text:
                    row_text = row_text + cell_text + ' '

            cleaned = preprocess_text(row_text)
            if cleaned:
                result.append(cleaned)

    return result
# Read PDF text
def pdf_to_text(fname, pages=None):
    """Extract the text of a PDF file as a list of cleaned lines.

    The pages are rendered to text with pdfminer, the text is split on
    newlines, and each line is cleaned with ``preprocess_text``. Lines that
    are empty after cleaning are dropped.

    Args:
        fname: Path of the PDF file; opened in binary mode.
        pages: Optional iterable of page numbers forwarded (as a set) to
            ``PDFPage.get_pages``. A falsy value is forwarded as an empty
            set. NOTE(review): pdfminer presumably treats an empty set as
            "all pages" - confirm against the installed version.

    Returns:
        A list of non-empty cleaned text lines.
    """
    # Imported locally so the function does not rely on a module-level
    # PDFPage import being present.
    from pdfminer.pdfpage import PDFPage

    pagenums = set(pages) if pages else set()

    # Context managers / finally guarantee that the input file, the
    # converter and the buffer are released even if parsing fails.
    with StringIO() as output:
        manager = PDFResourceManager()
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer only after the converter has been closed,
        # matching the original call order.
        text = output.getvalue()

    cleaned_lines = (preprocess_text(line.strip()) for line in text.split('\n'))
    return [line for line in cleaned_lines if line != '']
def read_pdf_and_docx(file_path):
    """Extract text from a single .docx or .pdf file.

    The file type is chosen from the (case-insensitive) file extension and
    a progress message is printed before extraction starts.

    Args:
        file_path: Path of the file to read.

    Returns:
        The list produced by ``docx_to_text`` or ``pdf_to_text``, or ``None``
        when the extension is neither ``.docx`` nor ``.pdf`` or when nothing
        was extracted.
    """
    lowered = file_path.lower()
    if lowered.endswith('.docx'):
        print('extracting text from docx: ', file_path)
        lines = docx_to_text(file_path)
    elif lowered.endswith('.pdf'):
        print('extracting text from pdf: ', file_path)
        lines = pdf_to_text(file_path)
    else:
        # Unsupported extension: nothing to extract.
        return None

    if lines is None or len(lines) == 0:
        return None
    return lines
# Disabled alternative, kept inside a bare string literal so it never runs:
# a variant of read_pdf_and_docx that walks dir_path recursively, stores the
# extracted text of every .docx/.pdf file (when it has more than one entry)
# in the `collected` dict keyed by file path, optionally prints progress when
# command_logging is true, and optionally reports each file through
# callback(index, file_path, txt).
# NOTE(review): the indentation inside the string has been lost, so the code
# would have to be re-indented before it could be re-enabled.
'''
#读取文件夹内的所有文件
def read_pdf_and_docx(dir_path, collected=None, command_logging=False, callback=None):
if collected is None:
collected = dict()
for f in os.listdir(dir_path):
file_path = os.path.join(dir_path, f)
if os.path.isfile(file_path):
txt = None
if f.lower().endswith('.docx'):
if command_logging:
print('extracting text from docx: ', file_path)
txt = docx_to_text(file_path)
elif f.lower().endswith('.pdf'):
if command_logging:
print('extracting text from pdf: ', file_path)
txt = pdf_to_text(file_path)
if txt is not None and len(txt) > 1:
if callback is not None:
callback(len(collected), file_path, txt)
collected[file_path] = txt
elif os.path.isdir(file_path):
read_pdf_and_docx(file_path, collected, command_logging, callback)
return collected
'''
if __name__ == '__main__':
    # Manual smoke test: print the extracted text of every .docx file that
    # sits directly inside the sample directory.
    data_dir_path = './test_read_docx'
    for entry in os.listdir(data_dir_path):
        candidate = os.path.join(data_dir_path, entry)
        if not (os.path.isfile(candidate) and entry.lower().endswith('.docx')):
            continue
        print(read_pdf_and_docx(candidate))
# docx / pdf reading
# Latest recommended article published 2024-06-09 10:04:43