比较简单的实现,代码用到了 PyPDF2:函数把提取出的文本写入文本文件,并把统计结果(词数和页数)以 JSON 结构保存到同名的 .json 文件中。
import json
import os

import PyPDF2
def convert_pdf_to_text(pdf_file, text_file):
    """Extract the text of a PDF, save it, and record simple statistics.

    The text of every page is concatenated in page order (no separator is
    inserted between pages) and written to ``text_file`` as UTF-8. A word
    count and a page count are then written as JSON to a sibling file that
    has the same base name as ``text_file`` and a ``.json`` extension.

    Args:
        pdf_file: Path of the PDF file to read.
        text_file: Path of the text file to write.

    Returns:
        A dict with the keys ``'word_count'`` (number of
        whitespace-separated tokens in the extracted text) and
        ``'page_count'`` (number of pages in the PDF). The same data is
        written to the JSON file.
    """
    # Everything that touches the PDF, including counting its pages, is done
    # while the underlying stream is still open.
    with open(pdf_file, 'rb') as pdf_stream:
        pdf_reader = PyPDF2.PdfReader(pdf_stream)
        pages = pdf_reader.pages
        page_count = len(pages)
        # ``or ''`` tolerates a falsy (e.g. None) result from extract_text().
        text = ''.join(page.extract_text() or '' for page in pages)

    # Count once over the complete text. Counting the growing buffer after
    # every page would re-count all earlier pages on each iteration.
    word_count = len(text.split())

    with open(text_file, 'w', encoding='utf-8') as text_out:
        text_out.write(text)

    counts = {
        'word_count': word_count,
        'page_count': page_count,
    }

    # Swap only the final extension for ".json". A plain
    # str.replace('.txt', '.json') leaves the path unchanged when there is no
    # ".txt" in it (so the JSON would overwrite the text file) and also
    # rewrites ".txt" occurring inside directory names.
    text_path = os.fspath(text_file)
    base, ext = os.path.splitext(text_path)
    if ext.lower() == '.json':
        # The text file itself already ends in ".json"; append instead of
        # replacing so the two outputs never share a path.
        json_file = text_path + '.json'
    else:
        json_file = base + '.json'

    with open(json_file, 'w', encoding='utf-8') as json_out:
        json.dump(counts, json_out)

    return counts