# Python解析PDF文件中的文字 (Extracting the text of a PDF file with Python)
import io
import re

import PyPDF2
import requests
# Address the PDF is downloaded from (redirects are followed by requests).
url = 'https://www.baidu.com/link?url=29OahkXD4qEv8Yg4mqN6qrrDmISTcpLOjOZJ08fdu15qVLM74jSTCXmCnHGjx2lXOeM4CrWWxB6Y1ya8mtVfXMlxJgFvZxKZiitNWS2AEn7IlfaTRgsluZqHRH4bfNmcWSpMBeISAZUnQja6sibTlq&wd=&eqid=f7fcca8700022d400000000664dd9230'


def normalize_whitespace(text):
    """Flatten extracted text onto a single line.

    Newlines are removed outright (not replaced by a space), then every
    remaining run of whitespace is collapsed to one space.

    Args:
        text: The raw text to clean up.

    Returns:
        The cleaned-up string.
    """
    without_newlines = text.replace('\n', '')
    return re.sub(r'\s+', ' ', without_newlines)


def extract_first_page_text(pdf_bytes):
    """Return the text of the first page of an in-memory PDF document.

    Args:
        pdf_bytes: The complete PDF file content as ``bytes``.

    Returns:
        Whatever text PyPDF2 extracts from page 0.
    """
    # PyPDF2 readers need a path or a seekable binary stream, not raw bytes,
    # so the downloaded payload is wrapped in an in-memory file object.
    stream = io.BytesIO(pdf_bytes)

    # NOTE(review): newer PyPDF2 releases expose PdfReader / .pages /
    # extract_text() and reject the legacy PdfFileReader / getPage /
    # extractText names; older releases only have the legacy names.
    # Prefer the new API when it exists and fall back otherwise.
    if hasattr(PyPDF2, 'PdfReader'):
        return PyPDF2.PdfReader(stream).pages[0].extract_text()
    return PyPDF2.PdfFileReader(stream).getPage(0).extractText()


def main():
    """Download the PDF at ``url`` and print its first page as one line.

    The original script also opened a local ``example.pdf`` and then
    immediately rebound the handle to the downloaded bytes; that handle was
    never used and made the script fail when the file was absent, so it is
    no longer opened.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond in time.
    """
    # requests has no default timeout; without one a stalled server would
    # hang the script forever.
    response = requests.get(url, timeout=30)
    # Fail early on 4xx/5xx instead of feeding an error page to the parser.
    response.raise_for_status()
    print(type(response.content))

    pdfText = normalize_whitespace(extract_first_page_text(response.content))
    print(pdfText)


if __name__ == '__main__':
    main()