# -*- coding: utf-8 -*-
import fitz
import hashlib
import requests as req
import pytesseract
from PIL import Image
import os
# Working title of the document. Its MD5 hex digest (not the raw title) names
# the downloaded PDF, so the file name is always 32 hex characters no matter
# which characters the title contains.
title = 'aaaa'

# Directory that receives the downloaded PDF. makedirs(exist_ok=True) replaces
# the isdir()/mkdir() pair, which could fail if the directory appeared between
# the check and the creation.
files_path = './xxxx_test'
os.makedirs(files_path, exist_ok=True)

rename_title = hashlib.md5(title.encode('utf-8'))
title_md5 = rename_title.hexdigest()

# Final location of the PDF: <files_path>/<md5-of-title>.pdf
the_path = os.path.join(files_path, f'{title_md5}.pdf')
def download_file(link, the_path):
    """Download ``link`` and write the response body to ``the_path``.

    The body is streamed to disk in 1 KiB chunks so the whole file is never
    held in memory.

    Args:
        link: URL fetched with an HTTP GET.
        the_path: Destination file path; opened in binary write mode, so an
            existing file is overwritten.

    Returns:
        True when the full body was written, False when ``requests`` reported
        a failure (connection problem, timeout, or HTTP error status).

    Raises:
        OSError: If the destination file cannot be opened or written; file
            system errors are not caught here.
    """
    chunk_size = 1024
    try:
        # Using the response as a context manager releases the connection
        # even when streaming is interrupted part-way through.
        with req.get(link, stream=True, timeout=500) as response:
            # Without this check a 4xx/5xx error page would be saved as the
            # "PDF" and reported as a successful download.
            response.raise_for_status()
            with open(the_path, "wb") as file:
                for data in response.iter_content(chunk_size=chunk_size):
                    file.write(data)
    except req.RequestException:
        print('下载文件失败')
        return False
    return True
def convert_img(the_path, png_dir):
    """Render every page of the PDF at ``the_path`` to a PNG in ``png_dir``.

    Pages are saved in document order as ``1.png``, ``2.png``, ... and are
    rendered at 2x zoom on both axes without rotation or alpha channel.

    Args:
        the_path: Path of the PDF to open with PyMuPDF (``fitz``).
        png_dir: Existing directory that receives the page images.
    """
    # The transform is identical for every page, so build it once.
    rotate = 0
    zoom_x = 2.0
    zoom_y = 2.0
    trans = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)

    # The context manager closes the document (and its file handle) even if
    # rendering a page fails.
    with fitz.open(the_path) as doc:
        pdf_name = os.path.splitext(the_path)[0]
        print(pdf_name)

        # len(doc) is the page count; unlike the camelCase ``pageCount``
        # attribute it is not affected by PyMuPDF's snake_case renaming.
        page_total = len(doc)
        print(page_total)

        for i in range(1, page_total + 1):
            print(i)
            png_path = os.path.join(png_dir, f"{i}.png")
            page = doc[i - 1]  # pages are 0-based, file names are 1-based
            pm = page.get_pixmap(matrix=trans, alpha=False)
            pm.save(png_path)
def get_pdf_txt(png_dir):
    """OCR the numbered page images in ``png_dir`` and return their text.

    Every file named ``<number>.png`` is read in numeric order, recognised
    with pytesseract using the ``chi_sim`` language data, and deleted once
    its text has been extracted. Other files in the directory are ignored.
    The combined text is printed and returned.

    Args:
        png_dir: Directory containing the page images.

    Returns:
        The recognised text of all pages concatenated in page order; an empty
        string when the directory holds no page images.
    """
    # Select only "<number>.png" files so a stray file in the directory does
    # not inflate the page count and trigger a lookup of a missing image.
    page_files = []
    for name in os.listdir(png_dir):
        stem, ext = os.path.splitext(name)
        if ext.lower() == ".png" and stem.isdecimal():
            page_files.append(name)
    # Numeric sort keeps 10.png after 9.png (a plain string sort would not).
    page_files.sort(key=lambda name: int(os.path.splitext(name)[0]))

    # NOTE: if the tesseract executable is not on PATH, set
    # pytesseract.pytesseract.tesseract_cmd before calling this function.
    parts = []
    for png_name in page_files:
        png_path = os.path.join(png_dir, png_name)
        # Close the image before deleting it; an open handle blocks removal
        # on some platforms.
        with Image.open(png_path) as image:
            parts.append(pytesseract.image_to_string(image, lang='chi_sim'))
        # Remove the file inside png_dir; the bare file name would be
        # resolved against the current working directory instead.
        os.remove(png_path)

    text = ''.join(parts)
    print(text)
    return text
# Source URL of the PDF (placeholder value).
url = 'xxxxxxxx'
state = download_file(url, the_path)

# Page images are staged in ./xxxx_png/<title>/. makedirs creates both
# directory levels in one call and tolerates directories that already exist.
png_dir = f'./xxxx_png/{title}'
os.makedirs(png_dir, exist_ok=True)

# Only convert and OCR when the download succeeded; otherwise the PDF at
# the_path is missing or stale and opening it would fail or give wrong text.
if state:
    convert_img(the_path, png_dir)
    get_pdf_txt(png_dir)
根据 URL 链接下载 PDF，并提取图像中的文本
最新推荐文章于 2023-07-17 11:49:34 发布
该代码实现了从 URL 下载 PDF 文件、用 PyMuPDF（fitz）将每一页渲染为 PNG 图像，再使用 pytesseract 进行简体中文 OCR 识别的流程。下载的文件以标题的 MD5 哈希值（通过 hashlib 计算）命名，从而避免标题中的特殊字符影响文件名；整个过程涉及文件操作、网络请求和图像处理技术。
摘要由CSDN通过智能技术生成