网站链接:https://login.cdpf.org.cn/uams/person.html
将鼠标移到验证码图片上,点击右键并选择“检查”,打开浏览器开发者工具。
在开发者工具中找到验证码图片对应的请求,提取其URL。
爬虫代码:
import requests
import base64
from io import BytesIO
from PIL import Image
import os
# Download 50 captcha images from the kaptcha endpoint and store them as PNG
# files under ./result. Each response is JSON whose "image" field carries the
# picture as base64 text.
# NOTE(review): the "_" query parameter is presumably a cache-busting
# timestamp; it is simply incremented per request here.
base_timestamp = 1698646709676
output_dir = 'result'
# Create the output directory once, up front, instead of re-checking it on
# every loop iteration.
os.makedirs(output_dir, exist_ok=True)
for i in range(50):
    stamp = str(base_timestamp + i)
    url = "https://login.cdpf.org.cn/uams/kaptcha/validateCode?userType=4&_=" + stamp
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(url, timeout=10)
    # Fail with a clear HTTP error rather than trying to parse an error page
    # as JSON further down.
    response.raise_for_status()
    data = response.json()
    image_base64 = data["image"]
    image_data = base64.b64decode(image_base64)
    image = Image.open(BytesIO(image_data))
    # The file name reuses the request's "_" value so every image is unique.
    image.save(os.path.join(output_dir, stamp + '.png'))
爬取结果:
提取验证码数字:
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
import os
def preprocess_image(image, threshold=128, scale=0.5):
    """Prepare a captcha image for OCR: grayscale, binarize, denoise, resize.

    Args:
        image: The PIL image to process. It is not modified; every step
            returns a new image.
        threshold: Gray level (0-255) used for binarization. Levels below it
            map to 0, all others to 1. Defaults to 128.
        scale: Factor applied to both width and height. Defaults to 0.5.

    Returns:
        The binarized, median-filtered and resized PIL image.
    """
    # Convert to grayscale.
    image = image.convert('L')
    # Binarize through a 256-entry lookup table.
    table = [0 if level < threshold else 1 for level in range(256)]
    image = image.point(table, '1')
    # Denoise with a median filter (default settings).
    image = image.filter(ImageFilter.MedianFilter())
    # Resize. Clamp each side to at least 1 pixel so very small inputs do not
    # produce a zero-sized target, which resize() rejects.
    width, height = image.size
    new_width = max(1, int(width * scale))
    new_height = max(1, int(height * scale))
    # Image.ANTIALIAS was removed in Pillow 10; Image.LANCZOS is the same
    # filter under its current name and also exists in older releases.
    image = image.resize((new_width, new_height), Image.LANCZOS)
    return image
def demo(image_dir='result3'):
    """Run digit OCR over every image in a directory and print the results.

    Args:
        image_dir: Directory to scan for .png/.jpg/.jpeg files. Defaults to
            'result3'.
            NOTE(review): the download script in this file writes to
            'result', not 'result3' - confirm which folder is intended.

    Prints one "<file name>: <recognized text>" line per image.
    """
    # Collect the image files in image_dir. Matching is case-insensitive so
    # names like "X.PNG" are included, and sorting makes the output order
    # deterministic.
    image_files = sorted(
        f for f in os.listdir(image_dir)
        if f.lower().endswith(('.png', '.jpg', '.jpeg'))
    )
    # Recognize each image and print the result.
    for image_file in image_files:
        # Open inside a context manager so the file handle is released once
        # the preprocessed copy has been produced.
        with Image.open(os.path.join(image_dir, image_file)) as source:
            image = preprocess_image(source)
        # The whitelist in the config restricts recognition to the digits 0-9.
        # NOTE(review): lang='num' requires a traineddata file named "num" to
        # be installed for Tesseract - confirm it is available.
        text = pytesseract.image_to_string(image, lang='num', config='--psm 6 --oem 3 -c tessedit_char_whitelist=0123456789')
        # Strip the trailing whitespace the OCR output carries so each result
        # stays on a single line.
        print(f"{image_file}: {text.strip()}")
if __name__ == '__main__':
demo(