Python3 + Selenium 获取某东平台商家信息(重点:验证码识别)。此处采用网页截图获取验证码,再交由第三方验证码识别服务识别。

此处验证码识别采用“页面截图 + 第三方识别”的方式,第三方平台为冰拓,网址:http://www.bingtop.com
价格不是很贵,识别速度也很快,大家可以试试,或者自行购买其他平台的服务。
我个人也自行编写了文字识别模块,但识别率较低,正在改进。

# -*- coding: utf-8 -*-
"""
Created on Tue Dec 24 16:14:03 2019

@author: Administrator
"""
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 24 16:14:03 2019

@author: Administrator
"""
from selenium.webdriver.common.keys import Keys
import time
import csv
from selenium import webdriver
import random
import base64
import requests
import json
from PIL import Image
import codecs 
import os  
# User settings: fill in everything in this section to match your own setup.
# Region to filter by and the product keyword to search for.
address, goods = "北京", "毯子"
# Account name and password for the third-party captcha recognition service.
api_username, api_password = "130*******", "w*********"
# File the full-page screenshot is saved to.
yemian_img = "C:/Users/Administrator/pachong/picture/screenshot.png"
# File the captcha image cropped out of the screenshot is saved to.
yanzheng_img = "C:/Users/Administrator/pachong/picture/003.png"
'''
上方模块请按实际情况填写
'''

# Browser setup: a headless Firefox instance that carries proxy preferences.
from selenium.webdriver.firefox.options import Options

# Run the browser silently (no visible window).
options = Options()
options.add_argument('--headless')

# Proxy preferences; the port value is passed as an int.
profile = webdriver.FirefoxProfile()
for _pref_name, _pref_value in (
    ('network.proxy.type', 1),
    ('network.proxy.http', '222.73.130.111'),
    ('network.proxy.http_port', 888),
):
    profile.set_preference(_pref_name, _pref_value)
profile.update_preferences()

driver = webdriver.Firefox(
    executable_path="geckodriver",
    options=options,
    firefox_profile=profile,
)

# The CSV is opened through codecs as UTF-8 (append mode) so that Chinese
# text is not garbled when written.
f = codecs.open('123suning.csv', 'a', encoding='utf-8')
csv_writer = csv.writer(f)

# Captcha recognition: screenshot the page, crop the captcha out of it and
# send it to the third-party Bingtop service (http://www.bingtop.com).
def erweima(nub, yanzheng_img=yanzheng_img, yemian_img=yemian_img,
            api_username=api_username, api_password=api_password,
            crop_box=(671, 370, 766, 396), timeout=30):
    """Return the text of the captcha shown on the current browser page.

    The page is screenshotted with the module-level ``driver``, the captcha
    region is cropped out and posted (base64-encoded) to the recognition
    service. Both temporary image files are removed before returning,
    whether or not recognition succeeded.

    Args:
        nub: Counter value; only used to label the printed result.
        yanzheng_img: Path the cropped captcha image is written to.
        yemian_img: Path the full-page screenshot is written to.
        api_username: Account name for the recognition service.
        api_password: Password for the recognition service.
        crop_box: ``(left, upper, right, lower)`` pixel box of the captcha
            inside the screenshot. NOTE(review): the default presumably
            matches the 1700x900 window set by the calling script - confirm
            if the window size changes.
        timeout: Seconds to wait for the recognition service before giving up.

    Returns:
        The recognised captcha text.

    Raises:
        requests.RequestException: On network failure, timeout or a non-2xx
            HTTP status.
        RuntimeError: If the response is not JSON or has no recognition text.
    """
    # NOTE(review): the credentials travel over plain HTTP to this endpoint.
    api_post_url = "http://www.bingtop.com/ocr/upload/"
    try:
        driver.save_screenshot(yemian_img)
        # Crop the captcha out of the browser screenshot.
        with Image.open(yemian_img) as page_image:
            with page_image.crop(crop_box) as captcha_image:
                captcha_image.save(yanzheng_img)
        print("裁剪成功")

        # Third-party recognition request.
        with open(yanzheng_img, 'rb') as pic_file:
            img64 = base64.b64encode(pic_file.read())
        params = {
            "username": str(api_username),
            "password": str(api_password),
            "captchaData": img64,
            "captchaType": 1000,
        }
        # Without a timeout a stalled service would block the crawl forever.
        response = requests.post(api_post_url, data=params, timeout=timeout)
        try:
            response.raise_for_status()
            response_text = response.text
        finally:
            response.close()
    finally:
        # Clean up on every path; a file may not exist if an earlier step failed.
        for temp_path in (yemian_img, yanzheng_img):
            try:
                os.remove(temp_path)
            except FileNotFoundError:
                pass

    # Expected shape:
    # {"code":0, "message":"", "data":{"captchaId":"...","recognition":"RESULT"}}
    try:
        dictdata = json.loads(response_text)
    except ValueError as err:
        raise RuntimeError(
            "captcha service returned non-JSON data: %r" % (response_text,)
        ) from err
    data = dictdata.get('data') if isinstance(dictdata, dict) else None
    yzMa = data.get('recognition') if isinstance(data, dict) else None
    if not yzMa:
        raise RuntimeError("captcha recognition failed: %r" % (dictdata,))
    print(nub, "验证码:", yzMa)
    return yzMa
    

# Page interaction.
driver.set_window_size(1700, 900)

# Product search: open the JD home page, type the keyword and submit it.
driver.get("https://www.jd.com/")
driver.find_element_by_id("key").send_keys(goods)
driver.find_element_by_class_name("button").click()
time.sleep(random.randint(2,4))

# Region: type it into the third 'input-txt' box and press the first
# 'btn btn-default' link.
driver.find_element_by_xpath("(//input[@class='input-txt'])[3]").send_keys(address)
driver.find_element_by_xpath("(//a[@class='btn btn-default'])[1]").click()
time.sleep(random.randint(2,4))

# Counter used only for numbering the shops processed later.
nub = 1

# Walk through the result pages and collect the licence-page URL of every shop.
from selenium.common.exceptions import WebDriverException

max_pages = 100
# A shop link looks like <shop_prefix><id>...; its licence page is
# <licence_prefix><id>... (same tail, different prefix).
shop_prefix = "https://mall.jd.com/index-"
licence_prefix = "https://mall.jd.com/showLicence-"

shop_set = set()
for i in range(max_pages):
    # Scroll to the bottom so that all products on the page get loaded.
    driver.find_element_by_tag_name('body').send_keys(Keys.END)
    time.sleep(random.randint(3,4))
    shops = driver.find_elements_by_class_name("curr-shop")
    for shop in shops:
        shop_url = shop.get_attribute('href')
        # A missing href or an unexpected URL shape cannot be mapped to a
        # licence page; skip it instead of building a broken URL.
        if not shop_url or not shop_url.startswith(shop_prefix):
            print(i, "-", shop_url, "skipped")
            continue
        licence_url = licence_prefix + shop_url[len(shop_prefix):]
        shop_set.add(licence_url)
        print(i,"-",shop_url,licence_url)
    # Last allowed page: stop without clicking "next" again.
    if i == max_pages - 1:
        break
    try:
        print('翻页')
        driver.find_element_by_class_name("pn-next").click()
    except WebDriverException:
        # No usable "next page" link: this was the last page of results.
        print('找不到下一页')
        break

# Visit every collected licence page, solve its captcha and write the
# company details to the CSV file.
max_attempts = 3
try:
    for nub, shop2 in enumerate(shop_set, start=1):
        driver.get(shop2)
        time.sleep(random.randint(2,3))
        # Retry the captcha a limited number of times, then move on.
        for attempt in range(max_attempts):
            try:
                # Recognise the captcha and submit it.
                yzMa = erweima(nub)
                input_ma = driver.find_element_by_id("verifyCode")
                # Drop whatever a previous failed attempt left in the box.
                input_ma.clear()
                input_ma.send_keys(yzMa)
                driver.find_element_by_class_name("btn").click()
                time.sleep(random.randint(2,3))
                # The lookups below raise if the page is still locked, which
                # is how a failed captcha is detected.
                driver.find_element_by_class_name("jScore").text
                company = driver.find_elements_by_class_name("noBorder")
                shop_name = company[10].text
                c_name = company[2].text
                name = company[4].text
                shop_address = company[5].text
            except Exception as exc:
                # Retry boundary: one failed attempt (bad recognition, network
                # error, missing element) must not abort the whole crawl.
                print("二维码解锁失败", exc)
            else:
                print(nub,"--",c_name,"--",name,"--",shop_address)
                csv_writer.writerow([shop_name,c_name,name,shop_address])
                break
finally:
    # Always release the CSV handle and the browser, even after an error.
    f.close()
    driver.quit()

print("结束")

以上代码仅供交流学习使用!

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
对于验证码识别,可以结合 Python 和 Selenium 来实现。以下是一个简单的示例代码:

```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from PIL import Image
from pytesseract import image_to_string

# 实例化浏览器驱动
driver = webdriver.Chrome('/path/to/chromedriver')

# 打开目标网页
driver.get('https://example.com')

# 等待验证码加载完成
WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.XPATH, '//*[@id="captcha-image"]')))

# 定位验证码图片元素
captcha_img = driver.find_element(By.XPATH, '//*[@id="captcha-image"]')

# 获取验证码图片的位置和大小
location = captcha_img.location
size = captcha_img.size

# 截取整个页面的屏幕截图
driver.save_screenshot('/path/to/screenshot.png')

# 根据验证码图片的位置和大小,裁剪出验证码图片
left = int(location['x'])
top = int(location['y'])
right = int(location['x'] + size['width'])
bottom = int(location['y'] + size['height'])
captcha = Image.open('/path/to/screenshot.png').crop((left, top, right, bottom))

# 将验证码图片保存到本地
captcha.save('/path/to/captcha.png')

# 使用 pytesseract 进行验证码识别
result = image_to_string(captcha)

# 输入验证码并提交表单
captcha_input = driver.find_element(By.XPATH, '//*[@id="captcha-input"]')
captcha_input.send_keys(result)
submit_button = driver.find_element(By.XPATH, '//*[@id="submit-button"]')
submit_button.click()
```

上面的代码使用了 Selenium 来加载目标网页,并使用 pytesseract 库来识别验证码图片中的文字。你需要安装 Chrome 浏览器驱动(chromedriver),并将路径替换为你的实际路径。此外,你还需要安装 Pillow 和 pytesseract 这两个库。

请注意,验证码识别可能受到多种因素的影响,如验证码的复杂度、图片质量等。有些验证码可能无法通过简单的 OCR 方法进行准确识别。因此,实际应用中可能需要根据具体情况进行调整和改进。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值