Python3 selenium 获取某东平台商家信息：重点为验证码识别，此处采用网页截图获取验证码 + 第三方验证码识别

最新推荐文章于 2024-07-24 23:49:50 发布

菜鸟挣扎史

最新推荐文章于 2024-07-24 23:49:50 发布

阅读量178

点赞数

分类专栏：爬虫文章标签： python selenium

本文链接：https://blog.csdn.net/qq_39503451/article/details/103735339

版权

爬虫专栏收录该内容

5 篇文章 0 订阅

订阅专栏

此处验证码识别采用页面截图加第三方识别的方式，第三方平台为冰拖，网址：http://www.bingtop.com
价格不是很贵，识别速度也很快，大家可以试试，或者自行购买其他平台的服务。
我个人也自行编写了文字识别模块，但识别率较低，正在改进。

# -*- coding: utf-8 -*-
"""
Created on Tue Dec 24 16:14:03 2019

@author: Administrator
"""
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 24 16:14:03 2019

@author: Administrator
"""
import base64
import codecs
import csv
import json
import os
import random
import time

import requests
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# ---- User configuration: adjust the values below to your own setup ----
# Region and product keyword to search for.
address = "北京"
goods = "毯子"
# Account for the third-party captcha recognition service. The
# BINGTOP_USERNAME / BINGTOP_PASSWORD environment variables, when set,
# override the placeholders so real credentials need not live in the source.
api_username = os.environ.get("BINGTOP_USERNAME", "130*******")
api_password = os.environ.get("BINGTOP_PASSWORD", "w*********")
# File the full-page screenshot is written to.
yemian_img = "C:/Users/Administrator/pachong/picture/screenshot.png"
# File the cropped captcha image is written to.
yanzheng_img = "C:/Users/Administrator/pachong/picture/003.png"
'''
上方模块请按实际情况填写
'''

# Browser setup: headless Firefox with a manually configured HTTP proxy.
from selenium.webdriver.firefox.options import Options
profile = webdriver.FirefoxProfile()
profile.set_preference('network.proxy.type', 1)
profile.set_preference('network.proxy.http', '222.73.130.111')
profile.set_preference('network.proxy.http_port', 888)  # must be an int
# NOTE(review): only the HTTP proxy is configured while the pages fetched
# below are https:// URLs -- confirm whether the proxy is meant to apply to them.
profile.update_preferences()
options = Options()
options.add_argument('--headless')
# Attach the profile through the options object instead of the
# `firefox_profile=` / `executable_path=` keyword arguments, which newer
# Selenium releases no longer accept; geckodriver is resolved from PATH.
options.profile = profile
driver = webdriver.Firefox(options=options)

# Output CSV, opened in append mode as UTF-8 so Chinese text is written
# correctly; newline='' is the form the csv module expects for its files.
f = open('123suning.csv', 'a', encoding='utf-8', newline='')
csv_writer = csv.writer(f)

#验证码识别 此处验证码识别采取了页面截图外加 第三方识别 冰拖 网址http://www.bingtop.com
def erweima(nub, yanzheng_img=yanzheng_img, yemian_img=yemian_img,
            api_username=api_username, api_password=api_password,
            crop_box=(671, 370, 766, 396), timeout=30):
    """Read the captcha on the current page via the bingtop OCR service.

    Takes a screenshot of the page currently loaded in the module-level
    ``driver``, crops the captcha region out of it, uploads the crop to the
    recognition API and returns the recognised text.

    Args:
        nub: Counter value, used only to label the log line.
        yanzheng_img: Path the cropped captcha image is written to.
        yemian_img: Path the full-page screenshot is written to.
        api_username: Account name for the recognition service.
        api_password: Password for the recognition service.
        crop_box: ``(left, upper, right, lower)`` pixel box of the captcha
            inside the screenshot. The default is presumably tuned for the
            window size this script sets -- verify if the layout changes.
        timeout: Seconds to wait for the recognition service before giving up.

    Returns:
        The recognised captcha text, taken from ``data.recognition`` of the
        JSON response.

    Raises:
        requests.RequestException: The upload failed or timed out.
        KeyError: The response carries no ``data``/``recognition`` entry.
    """
    # NOTE: the credentials are posted to a plain http:// endpoint.
    api_post_url = "http://www.bingtop.com/ocr/upload/"
    try:
        driver.save_screenshot(yemian_img)
        # Cut the captcha out of the full-page screenshot.
        with Image.open(yemian_img) as page_shot:
            with page_shot.crop(crop_box) as captcha:
                captcha.save(yanzheng_img)
        print("裁剪成功")
        # Third-party recognition: the image is sent base64-encoded.
        with open(yanzheng_img, 'rb') as pic_file:
            img64 = base64.b64encode(pic_file.read())
        params = {
            "username": str(api_username),
            "password": str(api_password),
            "captchaData": img64,
            "captchaType": 1000
        }
        response = requests.post(api_post_url, data=params, timeout=timeout)
        try:
            # Expected shape: {"code":0, "message":"",
            #   "data":{"captchaId":"...","recognition":"RESULT"}}
            dictdata = json.loads(response.text)
        finally:
            response.close()
    finally:
        # Remove the temporary images even when a step above failed.
        for tmp_path in (yemian_img, yanzheng_img):
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
    yzMa = dictdata['data']['recognition']
    print(nub, "验证码:", yzMa)
    return yzMa
    

# Page interaction: search JD for the configured product and region.
# NOTE(review): the captcha crop coordinates used later presumably depend on
# this window size -- keep the two in sync.
driver.set_window_size(1700, 900)
# Open the JD home page and submit the product keyword.
driver.get("https://www.jd.com/")
input_txt = driver.find_element(By.ID, "key")
input_txt.send_keys(goods)
driver.find_element(By.CLASS_NAME, "button").click()
time.sleep(random.randint(2,4))
# Fill the third 'input-txt' field with the region and confirm.
input_txt2 = driver.find_element(By.XPATH, "(//input[@class='input-txt'])[3]")
input_txt2.send_keys(address)
driver.find_element(By.XPATH, "(//a[@class='btn btn-default'])[1]").click()
time.sleep(random.randint(2,4))
# Running counter used to label log output while shops are processed.
nub = 1

# Walk the result pages and collect each shop's licence-page URL.
MAX_PAGES = 100
shop_set = set()
for i in range(MAX_PAGES):
    # Scroll to the bottom so every product on the page gets loaded.
    driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.END)
    time.sleep(random.randint(3,4))
    shops = driver.find_elements(By.CLASS_NAME, "curr-shop")
    for shop in shops:
        shop_url = shop.get_attribute('href')
        if not shop_url:
            # Link without an href: nothing to derive a licence URL from.
            continue
        # Shop links are expected to look like ".../index-<id>.html"; take
        # the part after "index-" rather than relying on a fixed offset.
        _, marker, shop_page = shop_url.partition("index-")
        if not marker:
            continue
        licence_url = "https://mall.jd.com/showLicence-" + shop_page
        shop_set.add(licence_url)
        print(i,"-",shop_url,licence_url)
    # Last allowed page reached: stop without clicking "next" again.
    if i == MAX_PAGES - 1:
        break
    print('翻页')
    try:
        driver.find_element(By.CLASS_NAME, "pn-next").click()
    except Exception:
        # No usable "next page" control: treat it as the end of the results.
        print('找不到下一页')
        break

# Visit every licence page, solve its captcha and record the shop details.
try:
    for shop2 in shop_set:
        driver.get(shop2)
        time.sleep(random.randint(2,3))
        nub += 1
        # Give the captcha at most three attempts per shop, then move on.
        for _attempt in range(3):
            yzMa = erweima(nub)
            input_ma = driver.find_element(By.ID, "verifyCode")
            # Drop any text left over from a previous failed attempt.
            input_ma.clear()
            input_ma.send_keys(yzMa)
            driver.find_element(By.CLASS_NAME, "btn").click()
            time.sleep(random.randint(2,3))
            try:
                # Looking up "jScore" raises when the element is missing;
                # presumably it only exists once the captcha was accepted.
                driver.find_element(By.CLASS_NAME, "jScore").text
                company = driver.find_elements(By.CLASS_NAME, "noBorder")
                shop_name = company[10].text
                c_name = company[2].text
                name = company[4].text
                # Kept separate from the module-level `address` setting.
                shop_address = company[5].text
                print(nub,"--",c_name,"--",name,"--",shop_address)
                csv_writer.writerow([shop_name,c_name,name,shop_address])
            except Exception:
                print("二维码解锁失败")
            else:
                break
finally:
    # Always release the CSV handle and the browser, even after an error.
    f.close()
    driver.quit()

print("结束")

以上代码仅供交流学习使用!

菜鸟挣扎史

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Python3 selenium获取某东平台商家信息重点为验证码识别此处采用了网页截图获取验证码-第三方验证码识别

此处验证码识别采取了页面截图外加第三方识别冰拖网址http://www.bingtop.com价格不是很贵,识别速度也很快大家可以试试或者自己购买其他平台我个人也自行编写了文字识别模块识别率较低正在改进# -*- coding: utf-8 -*-"""Created on Tue Dec 24 16:14:03 2019@author: Administrator"""...
复制链接

扫一扫