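# CAPTCHA recognition here combines a page screenshot with a third-party
# recognition service, Bingtop: http://www.bingtop.com
# It is inexpensive and fast. You can try it, or buy access to another platform instead.
# I have also written my own text-recognition module; its accuracy is still low and it is being improved.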
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 24 16:14:03 2019
@author: Administrator
"""
# -*- coding: utf-8 -*-
"""
Created on Tue Dec 24 16:14:03 2019
@author: Administrator
"""
from selenium.webdriver.common.keys import Keys
import time
import csv
from selenium import webdriver
import random
import base64
import requests
import json
from PIL import Image
import codecs
import os
# --- User configuration: fill in the values below for your own setup ---
# Region and product to crawl.
address: str = "北京"
goods: str = "毯子"
# Account name and password for the third-party CAPTCHA recognition service.
api_username: str = "130*******"
api_password: str = "w*********"
# Where the full-page screenshot is saved.
yemian_img: str = "C:/Users/Administrator/pachong/picture/screenshot.png"
# Where the cropped CAPTCHA image is saved.
yanzheng_img: str = "C:/Users/Administrator/pachong/picture/003.png"
# --- End of user configuration ---
# Browser setup: a Firefox profile carrying proxy preferences, started headless.
from selenium.webdriver.firefox.options import Options

profile = webdriver.FirefoxProfile()
for pref_name, pref_value in (
    ('network.proxy.type', 1),
    ('network.proxy.http', '222.73.130.111'),
    ('network.proxy.http_port', 888),  # int
):
    profile.set_preference(pref_name, pref_value)
profile.update_preferences()

options = Options()
options.add_argument('--headless')

driver = webdriver.Firefox(
    executable_path="geckodriver",
    options=options,
    firefox_profile=profile,
)

# Open the output CSV through codecs with an explicit UTF-8 encoding so that
# Chinese text is not garbled when written.
f = codecs.open('123suning.csv', 'a', encoding='utf-8')
csv_writer = csv.writer(f)
# CAPTCHA recognition: page screenshot + third-party service (http://www.bingtop.com).
def erweima(nub, yanzheng_img=yanzheng_img, yemian_img=yemian_img,
            api_username=api_username, api_password=api_password,
            crop_box=(671, 370, 766, 396), timeout=30):
    """Recognise the CAPTCHA shown on the current page and return its text.

    Saves a screenshot of the page loaded in the module-level ``driver``,
    crops the CAPTCHA region out of it, posts the crop (base64-encoded) to the
    recognition service and returns the ``data.recognition`` field of the JSON
    reply.

    Args:
        nub: Counter value; only used to label the log output.
        yanzheng_img: Path where the cropped CAPTCHA image is written.
        yemian_img: Path where the full-page screenshot is written.
        api_username: Account name for the recognition service.
        api_password: Password for the recognition service.
        crop_box: ``(left, upper, right, lower)`` pixel box of the CAPTCHA
            inside the screenshot.
        timeout: Seconds to wait for the recognition service before
            ``requests`` raises a timeout error.

    Returns:
        The recognised text reported by the service.

    Raises:
        KeyError / TypeError: If the JSON reply has no ``data.recognition``
            entry.
        requests.RequestException: On network failure or timeout.
    """
    # NOTE(review): the credentials are posted to a plain ``http://`` URL.
    api_post_url = "http://www.bingtop.com/ocr/upload/"
    try:
        driver.save_screenshot(yemian_img)
        # Context managers make sure both images are closed before the files
        # are deleted below, even if cropping or saving fails.
        with Image.open(yemian_img) as page_image:
            with page_image.crop(crop_box) as captcha_image:
                captcha_image.save(yanzheng_img)
        print("裁剪成功")

        # Third-party recognition request.
        with open(yanzheng_img, 'rb') as pic_file:
            img64 = base64.b64encode(pic_file.read())
        params = {
            "username": str(api_username),
            "password": str(api_password),
            "captchaData": img64,
            "captchaType": 1000,
        }
        response = requests.post(api_post_url, data=params, timeout=timeout)
        try:
            # Example reply: {"code":0, "message":"", "data":{"captchaId":"1000-158201918112812","recognition":"RESULT"}}
            dictdata = json.loads(response.text)
        finally:
            response.close()
        yzMa = dictdata['data']['recognition']
    finally:
        # Always remove the temporary images, including on failure.
        for temp_path in (yemian_img, yanzheng_img):
            if os.path.exists(temp_path):
                os.remove(temp_path)
    print(nub, "验证码:", yzMa)
    print("验证码:", yzMa)
    return yzMa
# Page interaction. The window gets a fixed size; the CAPTCHA crop elsewhere
# in this file uses fixed pixel coordinates.
driver.set_window_size(1700, 900)

# Open the JD home page and search for the configured product.
driver.get("https://www.jd.com/")
input_txt = driver.find_element_by_id("key")
input_txt.send_keys(goods)
search_button = driver.find_element_by_class_name("button")
search_button.click()
time.sleep(random.randint(2, 4))

# Region: type the configured region into the third 'input-txt' field and
# click the first 'btn btn-default' link.
input_txt2 = driver.find_element_by_xpath("(//input[@class='input-txt'])[3]")
input_txt2.send_keys(address)
region_button = driver.find_element_by_xpath("(//a[@class='btn btn-default'])[1]")
region_button.click()
time.sleep(random.randint(2, 4))
# Counter used only to label log output.
nub = 1
# Licence-page URLs derived from the shop links on the result pages; a set
# keeps each shop once.
shop_set = set()
# Upper bound on the number of result pages to walk through.
max_pages = 100
for i in range(max_pages):
    # Scroll to the bottom so that all products on the page get loaded.
    driver.find_element_by_tag_name('body').send_keys(Keys.END)
    time.sleep(random.randint(3, 4))
    shops = driver.find_elements_by_class_name("curr-shop")
    for shop in shops:
        shop_url = shop.get_attribute('href')
        if not shop_url:
            # No href to derive a licence URL from; slicing None would raise.
            continue
        # Drops the first 26 characters of the href; presumably the prefix
        # "https://mall.jd.com/index-" (26 characters) - TODO confirm.
        licence_url = "https://mall.jd.com/showLicence-" + shop_url[26:]
        shop_set.add(licence_url)
        print(i, "-", shop_url, licence_url)
    if i == max_pages - 1:
        # Last allowed page: do not click through to a page that is never read.
        break
    try:
        print('翻页')
        driver.find_element_by_class_name("pn-next").click()
    except Exception:
        # No usable "next page" control: stop paging. ``Exception`` (not a
        # bare except) lets Ctrl-C / SystemExit still abort the crawl.
        print('找不到下一页')
        break
# Visit each shop's licence page, solve its CAPTCHA and record the shop details.
# Give up on a shop after this many CAPTCHA attempts.
max_attempts = 3
try:
    for shop2 in shop_set:
        driver.get(shop2)
        time.sleep(random.randint(2, 3))
        nub += 1
        for _attempt in range(max_attempts):
            # Recognise the CAPTCHA, type it in and submit.
            yzMa = erweima(nub)
            input_ma = driver.find_element_by_id("verifyCode")
            input_ma.send_keys(yzMa)
            driver.find_element_by_class_name("btn").click()
            time.sleep(random.randint(2, 3))
            try:
                # Looked up only as a success check: the lookup raises when
                # the element is absent (the original treated that as a
                # failed CAPTCHA).
                driver.find_element_by_class_name("jScore")
                company = driver.find_elements_by_class_name("noBorder")
                # Cell positions are taken as-is from the page layout;
                # presumably shop name, company name, contact name and
                # address - TODO confirm against the live page.
                shop_name = company[10].text
                c_name = company[2].text
                name = company[4].text
                # Separate name so the configured ``address`` is not overwritten.
                shop_address = company[5].text
            except Exception:
                # Missing elements / too few cells: count as a failed attempt.
                print("二维码解锁失败")
                continue
            print(nub, "--", c_name, "--", name, "--", shop_address)
            csv_writer.writerow([shop_name, c_name, name, shop_address])
            break
finally:
    # Release the CSV file and the browser even when the crawl aborts.
    f.close()
    driver.quit()
print("结束")
# The code above is provided for learning and discussion purposes only.