selenium和Python3.6实现招聘狗网站自动识别验证码登录、列表页、详情页爬取

      之所以选择selenium实现登录主要是为了处理验证码,招聘狗网站的验证码图片是拼接出来的,所以我的方法是通过webdriver截图来实现,然后通过打码兔平台获取验证码坐标实现自动登录。列表页和详情页用requests库实现。具体实现过程如下:

招聘狗的验证码如下:


       首先你得注册一个账号,可以跳过企业验证,招聘狗网站是给企业HR使用的,所以一般要求企业验证,这里我们直接跳过企业验证,下面是实现过程,有详细注释:

import json
import os
import random
import re
import sys
import traceback
import time

from PIL import Image
from lxml import html as lxml_html
import selenium
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver import ActionChains

import requests
import base64
from requests.exceptions import ConnectionError
import http.cookiejar
import logging
from dama2_API import Dama2API
#随机获取useragent的第三方库
from fake_useragent import UserAgent
ua = UserAgent()





class RTC_zhaopingou(object):
    """Scraper client for qiye.zhaopingou.com.

    Workflow: ``login()`` drives Firefox through Selenium (the login form is
    guarded by a click CAPTCHA that is solved through the Dama2 service),
    copies the browser cookies into a ``requests`` session, and ``search()`` /
    ``open_resume()`` then use that session to POST to the site's JSON
    endpoints.
    """

    # Locator strategy strings accepted by WebDriver.find_element(s). They are
    # the values behind selenium's By.XPATH / By.ID / By.TAG_NAME, spelled out
    # so that the code works on both Selenium 3 and Selenium 4 (which dropped
    # the find_element_by_* helpers) without an extra import.
    _BY_XPATH = 'xpath'
    _BY_ID = 'id'
    _BY_TAG_NAME = 'tag name'

    # How many POST attempts search()/open_resume() make before giving up.
    _MAX_REQUEST_ATTEMPTS = 10
    # Response bodies shorter than this are treated as an error message.
    _MIN_VALID_BODY_LEN = 2000

    def __init__(self, account: dict, debug=False, visible=-1, last_try=False):
        """Create the HTTP session and the CAPTCHA client.

        Args:
            account: mapping with non-empty ``user_id`` and ``password``;
                ``login()`` additionally reads ``username``.
            debug, visible, last_try: stored on the instance; not otherwise
                used by this class.
        """
        assert account['user_id']
        assert account['password']

        # login() and _login_process_captcha() read the account from self.
        self.account = account
        self.debug = debug
        self.visible = visible
        self.last_try = last_try

        # Filled in by login() from the 'hrkeepToken' cookie.
        self.token = ''
        self.driver = None
        self.keyword = None
        self.current_url = None

        logging.info('Change webdriver to FireFox')
        # Session used for the list (search) and detail (resume) requests.
        self.session = requests.Session()
        self.session.headers = {
            'Host': "qiye.zhaopingou.com",
            "Origin":"http://qiye.zhaopingou.com",
            "Referer":"http://qiye.zhaopingou.com",
            "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.62 Safari/537.36",
        }
        # Requires a Dama2 account; the client code is downloaded from Dama2.
        self.dama2 = Dama2API()

    def login(self, load=True):
        """Log in with Firefox and copy the browser cookies into the session.

        Args:
            load: accepted so that the re-login calls made by ``search()`` and
                ``open_resume()`` (``login(load=False)``) are valid; the value
                is currently not used.

        Returns:
            True when the login succeeded and cookies were stored, otherwise
            False.
        """
        l = logging
        l.info("Processing Login...")

        driver = webdriver.Firefox()
        self.driver = driver
        try:
            driver.set_window_size(1920, 1080)
            driver.implicitly_wait(10)
            return self._login_with_driver(driver)
        finally:
            # The browser is only needed for logging in; always release it,
            # including on failure, so retries do not pile up Firefox processes.
            driver.quit()

    def _login_with_driver(self, driver):
        """Run the login form flow on an already started driver."""
        l = logging

        driver.get('http://qiye.zhaopingou.com/')
        # A city chooser is shown right after the page opens.
        driver.find_element(self._BY_XPATH, '//div[@class="city-now citys"]').click()

        # Type the credentials one character at a time, like a person would.
        username_input = driver.find_element(
            self._BY_XPATH, '//input[@placeholder="请输入手机号/邮箱/狗狗号"]')
        self._type_like_human(username_input, self.account['username'])
        password_input = driver.find_element(
            self._BY_XPATH, '//input[@placeholder="请输入密码"]')
        self._type_like_human(password_input, self.account['password'])

        # The button that opens the CAPTCHA lives inside the first iframe, so
        # it cannot be located from the top-level document.
        driver.switch_to.frame(driver.find_element(self._BY_TAG_NAME, "iframe"))
        driver.find_element(self._BY_XPATH, '//span[@class="captcha-widget-text"]').click()

        # Give the CAPTCHA time to load, then go back to the main document.
        time.sleep(5)
        driver.switch_to.default_content()

        # Clicking the button adds a second, sibling iframe that holds the
        # CAPTCHA panel itself.
        driver.switch_to.frame(driver.find_elements(self._BY_TAG_NAME, "iframe")[1])
        captcha_solved = self._login_process_captcha('//div[@class="lc-panel"]')
        if not captcha_solved:
            l.info('login failed due to CAPTCHA, submit_count')
            return False

        driver.switch_to.default_content()
        driver.find_element(self._BY_ID, 'form_login').click()
        time.sleep(3)

        # The login counts as successful only if we end up on the home page.
        current_url = driver.current_url
        if current_url != 'http://qiye.zhaopingou.com/':
            l.warning(f'login failed: unexpected url after submit: {current_url}')
            return False

        l.info('login sucess!!!')
        cookie = {}
        for item in driver.get_cookies():
            cookie[item['name']] = item['value']
            if item['name'] == 'hrkeepToken':
                # This cookie value is sent as "userToken" in later requests.
                self.token = item['value']
        self.session.cookies = requests.utils.cookiejar_from_dict(cookie)
        # Only cookie names are logged; the values are session credentials.
        l.info("get cookie: {}".format(sorted(cookie)))
        return True

    @staticmethod
    def _type_like_human(element, text):
        """Send ``text`` to ``element`` one character at a time with pauses."""
        for char in text:
            element.send_keys(char)
            time.sleep(random.uniform(0.2, 0.8))

    def _login_process_captcha(self,captcha_xpath):
        """Solve the click CAPTCHA shown in the second iframe.

        Must be called with the driver switched into the CAPTCHA iframe.
        Each attempt screenshots the element at ``captcha_xpath``, sends the
        image to Dama2, clicks the returned coordinates and then checks the
        widget text in the first iframe.

        Returns:
            True once the widget reports success (the driver is then left in
            the first iframe), False after all attempts failed.
        """
        l = logging
        driver = self.driver
        dama2 = self.dama2

        # Where the CAPTCHA screenshot is written.
        shm_dir = r'/tmp/zhaopingou/'
        os.makedirs(shm_dir, exist_ok=True)
        captcha_img_path = os.path.join(shm_dir, 'captcha_img_{user_id}.png'.format(user_id=self.account['user_id']))

        maximum = 20
        for attempt in range(1, maximum + 1):
            l.info(f'Trying to decode CAPTCHA: {attempt}/{maximum}')

            # Re-locate the element each round: the panel is refreshed after a
            # failed attempt.
            captcha_element = driver.find_element(self._BY_XPATH, captcha_xpath)
            captcha_element.screenshot(captcha_img_path)

            try:
                # Dama2 returns a captcha id and a list of (x, y) click points.
                # NOTE(review): 6137 is presumably Dama2's type code for this
                # kind of click CAPTCHA - confirm against the Dama2 docs.
                captcha_id, coordinate_list = dama2.decode_captcha(captcha_type=6137, file_path=captcha_img_path)
                l.info(f'coordinate_list:{coordinate_list}')
            except Exception as err:
                err_str = str(err)
                tb = traceback.format_exc()
                msg = f'Exception occurred when decode CAPTCHA, err: {err_str}, tb:\n{tb}'
                l.warning(msg)
                continue

            # Click every returned point.
            # NOTE(review): the offsets are passed as returned by Dama2; newer
            # Selenium releases measure this offset from the element centre
            # instead of its top-left corner - confirm for the installed
            # version.
            for xy in coordinate_list:
                action = ActionChains(driver)
                action.move_to_element_with_offset(captcha_element, xy[0], xy[1]).click()
                action.perform()
                time.sleep(random.uniform(0.5,2))

            # The result is shown on the widget button in the first iframe.
            driver.switch_to.default_content()
            driver.switch_to.frame(driver.find_elements(self._BY_TAG_NAME, "iframe")[0])
            text = driver.find_element(self._BY_XPATH, '//span[@class="captcha-widget-text"]').text
            if '验证成功' in text:
                l.info('验证码验证成功!')
                time.sleep(random.uniform(1,2))
                return True

            # Failed: go back into the CAPTCHA iframe and try again.
            driver.switch_to.default_content()
            driver.switch_to.frame(driver.find_elements(self._BY_TAG_NAME, "iframe")[1])
            l.info('fail,and try it again')
            time.sleep(2)
        return False

    def search(self, keyword, page_to_go):
        '''Search resumes by keyword and return one list page as a JSON string.

        Args:
            keyword: non-empty search keyword.
            page_to_go: page to request; also stored in the result under
                "current_page".

        Returns:
            The response JSON (re-serialised with ``ensure_ascii=False``), or
            '' when every attempt failed. Exits the process if a required
            re-login fails.
        '''
        l = logging

        assert keyword
        self.keyword = keyword
        # POST parameters captured from the browser's own request.
        # NOTE(review): "pageSize" carries the page number and "pageNo" a fixed
        # 25, which looks swapped; presumably this mirrors the captured
        # request - confirm against the live API before changing it.
        params = {
            "pageSize":page_to_go,
            "pageNo":"25",
            "keyStr":keyword,
            "companyName":"",
            "schoolName":"",
            "keyStrPostion":"",
            "postionStr":"",
            "startDegrees":"-1",
            "endDegress":"-1",
            "startAge":"0",
            "endAge":"0",
            "gender":"-1",
            "region":"",
            "timeType":"-1",
            "startWorkYear":"-1",
            "endWorkYear":"-1",
            "beginTime":"",
            "endTime":"",
            "isMember":"-1",
            "hopeAdressStr":"",
            "cityId":"-1",
            "updateTime":"",
            "tradeId":"",
            "clientNo":"",
            "userToken":self.token,
            "clientType":"2"
        }

        for _ in range(self._MAX_REQUEST_ATTEMPTS):
            # The endpoint takes the current time in milliseconds as a query
            # parameter, so the URL is rebuilt for every attempt.
            search_url = "http://qiye.zhaopingou.com/zhaopingou_interface/find_warehouse_by_position_new?timestamp=" + str(int(time.time() * 1000))
            l.info('search_url:{}'.format(search_url))
            self.current_url = search_url
            l.debug(f'Open search page. url,params,keyword,userToken: {search_url},{params},{keyword},{self.token}')

            try:
                res = self.session.post(search_url, data=params)
            except ConnectionError:
                l.info("ConnectionError! Sleep 5 minutes and retry...")
                time.sleep(300)
                continue

            l.info('current url is:{}'.format(res.url))
            if res.url != search_url:
                # A different final URL means we were redirected: log in again.
                if self.login(load=False):
                    continue
                l.warning("Login failed!")
                sys.exit('login failed')

            body = res.text
            if not body:
                l.info("Service is busy. Wait 5 minutes and retry...")
                time.sleep(300)
                l.info('Continue Searching...')
                continue

            if len(body) < self._MIN_VALID_BODY_LEN:
                # A short body is an error message rather than a result list.
                if '请您登录后查看简历' in body:
                    # The site asks us to log in before viewing resumes.
                    self.login(load=False)
                    continue
                # Otherwise assume rate limiting: switch User-Agent and wait.
                self.session.headers['User-Agent'] = ua.firefox
                l.info(f'errorcode msg:{body}')
                l.info('Too frequent operation, please try again in a minute')
                time.sleep(random.randint(61,100))
                continue

            try:
                resume_list = json.loads(body)
                # Record which page this result belongs to.
                resume_list["current_page"]=page_to_go
            except (ValueError, TypeError):
                # Not JSON, or not a JSON object.
                l.warning(body)
                l.warning("something wrong!sleep 5 minutes and retry...")
                time.sleep(300)
                continue

            result = json.dumps(resume_list,ensure_ascii=False)
            l.info(f'search_resume_list_info:{result}')
            return result
        return ''

    @staticmethod
    def _resume_id_from_url(url):
        """Return the value of the first query parameter of ``url``.

        Everything after the first '=' up to the next '&' is kept, so a
        base64 id keeps its trailing '=' padding.

        Raises:
            ValueError: if ``url`` contains no '='.
        """
        _, sep, tail = url.partition('=')
        if not sep:
            raise ValueError(f'no resume id in url: {url!r}')
        return tail.split('&', 1)[0].strip()

    def open_resume(self, url):
        '''Fetch one resume detail page and return it as a JSON string.

        Args:
            url: resume URL whose first query parameter is the resume id
                (per the original note, the id may be a base64-encoded user
                id).

        Returns:
            The response JSON (re-serialised with ``ensure_ascii=False``), or
            '' when every attempt failed. Exits the process if a required
            re-login fails.
        '''
        l = logging

        l.debug(f'Open a resume: request_url: {url}')
        resumeHtmlId = self._resume_id_from_url(url)
        # POST parameters captured from the browser's own request.
        open_resume_data={
            "resumeHtmlId": resumeHtmlId,
            "keyStr":"",
            "keyPositionName":"",
            "tradeId":"",
            "postionStr":"",
            "jobId":"0",
            "companyName":"",
            "schoolName":"",
            "clientNo":"",
            "userToken":self.token,
            "clientType":"2"
        }

        for _ in range(self._MAX_REQUEST_ATTEMPTS):
            # The endpoint takes the current time in milliseconds as a query
            # parameter, so the URL is rebuilt for every attempt.
            openresumeurl = "http://qiye.zhaopingou.com/zhaopingou_interface/zpg_find_resume_html_details?timestamp=" + str(int(time.time() * 1000))
            l.info('resume_url:{}'.format(openresumeurl))

            try:
                res = self.session.post(url=openresumeurl,data=open_resume_data)
            except ConnectionError:
                l.info("ConnectionError! Sleep 5 minutes and retry...")
                time.sleep(300)
                continue

            l.info('current url is:{}'.format(res.url))
            if res.url != openresumeurl:
                l.info("cookie is invalid. Login with webdriver")
                if self.login(load=False):
                    continue
                l.warning("Login failed!")
                sys.exit('login failed')

            body = res.text
            if not body:
                l.info("Service is busy. Wait 5 minutes and retry...")
                time.sleep(300)
                continue

            if len(body) < self._MIN_VALID_BODY_LEN:
                # A short body is an error message rather than a resume.
                l.info(f'errorcode msg:{body}')
                l.info('Too frequent operation, please try again in a minute')
                time.sleep(random.randint(61, 100))
                continue

            try:
                resp_json = json.loads(body)
            except ValueError:
                l.warning(body)
                l.warning("something wrong! sleep 5 minutes and retry...")
                time.sleep(300)
                continue

            page_len = len(body)
            self.current_url = openresumeurl
            l.info(f'Downloaded a resume, len: {page_len:,d}, current_url: {url}')
            return json.dumps(resp_json,ensure_ascii=False)
        return ''


if __name__ == '__main__':
    # Demo run. The credentials below are placeholders; use your own account.
    rtc_zhaopingou = RTC_zhaopingou(account={'user_id': '-701', 'username': '13419696888', 'password': '123'},
                          debug=False,
                          visible=1, last_try=False)
    # Searching relies on the cookies and token obtained at login, so there is
    # no point in sending hundreds of requests when the login did not succeed.
    if not rtc_zhaopingou.login():
        sys.exit('login failed')

    # Walk the list pages for each keyword.
    keyword_list = ['python','大数据','人工智能','java']
    for kw in keyword_list:
        for page in range(1,200):
            search_result = rtc_zhaopingou.search(kw, page)
            print('****************************************************************')

    # Fetch a single resume detail page as an example.
    res = rtc_zhaopingou.open_resume(' http://qiye.zhaopingou.com/resume/detail?resumeId=5761920')
    print(res)
打码兔平台的代码需要自己下载,放在同级目录后可以跑一下
阅读更多
想对作者说点什么?

博主推荐

换一批

没有更多推荐了,返回首页