Python Selenium 爬虫通过network抓包response获取新增职位信息,并定时推送给邮箱

Python Selenium 爬虫通过network抓包response获取新增职位信息,并定时推送给邮箱

背景

在获取某些网站的招聘信息时,没有给出岗位的发布时间信息,导致用户无法识别每天新增的职位信息,大量的职位混杂一起,无法识别热点职位,错失最新的招募信息.

问题

通过对信息发布网站调研发现.
在网页上的信息是不全的,通过Chrome开发者工具找到最详细的数据是在network种的Fetch/XHR请求的返回体,所以用传统的find element by Xpath具有一定的局限性,这里利用selenium的performance日志能抓取network的包返回信息,从而得到需要的数据.

解决方案

  1. 通过selenium performance日志抓取所有Method = Network.responseReceived 的返回.
  2. 拉取到返回包的RequestID,并通过driver.execute_cdp_cmd以RequestId作为参数,重新请求获得RequestBody.
  3. 将所有的职位信息解析并落库.
  4. 按照时间切片,每天统计出相比于昨天新增的岗位信息.
  5. 通过Crontab定时发送邮件,每日推送.

代码

import time
import pymysql
import json
import datetime

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import smtplib
from email.mime.text import MIMEText
from email.header import Header




def spiderMain():
    db = pymysql.connect(host='localhost', user='root', password='', database='databases', port=3306)
    cursor = db.cursor()
    cursor.execute("select * from bytedance where create_date = '%s'" % datetime.date.today())
    data = cursor.fetchone()
    if data is not None:
        print('今天已经存在数据,直接发邮件... ')
        return
    print( ' 今天没有数据 开始爬数据... ')

    options = Options()
    options.set_capability('goog:loggingPrefs', {'performance': 'ALL'})

    # 打开浏览器
    driver = webdriver.Chrome(options=options)

    def process_browser_log_entry(entry):
        response = json.loads(entry['message'])['message']
        return response

    driver.get(
        '需要爬取的URL'
    )
    time.sleep(10)
    browser_log = driver.get_log('performance')

    events = [process_browser_log_entry(entry) for entry in browser_log]

    events = [event for event in events
              if 'Network.responseReceived' in event['method']
              and 'response' in event['params'].keys()
              and 'posts' in event['params']['response']['url']
              ]

    resp = driver.execute_cdp_cmd('Network.getResponseBody', {'requestId': events[0]['params']['requestId']})
    # print(resp)
    bodyjson = json.loads(resp['body'])

    for jobAttrs in bodyjson['data']['job_post_list']:
        uuid = jobAttrs['id']
        jobUrl = 'https://jobs.?.com/experienced/position/' + uuid + '/detail'
        job_name = jobAttrs['title']
        description = jobAttrs['description']
        requirement = jobAttrs['requirement']
        # print(uuid + job_name + description + requirement)
        sql = f'insert into bytedance (`uuid`,`name`,`description`,url,job_attr,jd,create_date) values (%s,%s,%s,%s,%s,%s,%s)'
        today = datetime.date.today()
        cursor.execute(sql, (uuid, job_name, description, jobUrl, "", requirement, today))

    db.commit()
    db.close()
    driver.quit()

def getTodayNewDatum():
    db = pymysql.connect(host='localhost', user='root', password='', database='databases', port=3306)
    cursor = db.cursor()
    sql_today = "select * from bytedance where create_date = '%s'" % datetime.date.today()
    sql_yes = "select * from bytedance where create_date = '%s'" % getYesterday()
    cursor.execute(sql_today)
    data_today = cursor.fetchall()
    cursor.execute(sql_yes)
    data_yes = cursor.fetchall()
    print(len(data_today))
    print(len(data_yes))
    list_yeskey = []
    for data_y in data_yes:
        list_yeskey.append(data_y[1] + data_y[2])

    ret = []
    for data_t in data_today:
        if (data_t[1] + data_t[2] not in list_yeskey):
            ret.append(data_t)
    return ret

def getYesterday():
    today = datetime.date.today()
    oneday = datetime.timedelta(days=1)
    yesterday = today - oneday
    return yesterday

def sendMail(datum):
    #设置服务器所需信息
    #163邮箱服务器地址
    mail_host = 'smtp.163.com'
    #163用户名
    mail_user = ''
    #密码(部分邮箱为授权码)
    mail_pass = ''
    #邮件发送方邮箱地址
    sender = ''
    #邮件接受方邮箱地址,注意需要[]包裹,这意味着你可以写多个邮件地址群发
    receivers = ['']


    mail_msg = """
    <p> 统计的 ❤️ 新增岗位详情</p>
    """


    for data in datum:
         subStr = ""
         jobHerf = data[8]
         jobName = data[2]
         subStr = subStr + '<p><a href=\"%s\">%s</a></p>'%(jobHerf,jobName)
         subStr+='<p><p>---job description---</p>'
         jobDes = data[3]
         for descri in jobDes.split('\n'):
             subStr+='<li>%s</li>'%(descri)
         subStr+='</p>'

         subStr += '<p><p>---job requirement---</p>'
         jobJd = data[4]
         for jd in jobJd.split('\n'):
             subStr += '<li>%s</li>' % (jd)
         subStr += '</p>'

         mail_msg = mail_msg + subStr
    message = MIMEText(mail_msg, 'html', 'utf-8')
    #邮件主题
    subject = 'ByteDance 今日新增产品职位-深圳'
    message['Subject'] = Header(subject, 'utf-8')
    #发送方信息
    message['From'] = sender
    #接受方信息
    message['To'] = receivers[0]
    #登录并发送邮件
    try:
        smtpObj = smtplib.SMTP()
        #连接到服务器
        smtpObj.connect(mail_host,25)
        #登录到服务器
        smtpObj.login(mail_user,mail_pass)
        #发送
        smtpObj.sendmail(
            sender,receivers,message.as_string())
        #退出
        smtpObj.quit()
        print('success')
    except smtplib.SMTPException as e:
        print('error',e) #打印错误


spiderMain()
datum = getTodayNewDatum()
sendMail(datum)

效果

最终发送邮件的效果

  • 10
    点赞
  • 13
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值