Spider Study Notes (3): Scraping Job Listings from 51job

import requests
from lxml import etree

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"
}
url = 'https://jobs.51job.com/guangzhou/p2/'

def getJob(url):
    # 51job serves these pages as GBK, so decode the raw bytes explicitly
    response = requests.get(url, headers=headers).content.decode('gbk')
    mytree = etree.HTML(response)

    # Each posting is one div under this container (path copied from the browser)
    jobList = mytree.xpath('/html/body/div[3]/div[3]/div[1]/div[2]/div')
    for job in jobList:
        # Job title: /html/body/div[3]/div[3]/div[1]/div[2]/div[1]/p[1]/span[1]/a
        # (equivalent alternatives: './/p[@class="info"]/span/a/@title' or './p[1]/span[1]/a/@title')
        jobName = job.xpath('./p[1]/span[1]/a/text()')[0].strip()

        # Job detail page URL: /html/body/div[3]/div[3]/div[1]/div[2]/div[1]/p[1]/span[1]/a
        jobUrl = job.xpath('./p[1]/span[1]/a/@href')[0]

        # Company name: /html/body/div[3]/div[3]/div[1]/div[2]/div[1]/p[1]/a
        jobAddress = job.xpath('./p[1]/a/@title')[0]

        # City / work location: /html/body/div[3]/div[3]/div[1]/div[2]/div[1]/p[1]/span[2]
        jobCity = job.xpath('./p[1]/span[2]/text()')[0]

        # Job requirements (experience, education, etc.): /html/body/div[3]/div[3]/div[1]/div[2]/div[1]/p[2]
        # p[2] contains several separate text nodes; join whatever is present instead of
        # indexing [0]..[3], which raises IndexError when a posting has fewer fields
        jobOrder = ','.join(t.strip() for t in job.xpath('./p[2]/text()') if t.strip())

        # Job description: /html/body/div[3]/div[3]/div[1]/div[2]/div[1]/p[3]
        jobIntro = job.xpath('./p[3]/@title')[0]
        print(jobName, jobUrl, jobAddress, jobCity, jobOrder, jobIntro)

getJob(url)
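
The page number is the p2 segment at the end of the list URL, so crawling more pages only means changing that number. A minimal pagination sketch, assuming the /guangzhou/p{n}/ pattern holds for the other result pages (the page range and the delay are placeholders, not part of the original script):

import time

for page in range(1, 6):  # result pages 1-5; widen the range as needed
    pageUrl = 'https://jobs.51job.com/guangzhou/p{}/'.format(page)
    getJob(pageUrl)
    time.sleep(1)  # small pause between requests to avoid hammering the server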

 
