Scraping Tencent recruitment Python job listings with XPath

This script uses XPath to scrape job data from the Tencent recruitment site.
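Before the full crawler, here is a minimal sketch of the core idea: parse an HTML document with lxml's etree.HTML and pull values out with XPath expressions. The markup below is purely illustrative and does not reflect the real page structure.

from lxml import etree

# Illustrative markup only; the real listing page is more complex
sample_html = """
<table class="tablelist">
  <tr><td class="l square"><a href="position_detail.php?id=1">Python Engineer</a></td></tr>
  <tr><td class="l square"><a href="position_detail.php?id=2">Python Developer</a></td></tr>
</table>
"""

html = etree.HTML(sample_html)
# Each matched td is queried again with a relative XPath (note the leading ".")
for td in html.xpath("//table[@class='tablelist']//td[@class='l square']"):
    print(td.xpath(".//a/text()")[0], td.xpath(".//a/@href")[0])

The complete crawler follows.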

# -*- coding:utf-8 -*-
import requests
from lxml import etree

class Tencent(object):
    def __init__(self):
        self.base_url = "https://hr.tencent.com/position.php?keywords=python&start=0#a"
        self.headers = {
            "Referer":"https://hr.tencent.com/position.php?keywords=python&start=10",
            "User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"
        }
        self.base_domain = "https://hr.tencent.com/"

    def get_response(self, url):
        # Fetch the page and decode the body as UTF-8 text
        response = requests.get(url, headers=self.headers)
        data = response.content.decode("utf-8")
        return data

    def get_detail_url(self, data):
        # Collect the relative detail-page links on this listing page, plus the next-page URL
        detail_urls = []
        html = etree.HTML(data)
        td_list = html.xpath("//table[@class='tablelist']//td[@class='l square']")
        # tr_list = html.xpath("//table[@class='tablelist']/tr")[1:-1]  # selecting the tr rows directly like this is also recommended
        for td in td_list:
            detail_url = td.xpath(".//a/@href")
            detail_urls.append(detail_url)
        # The last listing page may have no usable "next" link; stop paginating in that case
        next_href = html.xpath("//a[@id='next']/@href")
        next_url = self.base_domain + next_href[0] if next_href else None
        print(next_url)
        return detail_urls,next_url

    def get_detail_info(self, detail_url):
        # Parse one position's detail page into a dict of fields
        detail_data = self.get_response(detail_url)
        html = etree.HTML(detail_data)
        item = {}
        # The detail page lays the fields out in a single table: row 1 holds the post name,
        # row 2 the location, and rows 3/4 the responsibility/requirement bullet lists
        item['post'] = html.xpath("//div[@class='box wcont_a']/table//tr[1]/td/text()")[0]
        item['location'] = html.xpath("//div[@class='box wcont_a']/table//tr[2]/td[1]/text()")[0]
        item['responsibility'] = " ".join(html.xpath("//div[@class='box wcont_a']/table//tr[3]//ul//text()"))
        item['requirements'] = " ".join(html.xpath("//div[@class='box wcont_a']/table//tr[4]//ul//text()"))
        print(item)

    def main(self):
        next_url = self.base_url
        # Crawl page after page until get_detail_url reports no further "next" link
        while next_url is not None:
            data = self.get_response(next_url)
            detail_urls, next_url = self.get_detail_url(data)
            for detail_url in detail_urls:
                self.get_detail_info(self.base_domain + detail_url[0])

if __name__ == '__main__':
    Tencent().main()
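
If you want to keep the results instead of only printing them, one small extension (a sketch, assuming get_detail_info is changed to return item) is to collect the dicts and dump them to a JSON file:

import json

def save_items(items, path="tencent_python_jobs.json"):
    # Assumes each element of items is the dict built in get_detail_info;
    # the file name is just an example. ensure_ascii=False keeps Chinese text readable.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(items, f, ensure_ascii=False, indent=2)

Note that hr.tencent.com has since been retired in favour of careers.tencent.com, which serves listings through a JSON API rather than static HTML, so the XPath expressions above are tied to the old page layout.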