Scraping Tencent recruitment listings with Python 2.7.x + requests + XPath

As before, this uses requests and XPath; it is an introductory tutorial on simple crawlers, written up as I learn.
The Tencent recruitment listing lives at https://hr.tencent.com/position.php
Clicking page 2 at the bottom gives the URL https://hr.tencent.com/position.php?&start=10#a
Clicking back to page 1 gives https://hr.tencent.com/position.php?&start=0#a
Comparing the two reveals the pattern: page 1 has start=0#a, page 2 has start=10#a, so page 3 must be start=20#a, and so on; each page holds 10 records. As a demo we only crawl the first 7 index pages, then fetch the detail page of every listing found on them.
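Just to make the pattern concrete, here is a minimal sketch that builds the index URLs for those 7 pages from the start rule above (the page variable is my own naming):

BASE_URL = 'https://hr.tencent.com/position.php?&start={}#a'
for page in range(7):                  # pages 1 through 7
    print(BASE_URL.format(page * 10))  # start = 0, 10, ..., 60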

#!/usr/bin/env python 
# -*- coding:utf-8 -*-

from lxml import etree
import requests

import sys
# Python 2 only: make UTF-8 the default encoding so the Chinese text
# in the listings can be printed and concatenated without errors
reload(sys)
sys.setdefaultencoding('utf-8')

BASE_DOMAIN = 'https://hr.tencent.com/'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/72.0.3610.2 Mobile Safari/537.36'
}

# Retry the request until it succeeds (the site occasionally drops
# connections); bound the recursion so a dead URL cannot loop forever
def loop_get(url, retries=5):
    try:
        return requests.get(url=url, headers=HEADERS, timeout=10)
    except requests.RequestException:
        if retries <= 0:
            raise
        return loop_get(url, retries - 1)

# Collect the detail-page URL of every listing on one index page
def get_detail_urls(url):
    #response = requests.get(url, headers=HEADERS)  # normally a plain GET would do
    response = loop_get(url)
    text = response.text
    #print response.encoding
    html = etree.HTML(text)
    detail_urls = html.xpath("//table[@class='tablelist']//td[@class='l square']/a/@href")
    # the hrefs are relative, so prepend the domain
    detail_urls = map(lambda path: BASE_DOMAIN + path, detail_urls)
    return detail_urls

# Extract one job posting from its detail page
def parse_detail_page(url):
    position_detail = {}
    response = loop_get(url)
    text = response.text
    html = etree.HTML(text)
    table_node = html.xpath("//div[@id='position_detail']//table")[0]

    # the first four td text nodes hold the basic fields
    tds = table_node.xpath(".//tr/td/text()")
    position_detail['title'] = tds[0]      # job title
    position_detail['place'] = tds[1]      # location
    position_detail['position'] = tds[2]   # job category
    position_detail['num'] = tds[3][0:-1]  # head count, e.g. "2人"; strip the trailing "人"
    print(position_detail["position"])

    # job responsibilities: the first <ul class="squareli"> in the table
    duty_node = table_node.xpath(".//tr/td/ul[@class='squareli']")[0]
    duty_list = duty_node.xpath(".//li/text()")
    duty = "".join(duty_list)
    #print(duty)

    # job requirements: the second <ul class="squareli">
    job_require_node = table_node.xpath(".//tr/td/ul[@class='squareli']")[1]
    job_require_list = job_require_node.xpath(".//li/text()")
    job_require = "".join(job_require_list)
    #print(job_require)

    position_detail['duty'] = duty
    position_detail['job_require'] = job_require
    return position_detail

def spider():
    base_url = 'https://hr.tencent.com/position.php?&start={}#a'
    position_details = []
    for x in range(7):  # first 7 pages: start = 0, 10, ..., 60
        url = base_url.format(x * 10)
        detail_urls = get_detail_urls(url)
        for detail_url in detail_urls:
            print(detail_url)
            position_detail = parse_detail_page(detail_url)
            position_details.append(position_detail)

#    print(position_details)
#    print(len(position_details))
#    for position_detail in position_details:
#        print(position_detail['position'])
#        print(position_detail['duty'])
#        print(position_detail['num'])

    return position_details


if __name__ == '__main__':
    spider()
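If you want to keep what was scraped rather than just print it, here is a minimal sketch that dumps the collected dicts to a JSON file, relying on the return value added to spider() above (the tencent_positions.json filename is my own choice):

import codecs
import json

if __name__ == '__main__':
    position_details = spider()
    # write UTF-8 so the Chinese field values stay readable in the file
    with codecs.open('tencent_positions.json', 'w', encoding='utf-8') as fp:
        fp.write(json.dumps(position_details, ensure_ascii=False, indent=2))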