Still using requests and XPath here; this is an introductory tutorial on simple crawlers, written as notes while learning.
The Tencent recruitment listing lives at https://hr.tencent.com/position.php
Clicking page 2 at the bottom yields the URL https://hr.tencent.com/position.php?&start=10#a
Clicking back to page 1 yields https://hr.tencent.com/position.php?&start=0#a
Comparing the two gives the URL pattern: page 1 has start=0#a, page 2 has start=10#a, so page 3 must be start=20#a, and so on, with 10 postings per page. As a demonstration, we crawl only the first 7 listing pages and fetch the detail page of every posting.
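Before writing the full crawler, a quick sketch can verify the pattern by printing the URL of each of the first 7 pages (the page size of 10 comes straight from the comparison above):

# page n starts at (n - 1) * 10
base_url = 'https://hr.tencent.com/position.php?&start={}#a'
for page in range(1, 8):
    print(page, base_url.format((page - 1) * 10))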
#!/usr/bin/env python
# -*- coding:utf-8 -*-
from lxml import etree
import requests
BASE_DOMAIN = 'https://hr.tencent.com/'
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/72.0.3610.2 Mobile Safari/537.36'
}
# Request helper: retry a bounded number of times on network errors
def loop_get(url, retries=3):
    for attempt in range(retries):
        try:
            return requests.get(url, headers=HEADERS)
        except requests.RequestException:
            if attempt == retries - 1:
                raise
# Collect the detail-page URLs from one listing page
def get_detail_urls(url):
    # response = requests.get(url, headers=HEADERS)  # a plain GET is enough when the network is stable
    response = loop_get(url)
    text = response.text
    # print(response.encoding)
    html = etree.HTML(text)
    detail_urls = html.xpath("//table[@class='tablelist']//td[@class='l square']/a/@href")
    # the hrefs are relative, so prefix the domain
    detail_urls = [BASE_DOMAIN + detail_url for detail_url in detail_urls]
    return detail_urls
# Parse one detail page into a dict
def parse_detail_page(url):
    position_detail = {}
    response = loop_get(url)
    text = response.text
    html = etree.HTML(text)
    table_node = html.xpath("//div[@id='position_detail']//table")[0]
    title = table_node.xpath(".//tr/td/text()")[0]     # job title
    place = table_node.xpath(".//tr/td/text()")[1]     # location
    position = table_node.xpath(".//tr/td/text()")[2]  # job category
    num = table_node.xpath(".//tr/td/text()")[3]       # headcount, e.g. "2人"
    position_detail['title'] = title
    position_detail['place'] = place
    position_detail['position'] = position
    position_detail['num'] = num[0:-1]  # strip the trailing "人"
    print(position_detail['position'])
    # responsibilities: the first ul.squareli in the table
    duty_node = table_node.xpath(".//tr/td/ul[@class='squareli']")[0]
    duty_list = duty_node.xpath(".//li/text()")
    duty = "".join(duty_list)
    # print(duty)
    # requirements: the second ul.squareli
    job_require_node = table_node.xpath(".//tr/td/ul[@class='squareli']")[1]
    job_require_list = job_require_node.xpath(".//li/text()")
    job_require = "".join(job_require_list)
    # print(job_require)
    position_detail['duty'] = duty
    position_detail['job_require'] = job_require
    return position_detail
def spider():
    base_url = 'https://hr.tencent.com/position.php?&start={}#a'
    position_details = []
    for x in range(7):  # pages 1-7, i.e. start = 0, 10, ..., 60
        url = base_url.format(x * 10)
        detail_urls = get_detail_urls(url)
        for detail_url in detail_urls:
            print(detail_url)
            position_detail = parse_detail_page(detail_url)
            position_details.append(position_detail)
    # print(position_details)
    # print(len(position_details))
    # for position_detail in position_details:
    #     print(position_detail['position'])
    #     print(position_detail['duty'])
    #     print(position_detail['num'])
    return position_details

if __name__ == '__main__':
    spider()
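spider() returns the collected position_details, so a natural follow-up is to save them instead of only printing while crawling. A minimal sketch that replaces the __main__ block above (tencent_positions.json is an arbitrary output name):

import json

if __name__ == '__main__':
    results = spider()
    # ensure_ascii=False keeps the Chinese field values readable in the output file
    with open('tencent_positions.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)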