使用 XPath 爬取腾讯招聘数据
# -*- coding:utf-8 -*-
import requests
from lxml import etree
class Tencent(object):
    """Crawl the Tencent job-listing pages and print each posting's details.

    Starting from ``base_url``, the crawler collects the detail-page links
    found on a listing page, fetches and prints every posting, then follows
    the "next" link until no further page is available.
    """

    # Seconds to wait for the server before giving up on a request; without
    # a timeout ``requests.get`` may block indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Set the start URL, the request headers and the site's base domain."""
        self.base_url = "https://hr.tencent.com/position.php?keywords=python&start=0#a"
        # Headers sent with every request.
        self.headers = {
            "Referer": "https://hr.tencent.com/position.php?keywords=python&start=10",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36",
        }
        # Prefix used to turn the relative hrefs found in pages into full URLs.
        self.base_domain = "https://hr.tencent.com/"

    @staticmethod
    def _first_or_empty(values):
        """Return the first element of *values*, or ``""`` when it is empty."""
        return values[0] if values else ""

    def _build_next_url(self, hrefs):
        """Return the absolute URL of the next listing page, or ``None``.

        :param hrefs: list of ``href`` values selected from the "next" link.
        :return: ``base_domain`` + first href, or ``None`` when there is no
            href or the href is not a navigable link.
        """
        if not hrefs:
            return None
        href = hrefs[0].strip()
        # NOTE(review): presumably the last page marks its disabled "next"
        # link with a ``javascript:`` pseudo-URL -- confirm against the site.
        if not href or href.lower().startswith("javascript"):
            return None
        return self.base_domain + href

    def get_response(self, url):
        """Fetch *url* with the crawler's headers and return the body as text.

        :param url: absolute URL to request.
        :return: the response body decoded as UTF-8.
        :raises requests.exceptions.RequestException: on network failure or
            when the request exceeds ``REQUEST_TIMEOUT``.
        :raises UnicodeDecodeError: if the body is not valid UTF-8.
        """
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        return response.content.decode("utf-8")

    def get_detail_url(self, data):
        """Extract the posting links and the next-page URL from a listing page.

        :param data: HTML text of a listing page.
        :return: ``(detail_urls, next_url)``. ``detail_urls`` is a list with
            one non-empty list of relative hrefs per table cell that contains
            a link; ``next_url`` is an absolute URL, or ``None`` when there is
            no further page.
        """
        html = etree.HTML(data)
        td_list = html.xpath("//table[@class='tablelist']//td[@class='l square']")
        # Alternative suggested by the original author: select the rows with
        # html.xpath("//table[@class='tablelist']/tr")[1:-1] and read the
        # link from each row instead.
        detail_urls = []
        for td in td_list:
            hrefs = td.xpath(".//a/@href")
            # Skip cells without a link so callers can index [0] safely.
            if hrefs:
                detail_urls.append(hrefs)
        next_url = self._build_next_url(html.xpath("//a[@id='next']/@href"))
        print(next_url)
        return detail_urls, next_url

    def get_detail_info(self, detail_url):
        """Fetch one posting page, print its fields and return them.

        :param detail_url: absolute URL of the posting's detail page.
        :return: dict with the keys ``post``, ``location``,
            ``responsibility`` and ``requirements``; a field that cannot be
            found on the page is an empty string.
        """
        detail_data = self.get_response(detail_url)
        html = etree.HTML(detail_data)
        item = {}
        item['post'] = self._first_or_empty(
            html.xpath("//div[@class='box wcont_a']/table//tr[1]/td/text()")
        )
        item['location'] = self._first_or_empty(
            html.xpath("//div[@class='box wcont_a']/table//tr[2]/td[1]/text()")
        )
        item['responsibility'] = " ".join(
            html.xpath("//div[@class='box wcont_a']/table//tr[3]//ul//text()")
        )
        item['requirements'] = " ".join(
            html.xpath("//div[@class='box wcont_a']/table//tr[4]//ul//text()")
        )
        print(item)
        return item

    def main(self):
        """Walk every listing page from ``base_url`` and process each posting."""
        url = self.base_url
        # get_detail_url() yields None as the next URL once the last page has
        # been reached, which ends the loop.
        while url is not None:
            data = self.get_response(url)
            detail_urls, url = self.get_detail_url(data)
            for hrefs in detail_urls:
                if not hrefs:
                    continue
                self.get_detail_info(self.base_domain + hrefs[0])
# Script entry point: run the crawl only when executed directly, not on import.
if __name__ == "__main__":
    crawler = Tencent()
    crawler.main()