import json
import os
import queue
import threading
import time

import requests
class TencentSpider():
    """Crawl job postings from the Tencent careers Ajax API.

    A producer (``generate_detaile_id``) walks the list pages and pushes every
    ``PostId`` into ``detail_id``; a consumer (``request_detail_data``) pops
    the ids, downloads the detail record of each posting and appends it as one
    JSON object per line to ``output_path``.
    """

    # (output field name, key inside the detail API's "Data" object), in the
    # order the fields are written to the output file.
    DETAIL_FIELDS = (
        ("职位名称", "RecruitPostName"),
        ("工作地点", "LocationName"),
        ("职位类别", "CategoryName"),
        ("产品名称", "ProductName"),
        ("发布时间", "LastUpdateTime"),
        ("工作职责", "Responsibility"),
        ("工作要求", "Requirement"),
    )

    # Seconds to wait for an HTTP response before giving up on the request.
    REQUEST_TIMEOUT = 10

    # Seconds the consumer waits on an empty queue before re-checking whether
    # the producer has finished.
    _POLL_INTERVAL = 0.2

    def __init__(self, page_count=10, output_path="./data/tencent.json"):
        """Set up request headers, URL templates and the id queue.

        :param page_count: number of list pages to crawl (10 postings each).
        :param output_path: JSON-lines file the postings are appended to.
        """
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
        }
        self.start_url = "https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1587110093130&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn"
        self.detail_url = "https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1587112751425&postId={}&language=zh-cn"
        self.page_count = page_count
        self.output_path = output_path
        self.detail_id = queue.Queue(300)
        # Set whenever no producer is running. It starts out set so that
        # request_detail_data() called on its own simply drains the queue.
        self._producer_done = threading.Event()
        self._producer_done.set()

    def request_html(self, url):
        """Request ``url`` and return the response body as a string.

        :param url: address to fetch.
        :return: the response text, or ``None`` when the request fails or the
            status code is not 200.
        """
        try:
            response = requests.get(
                url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
            )
        except requests.RequestException as exc:
            print("request failed: {} ({})".format(url, exc))
            return None
        if response.status_code == 200:
            return response.text
        return None

    def _fetch_data(self, url):
        """Return the ``"Data"`` object of the JSON document at ``url``, or ``None``."""
        text = self.request_html(url)
        if text is None:
            return None
        try:
            payload = json.loads(text)
        except ValueError:
            return None
        data = payload.get("Data") if isinstance(payload, dict) else None
        return data if isinstance(data, dict) else None

    def generate_detaile_id(self):
        """Collect the ``PostId`` of every posting on the list pages and enqueue it.

        List pages that cannot be fetched or parsed are skipped. The
        "producer done" flag is raised when the method ends, even on error,
        so a waiting consumer always terminates.

        :return: None
        """
        self._producer_done.clear()
        try:
            for page in range(1, self.page_count + 1):
                data = self._fetch_data(self.start_url.format(page))
                if data is None:
                    print("skip list page {}: no usable response".format(page))
                    continue
                # "Posts" may be null or absent; treat that as an empty page.
                for post in data.get("Posts") or []:
                    post_id = post.get("PostId") if isinstance(post, dict) else None
                    if post_id is not None:
                        self.detail_id.put(post_id)
        finally:
            self._producer_done.set()

    def request_detail_data(self):
        """Fetch the detail record of every queued id and save it locally.

        Runs until the queue is empty AND the producer has finished, so a
        producer that is slower than this consumer does not end the crawl
        early.

        :return: None
        """
        while True:
            # Snapshot the flag BEFORE dequeuing: the producer raises it only
            # after its last put(), so "flag was set, then queue was empty"
            # proves that nothing more will arrive.
            producer_done = self._producer_done.is_set()
            try:
                post_id = self.detail_id.get(
                    block=not producer_done, timeout=self._POLL_INTERVAL
                )
            except queue.Empty:
                if producer_done:
                    break
                continue
            try:
                self._process_detail(post_id)
            finally:
                # Keep the queue's unfinished-task counter balanced even when
                # a posting fails.
                self.detail_id.task_done()

    def _process_detail(self, post_id):
        """Download one posting, extract the output fields and persist them."""
        data = self._fetch_data(self.detail_url.format(post_id))
        if data is None:
            print("skip post {}: no usable response".format(post_id))
            return
        try:
            item = {label: data[key] for label, key in self.DETAIL_FIELDS}
        except KeyError as exc:
            print("skip post {}: missing field {}".format(post_id, exc))
            return
        self._save_item(item)
        print("{}写入成功...".format(item["职位名称"]))

    def _save_item(self, item):
        """Append ``item`` as one JSON line to ``output_path``, creating its directory."""
        directory = os.path.dirname(self.output_path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(self.output_path, "a", encoding="utf-8") as fp:
            json.dump(item, fp, ensure_ascii=False)
            fp.write("\n")

    def run(self):
        """Crawl the list pages in a background thread while saving details here."""
        # Mark the producer as running before its thread starts; otherwise the
        # consumer could see "done" on an empty queue and quit immediately.
        self._producer_done.clear()
        # Daemon thread: a producer blocked on a full queue must not keep the
        # process alive if the consumer below raises.
        producer = threading.Thread(target=self.generate_detaile_id, daemon=True)
        producer.start()
        self.request_detail_data()
        producer.join()
if __name__ == '__main__':
    # Script entry point: build a spider with its defaults and run the crawl.
    spider = TencentSpider()
    spider.run()
页面分析如下图:
分析发现,列表页的职位数据是通过 Ajax 请求加载的,找到对应的 url 后直接请求,即可获取相应的 json 数据。
详情页的详细数据同样通过 Ajax 请求获取,对应的 url 为 https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1587116958790&postId=1163294431143530496&language=zh-cn 。由此可见,职位的 PostId 是关键:先请求列表页获取 PostId,再用它构造详情页数据请求的 url,即可直接请求到职位的详细数据。最终爬取结果如下: