python爬取腾讯招聘信息

最新推荐文章于 2024-04-30 15:00:36 发布

zjLOVEcyj

最新推荐文章于 2024-04-30 15:00:36 发布

阅读量793

点赞数 2

本文链接：https://blog.csdn.net/cyj5201314/article/details/105585449

版权

爬虫框架专栏收录该内容

33 篇文章 0 订阅

订阅专栏

import json
import os
import queue
import threading
import time

import requests

class TencentSpider():
    """Scrape job postings from the Tencent careers JSON API.

    A producer (``generate_detaile_id``) walks the paginated list endpoint and
    pushes every ``PostId`` onto ``self.detail_id``; a consumer
    (``request_detail_data``) pops the ids, fetches each posting's detail
    record and appends it as one JSON object per line to ``output_path``.
    ``run`` wires the two together with the producer on a background thread.
    """

    # (key written to the output file, key read from the API detail record)
    _FIELDS = (
        ("职位名称", "RecruitPostName"),
        ("工作地点", "LocationName"),
        ("职位类别", "CategoryName"),
        ("产品名称", "ProductName"),
        ("发布时间", "LastUpdateTime"),
        ("工作职责", "Responsibility"),
        ("工作要求", "Requirement"),
    )

    # How long the consumer waits on an empty queue before re-checking
    # whether the producer is still running (seconds).
    _POLL_INTERVAL = 0.1

    def __init__(self, output_path="./data/tencent.json", page_count=10, timeout=10):
        """
        :param output_path: JSON-lines file the postings are appended to;
            its parent directory is created on demand.
        :param page_count: number of list pages to walk, starting at page 1.
        :param timeout: per-request timeout in seconds passed to ``requests``.
        """
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
        }
        self.start_url = "https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1587110093130&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn"
        self.detail_url = "https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1587112751425&postId={}&language=zh-cn"
        # Bounded queue: the producer blocks in put() once 300 ids are pending.
        self.detail_id = queue.Queue(300)
        self.output_path = output_path
        self.page_count = page_count
        self.timeout = timeout
        # Set while generate_detaile_id may still enqueue ids; the consumer
        # uses it to tell "queue momentarily empty" from "crawl finished".
        self._producing = threading.Event()

    def request_html(self, url):
        """
        Request ``url`` and return the response body as text.

        :param url: address to fetch with ``self.headers``.
        :return: the body text on HTTP 200; ``None`` on any other status or
            when the request itself fails (connection error, timeout, ...).
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
        except requests.RequestException:
            return None
        if response.status_code == 200:
            return response.text
        return None

    def _fetch_data(self, url):
        """Return the ``"Data"`` member of the JSON document at ``url``, or ``None``."""
        text = self.request_html(url)
        if text is None:
            return None
        try:
            payload = json.loads(text)
        except ValueError:
            # Body was not valid JSON (e.g. an HTML error page).
            return None
        if not isinstance(payload, dict):
            return None
        return payload.get("Data")

    def _build_item(self, d_id):
        """Fetch the detail record for ``d_id`` and map it to the output dict, or ``None``."""
        data = self._fetch_data(self.detail_url.format(d_id))
        if not isinstance(data, dict):
            return None
        try:
            return {label: data[field] for label, field in self._FIELDS}
        except KeyError:
            return None

    def generate_detaile_id(self):
        """
        Walk the list pages and enqueue every posting id found.

        Pages that cannot be fetched or decoded, or that carry no postings,
        are reported and skipped instead of aborting the crawl.

        :return: None
        """
        self._producing.set()
        try:
            for page in range(1, self.page_count + 1):
                data = self._fetch_data(self.start_url.format(page))
                posts = data.get("Posts") if isinstance(data, dict) else None
                if not posts:
                    print("list page {} returned no postings, skipped".format(page))
                    continue
                for post in posts:
                    post_id = post.get("PostId")
                    if post_id:
                        self.detail_id.put(post_id)
        finally:
            # Always release the consumer, even if this thread dies early.
            self._producing.clear()

    def request_detail_data(self):
        """
        Consume queued posting ids, fetch each detail record and append it to
        ``self.output_path`` as one JSON object per line.

        Returns once the queue is empty and no producer is active, so it also
        terminates promptly when called without a running producer.

        :return: None
        """
        directory = os.path.dirname(self.output_path)
        if directory:
            os.makedirs(directory, exist_ok=True)
        with open(self.output_path, "a", encoding="utf-8") as fp:
            while True:
                try:
                    d_id = self.detail_id.get(timeout=self._POLL_INTERVAL)
                except queue.Empty:
                    # The flag is cleared only after the producer's last put(),
                    # so "not producing and empty" means nothing more will come.
                    if self._producing.is_set() or not self.detail_id.empty():
                        continue
                    break
                try:
                    item = self._build_item(d_id)
                    if item is None:
                        print("detail for post {} unavailable, skipped".format(d_id))
                        continue
                    json.dump(item, fp, ensure_ascii=False)
                    fp.write("\n")
                    # Flush per record so partial results survive an interruption.
                    fp.flush()
                    print(str(item["职位名称"]) + "写入成功...")
                finally:
                    self.detail_id.task_done()

    def run(self):
        """Run the crawl: produce ids on a background thread, consume on this one."""
        # Mark the producer active *before* starting it so the consumer cannot
        # observe an empty queue with the flag still unset and exit early.
        self._producing.set()
        # Daemon thread: if the consumer raises, a producer blocked on the
        # full queue must not keep the process alive.
        producer = threading.Thread(target=self.generate_detaile_id, daemon=True)
        producer.start()
        self.request_detail_data()
        producer.join()



# Script entry point: build a spider and run the full crawl.
if __name__ == '__main__':
    TencentSpider().run()

页面分析如下图:
在这里插入图片描述
分析发现，列表页的职位数据是通过 Ajax 请求加载的，找到对应的 url 后直接请求即可获取到 json 数据。

详情页的详细数据同样通过 Ajax 请求获取，对应的 url 为 https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1587116958790&postId=1163294431143530496&language=zh-cn 。由此可见，职位的 PostId 是关键：先请求列表页获取 PostId，再用它构造详情页的请求 url，即可直接获取职位的详细数据。最终爬取结果如下：
在这里插入图片描述

zjLOVEcyj

关注

2
点赞
踩
9

收藏

觉得还不错? 一键收藏
0
评论
python爬取腾讯招聘信息

import requests import json import queue import threading import time class TencentSpider(): def __init__(self): self.headers = { "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win...
复制链接

扫一扫