scrapy 爬取腾讯招聘

最新推荐文章于 2021-09-26 22:53:59 发布

weixin_45197326

最新推荐文章于 2021-09-26 22:53:59 发布

阅读量126

点赞数

本文链接：https://blog.csdn.net/weixin_45197326/article/details/105656746

版权

# -*- coding: utf-8 -*-
import scrapy
import json

class HrSpider(scrapy.Spider):
    """Crawl job postings from the Tencent careers JSON API.

    Flow:
      * ``parse`` handles the listing page fetched from ``start_urls`` and
        schedules the remaining listing pages.
      * ``parse_one`` reads one listing page and requests the detail
        endpoint for every posting on it.
      * ``parse_two`` adds the requirement text from the detail response
        and emits the finished item (a plain ``dict``).
    """

    name = 'hr'
    allowed_domains = ['careers.tencent.com']

    # Listing endpoint; ``{}`` is filled with the ``pageIndex`` value.
    one_url='https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1587436273920&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'

    # Detail endpoint; ``{}`` is filled with a posting's ``PostId``.
    two_url='https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1587444657148&postId={}&language=zh-cn'

    # Number of listing pages to crawl (pageIndex 0 .. page_count - 1).
    page_count = 10

    start_urls = [one_url.format(0)]

    @staticmethod
    def _flatten(text):
        """Return *text* with every CR and LF replaced by a single space.

        ``None`` (a JSON ``null`` field) becomes an empty string so that a
        posting with a missing text field does not abort the whole page.
        """
        if text is None:
            return ''
        return text.replace('\n', ' ').replace('\r', ' ')

    def parse(self, response):
        """Process listing page 0 and schedule the remaining listing pages.

        ``response`` is the page requested through ``start_urls``
        (pageIndex 0). It is parsed here directly rather than being thrown
        away and downloaded a second time.

        Yields:
            Everything ``parse_one`` yields for page 0, followed by one
            ``scrapy.Request`` per remaining listing page.
        """
        for result in self.parse_one(response):
            yield result

        for page in range(1, self.page_count):
            yield scrapy.Request(
                url=self.one_url.format(page),
                callback=self.parse_one,
            )

    def parse_one(self, response):
        """Turn one listing page into detail requests.

        Args:
            response: Listing response whose body is JSON shaped like
                ``{"Data": {"Posts": [...]}}``.

        Yields:
            One ``scrapy.Request`` to the detail endpoint per posting; the
            partially filled item travels along in ``meta['item']``.
        """
        payload = json.loads(response.text)
        # NOTE(review): presumably the API sends null for "Data"/"Posts" on
        # pages past the last one - treat that as "no postings" instead of
        # raising TypeError. Confirm against a live response.
        posts = (payload.get('Data') or {}).get('Posts') or []

        for post in posts:
            item = {
                '工作性质': post['CategoryName'],
                '工作职责': self._flatten(post['Responsibility']),
            }
            yield scrapy.Request(
                url=self.two_url.format(post['PostId']),
                callback=self.parse_two,
                meta={'item': item},
            )

    def parse_two(self, response):
        """Add the requirement text to the item and emit it.

        Args:
            response: Detail response whose body is JSON shaped like
                ``{"Data": {"Requirement": "..."}}``; ``meta['item']``
                holds the dict started in ``parse_one``.

        Yields:
            The completed item dict.
        """
        item = response.meta['item']
        detail = json.loads(response.text).get('Data') or {}

        if detail.get('Requirement') is None:
            # Keep the listing data, but make the gap visible in the log.
            self.logger.warning('No requirement text in %s', response.url)

        # Strip both CR and LF, matching how parse_one cleans its text field.
        item['要求'] = self._flatten(detail.get('Requirement'))
        yield item

weixin_45197326

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
scrapy 爬取腾讯招聘

# -*- coding: utf-8 -*- import scrapy import json class HrSpider(scrapy.Spider): name = 'hr' allowed_domains = ['careers.tencent.com'] one_url='https://careers.tencent.com/tencentcareer/...
复制链接

扫一扫