Multi-threaded crawling of multi-level pages in Python

The approach

✅ Create multiple queues, one per page level, each holding that level's URLs

✅ Pull URLs from each queue and dispatch them to the matching parse function to extract the data

✅ For level-2 queues and deeper, pass the timeout parameter when getting a URL, since those queues may still be filling while consumers read them (see the sketch after this list)
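Here is a minimal sketch of that pattern before we touch the real site (the names list_q/detail_q and the worker functions are illustrative, not part of the final spider):

from queue import Queue, Empty

list_q = Queue()    # level 1: list-page URLs
detail_q = Queue()  # level 2: detail-page URLs

def list_worker():
    # level-1 consumer: this queue is fully loaded up front,
    # so an empty queue really does mean we are done
    while not list_q.empty():
        url = list_q.get()
        # ...request url, extract detail links, then detail_q.put(...)

def detail_worker():
    # level-2 consumer: level-1 workers may still be producing,
    # so wait up to a second before concluding the queue is drained
    while True:
        try:
            url = detail_q.get(timeout=1)
        except Empty:
            break
        # ...request url and parse the record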

Test site: the job search page on Tencent Careers (careers.tencent.com)

The listings are loaded dynamically, so capture the traffic in the browser's network panel and locate the request that returns the current page's data.

Then open a job detail page and analyze its request the same way.

The content sits inside the Data field of the JSON response.

The fields we want to scrape are the job name, category, location, responsibilities, requirements, and last update time.
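To double-check what the capture shows, you can hit the Query endpoint directly and inspect the JSON (a quick probe; I'm assuming the endpoint tolerates the omitted timestamp and filter parameters):

import requests
from fake_useragent import UserAgent

url = ('https://careers.tencent.com/tencentcareer/api/post/Query'
       '?keyword=python&pageIndex=1&pageSize=10&language=zh-cn&area=cn')
data = requests.get(url, headers={'User-Agent': UserAgent().random}).json()['Data']
print(data['Count'])               # total number of matching posts
print(data['Posts'][0]['PostId'])  # the PostId that feeds the detail API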

Importing the libraries

import time
import requests
from threading import Thread, Lock
from queue import Queue
from fake_useragent import UserAgent
from urllib import parse

Initialization and the request helper

class TencentSpider:
    def __init__(self):
        self.one_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1644370587575&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword={}&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
        self.two_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1644370614815&postId={}&language=zh-cn'
        # two queues, one per page level
        self.one_q = Queue()
        self.two_q = Queue()
        # two locks, one per queue
        self.lock1 = Lock()
        self.lock2 = Lock()
        # counter for the number of detail pages scraped
        self.number = 0
    def get_html(self, url):
        """Helper: fetch a URL and return the parsed JSON"""
        headers = {'User-Agent': UserAgent().random}
        html = requests.get(url=url, headers=headers).json()

        return html
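Since every worker thread funnels through this helper, it may be worth passing requests' timeout argument so a single hung connection can't stall a worker forever; a small variant (the 5-second value is my assumption, not from the original):

    def get_html(self, url):
        """Helper: fetch a URL and return the parsed JSON"""
        headers = {'User-Agent': UserAgent().random}
        # timeout=5 raises requests.exceptions.Timeout instead of hanging
        return requests.get(url=url, headers=headers, timeout=5).json()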

Enqueuing the list-page URLs and getting the total page count

    def url_in(self):
        """Put the level-1 (list page) URLs into the queue"""
        keyword = input('Enter a job category: ')
        keyword = parse.quote(keyword)
        total = self.get_total(keyword)
        for page in range(1, total + 1):
            url = self.one_url.format(keyword, page)
            self.one_q.put(url)

    def get_total(self, keyword):
        """Get the total number of list pages for a category"""
        url = self.one_url.format(keyword, 1)
        html = self.get_html(url=url)
        count = html['Data']['Count']
        total = count // 10 if count % 10 == 0 else count // 10 + 1

        return total
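A quick sanity check on the ceiling division: with pageSize=10, Count=146 gives 146 // 10 + 1 = 15 pages, while Count=140 gives exactly 140 // 10 = 14. math.ceil expresses the same thing in one call:

import math

for count in (140, 146):
    total = count // 10 if count % 10 == 0 else count // 10 + 1
    assert total == math.ceil(count / 10)
    print(count, '->', total)   # 140 -> 14, 146 -> 15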

Parsing the list pages

    def parse_one_page(self):
        """一级页面解析函数:提取postid,并拼接二级页面url地址,入队列"""
        while True:
            self.lock1.acquire()
            if not self.one_q.empty():
                one_url = self.one_q.get()
                self.lock1.release()
                one_html = self.get_html(url=one_url)
                # each list page carries up to 10 PostIds
                for one_job in one_html['Data']['Posts']:
                    post_id = one_job['PostId']
                    job_url = self.two_url.format(post_id)
                    # hand the detail-page URL to the level-2 queue
                    self.two_q.put(job_url)
            else:
                self.lock1.release()
                break
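The acquire/release pairing on both branches is easy to get wrong (forgetting one release deadlocks every thread). The same check-and-get can be written with a with block, which releases the lock on every exit path; an equivalent sketch, not the original code:

    def parse_one_page(self):
        while True:
            with self.lock1:            # released automatically, even on break
                if self.one_q.empty():
                    break
                one_url = self.one_q.get()
            one_html = self.get_html(url=one_url)
            for one_job in one_html['Data']['Posts']:
                self.two_q.put(self.two_url.format(one_job['PostId']))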

Parsing the detail pages

    def parse_two_page(self):
        """Level-2 parser: extract the detailed job information"""
        while True:
            self.lock2.acquire()
            try:
                # the timeout matters for level-2 queues: the queue may be
                # momentarily empty while level-1 threads are still producing
                two_url = self.two_q.get(timeout=1)
                self.lock2.release()
            except Exception:
                self.lock2.release()
                break
            two_html = self.get_html(url=two_url)
            item = {}
            item['name'] = two_html['Data']['RecruitPostName']
            item['type'] = two_html['Data']['CategoryName']
            item['address'] = two_html['Data']['LocationName']
            item['duty'] = two_html['Data']['Responsibility']
            item['require'] = two_html['Data']['Requirement']
            item['time'] = two_html['Data']['LastUpdataTime']
            print(item)
            self.lock2.acquire()
            self.number += 1
            self.lock2.release()
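The timeout here is exactly what the third bullet at the top promised: a level-2 consumer can find its queue momentarily empty while level-1 threads are still producing, so it waits up to a second before giving up. Catching queue.Empty rather than a bare Exception makes that intent explicit; a standalone illustration:

from queue import Queue, Empty

def drain(q: Queue):
    """Consume items, tolerating slow producers for up to 1 second."""
    while True:
        try:
            item = q.get(timeout=1)   # raises Empty after 1s of waiting
        except Empty:
            break                     # producers are done and the queue is drained
        print(item)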

Creating and running the threads

    def run(self):
        # enqueue the list-page URLs first
        self.url_in()
        # then spin up the worker threads
        t1_list = []
        t2_list = []
        for i in range(2):  # two level-1 workers
            t1 = Thread(target=self.parse_one_page)
            t1_list.append(t1)
            t1.start()

        for i in range(2):  # two level-2 workers
            t2 = Thread(target=self.parse_two_page)
            t2_list.append(t2)
            t2.start()

        for t1 in t1_list:
            t1.join()

        for t2 in t2_list:
            t2.join()

        print('number:', self.number)  # how many detail pages were scraped
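As an aside, the same start/join choreography can be expressed with concurrent.futures, which does the bookkeeping for you; an equivalent sketch of run(), not the author's code:

from concurrent.futures import ThreadPoolExecutor

def run(self):  # drop-in alternative to the method above
    self.url_in()
    with ThreadPoolExecutor(max_workers=4) as pool:
        for _ in range(2):
            pool.submit(self.parse_one_page)
        for _ in range(2):
            pool.submit(self.parse_two_page)
    # the with-block exits only after all four workers return
    print('number:', self.number)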

Entry point


if __name__ == '__main__':
    start_time = time.time()
    spider = TencentSpider()
    spider.run()
    end_time = time.time()
    print('time:%.2f' % (end_time - start_time))   # elapsed time
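One small aside: time.perf_counter() is the clock recommended for measuring elapsed intervals like this, since time.time() can jump if the system clock is adjusted mid-run:

import time

start = time.perf_counter()
TencentSpider().run()
print('time:%.2f' % (time.perf_counter() - start))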

The thread worker template

For reference, this is the generic single-queue worker the spider is built from; note it assumes a single self.q and self.lock rather than the per-level pairs used above.

    def parse_html(self):
        """Thread worker: take a URL, request it, parse and handle the data"""
        while True:
            # acquire the lock before touching the queue
            self.lock.acquire()
            if not self.q.empty():
                url = self.q.get()
                # release it as soon as the URL is out
                self.lock.release()
                headers = {'User-Agent': UserAgent().random}
                html = requests.get(url=url, headers=headers)
                # ...parse html and handle the data here
            else:
                self.lock.release()
                break
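To see this template run end to end, here is a self-contained toy version with placeholder URLs (example.com is a stand-in; nothing is actually fetched):

from queue import Queue
from threading import Thread, Lock

q = Queue()
lock = Lock()
for n in range(5):
    q.put('https://example.com/page/%d' % n)   # placeholder URLs

def worker():
    while True:
        with lock:
            if q.empty():
                break
            url = q.get()
        print('would fetch', url)

threads = [Thread(target=worker) for _ in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()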

Full code

import time
import requests
from threading import Thread, Lock
from queue import Queue
from fake_useragent import UserAgent
from urllib import parse
class TencentSpider:
    def __init__(self):
        self.one_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1644370587575&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword={}&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
        self.two_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1644370614815&postId={}&language=zh-cn'
        # two queues, one per page level
        self.one_q = Queue()
        self.two_q = Queue()
        # two locks, one per queue
        self.lock1 = Lock()
        self.lock2 = Lock()
        # counter for the number of detail pages scraped
        self.number = 0
    def get_html(self, url):
        """功能函数1:获取响应内容"""
        headers = {'User-Agent':UserAgent().random}
        html = requests.get(url=url, headers=headers).json()

        return html

    def url_in(self):
        """Put the level-1 (list page) URLs into the queue"""
        keyword = input('Enter a job category: ')
        keyword = parse.quote(keyword)
        total = self.get_total(keyword)
        for page in range(1, total + 1):
            url = self.one_url.format(keyword, page)
            self.one_q.put(url)

    def get_total(self, keyword):
        """Get the total number of list pages for a category"""
        url = self.one_url.format(keyword, 1)
        html = self.get_html(url=url)
        count = html['Data']['Count']
        total = count // 10 if count % 10 == 0 else count // 10 + 1

        return total

    def parse_one_page(self):
        """一级页面解析函数:提取postid,并拼接二级页面url地址,入队列"""
        while True:
            self.lock1.acquire()
            if not self.one_q.empty():
                one_url = self.one_q.get()
                self.lock1.release()
                one_html = self.get_html(url=one_url)
                # each list page carries up to 10 PostIds
                for one_job in one_html['Data']['Posts']:
                    post_id = one_job['PostId']
                    job_url = self.two_url.format(post_id)
                    # hand the detail-page URL to the level-2 queue
                    self.two_q.put(job_url)
            else:
                self.lock1.release()
                break



    def parse_two_page(self):
        """Level-2 parser: extract the detailed job information"""
        while True:
            self.lock2.acquire()
            try:
                # the timeout matters for level-2 queues: the queue may be
                # momentarily empty while level-1 threads are still producing
                two_url = self.two_q.get(timeout=1)
                self.lock2.release()
            except Exception:
                self.lock2.release()
                break
            two_html = self.get_html(url=two_url)
            item = {}
            item['name'] = two_html['Data']['RecruitPostName']
            item['type'] = two_html['Data']['CategoryName']
            item['address'] = two_html['Data']['LocationName']
            item['duty'] = two_html['Data']['Responsibility']
            item['require'] = two_html['Data']['Requirement']
            item['time'] = two_html['Data']['LastUpdataTime']
            print(item)
            self.lock2.acquire()
            self.number += 1
            self.lock2.release()

    def run(self):
        # enqueue the list-page URLs first
        self.url_in()
        # then spin up the worker threads
        t1_list = []
        t2_list = []
        for i in range(2):  # two level-1 workers
            t1 = Thread(target=self.parse_one_page)
            t1_list.append(t1)
            t1.start()

        for i in range(2):  # two level-2 workers
            t2 = Thread(target=self.parse_two_page)
            t2_list.append(t2)
            t2.start()

        for t1 in t1_list:
            t1.join()

        for t2 in t2_list:
            t2.join()

        print('number:', self.number)  # how many detail pages were scraped

if __name__ == '__main__':
    start_time = time.time()
    spider = TencentSpider()
    spider.run()
    end_time = time.time()
    print('time:%.2f' % (end_time - start_time))   # elapsed time
