Multi-threaded crawling of multi-level pages in Python

The approach

✅ Create multiple queues, one per page level, each holding that level's URLs

✅ Pull URLs from each queue and dispatch them to the matching parse function to extract the data

✅ For level-2 queues and deeper, pass the timeout parameter when getting a URL, since those queues may still be filling while consumers read them (see the sketch after this list)
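Here is a minimal sketch of that pattern before we touch the real site (the names list_q/detail_q and the worker functions are illustrative, not part of the final spider):

from queue import Queue, Empty

list_q = Queue()    # level 1: list-page URLs
detail_q = Queue()  # level 2: detail-page URLs

def list_worker():
    # level-1 consumer: this queue is fully loaded up front,
    # so an empty queue really does mean we are done
    while not list_q.empty():
        url = list_q.get()
        # ...request url, extract detail links, then detail_q.put(...)

def detail_worker():
    # level-2 consumer: level-1 workers may still be producing,
    # so wait up to a second before concluding the queue is drained
    while True:
        try:
            url = detail_q.get(timeout=1)
        except Empty:
            break
        # ...request url and parse the record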

Test site: the job search page on Tencent Careers (careers.tencent.com)

The listings are loaded dynamically, so capture the traffic in the browser's network panel and locate the request that returns the current page's data.

Then open a job detail page and analyze its request the same way.

The content sits inside the Data field of the JSON response.

The fields we want to scrape are the job name, category, location, responsibilities, requirements, and last update time.
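To double-check what the capture shows, you can hit the Query endpoint directly and inspect the JSON (a quick probe; I'm assuming the endpoint tolerates the omitted timestamp and filter parameters):

import requests
from fake_useragent import UserAgent

url = ('https://careers.tencent.com/tencentcareer/api/post/Query'
       '?keyword=python&pageIndex=1&pageSize=10&language=zh-cn&area=cn')
data = requests.get(url, headers={'User-Agent': UserAgent().random}).json()['Data']
print(data['Count'])               # total number of matching posts
print(data['Posts'][0]['PostId'])  # the PostId that feeds the detail API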

Importing the libraries

import time
import requests
from threading import Thread, Lock
from queue import Queue
from fake_useragent import UserAgent
from urllib import parse

Initialization and the request helper

class TencentSpider:
    def __init__(self):
        self.one_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1644370587575&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword={}&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
        self.two_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1644370614815&postId={}&language=zh-cn'
        # two queues, one per page level
        self.one_q = Queue()
        self.two_q = Queue()
        # two locks, one per queue
        self.lock1 = Lock()
        self.lock2 = Lock()
        # counter for the number of detail pages scraped
        self.number = 0
    def get_html(self, url):
        """Helper: fetch a URL and return the parsed JSON"""
        headers = {'User-Agent': UserAgent().random}
        html = requests.get(url=url, headers=headers).json()

        return html
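Since every worker thread funnels through this helper, it may be worth passing requests' timeout argument so a single hung connection can't stall a worker forever; a small variant (the 5-second value is my assumption, not from the original):

    def get_html(self, url):
        """Helper: fetch a URL and return the parsed JSON"""
        headers = {'User-Agent': UserAgent().random}
        # timeout=5 raises requests.exceptions.Timeout instead of hanging
        return requests.get(url=url, headers=headers, timeout=5).json()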

Enqueuing the list-page URLs and getting the total page count

    def url_in(self):
        """Put the level-1 (list page) URLs into the queue"""
        keyword = input('Enter a job category: ')
        keyword = parse.quote(keyword)
        total = self.get_total(keyword)
        for page in range(1, total + 1):
            url = self.one_url.format(keyword, page)
            self.one_q.put(url)

    def get_total(self, keyword):
        """Get the total number of list pages for a category"""
        url = self.one_url.format(keyword, 1)
        html = self.get_html(url=url)
        count = html['Data']['Count']
        total = count // 10 if count % 10 == 0 else count // 10 + 1

        return total
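A quick sanity check on the ceiling division: with pageSize=10, Count=146 gives 146 // 10 + 1 = 15 pages, while Count=140 gives exactly 140 // 10 = 14. math.ceil expresses the same thing in one call:

import math

for count in (140, 146):
    total = count // 10 if count % 10 == 0 else count // 10 + 1
    assert total == math.ceil(count / 10)
    print(count, '->', total)   # 140 -> 14, 146 -> 15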

Parsing the list pages

    def parse_one_page(self):
        """一级页面解析函数:提取postid,并拼接二级页面url地址,入队列"""
        while True:
            self.lock1.acquire()
            if not self.one_q.empty():
                one_url = self.one_q.get()
                self.lock1.release()
                one_html = self.get_html(url=one_url)
                # each list page carries up to 10 PostIds
                for one_job in one_html['Data']['Posts']:
                    post_id = one_job['PostId']
                    job_url = self.two_url.format(post_id)
                    # hand the detail-page URL to the level-2 queue
                    self.two_q.put(job_url)
            else:
                self.lock1.release()
                break
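The acquire/release pairing on both branches is easy to get wrong (forgetting one release deadlocks every thread). The same check-and-get can be written with a with block, which releases the lock on every exit path; an equivalent sketch, not the original code:

    def parse_one_page(self):
        while True:
            with self.lock1:            # released automatically, even on break
                if self.one_q.empty():
                    break
                one_url = self.one_q.get()
            one_html = self.get_html(url=one_url)
            for one_job in one_html['Data']['Posts']:
                self.two_q.put(self.two_url.format(one_job['PostId']))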

Parsing the detail pages

    def parse_two_page(self):
        """Level-2 parser: extract the detailed job information"""
        while True:
            self.lock2.acquire()
            try:
                # the timeout matters for level-2 queues: the queue may be
                # momentarily empty while level-1 threads are still producing
                two_url = self.two_q.get(timeout=1)
                self.lock2.release()
            except Exception:
                self.lock2.release()
                break
            two_html = self.get_html(url=two_url)
            item = {}
            item['name'] = two_html['Data']['RecruitPostName']
            item['type'] = two_html['Data']['CategoryName']
            item['address'] = two_html['Data']['LocationName']
            item['duty'] = two_html['Data']['Responsibility']
            item['require'] = two_html['Data']['Requirement']
            item['time'] = two_html['Data']['LastUpdataTime']
            print(item)
            self.lock2.acquire()
            self.number += 1
            self.lock2.release()
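The timeout here is exactly what the third bullet at the top promised: a level-2 consumer can find its queue momentarily empty while level-1 threads are still producing, so it waits up to a second before giving up. Catching queue.Empty rather than a bare Exception makes that intent explicit; a standalone illustration:

from queue import Queue, Empty

def drain(q: Queue):
    """Consume items, tolerating slow producers for up to 1 second."""
    while True:
        try:
            item = q.get(timeout=1)   # raises Empty after 1s of waiting
        except Empty:
            break                     # producers are done and the queue is drained
        print(item)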

Creating and running the threads

    def run(self):
        # enqueue the list-page URLs first
        self.url_in()
        # then spin up the worker threads
        t1_list = []
        t2_list = []
        for i in range(2):  # two level-1 workers
            t1 = Thread(target=self.parse_one_page)
            t1_list.append(t1)
            t1.start()

        for i in range(2):  # two level-2 workers
            t2 = Thread(target=self.parse_two_page)
            t2_list.append(t2)
            t2.start()

        for t1 in t1_list:
            t1.join()

        for t2 in t2_list:
            t2.join()

        print('number:', self.number)  # how many detail pages were scraped
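As an aside, the same start/join choreography can be expressed with concurrent.futures, which does the bookkeeping for you; an equivalent sketch of run(), not the author's code:

from concurrent.futures import ThreadPoolExecutor

def run(self):  # drop-in alternative to the method above
    self.url_in()
    with ThreadPoolExecutor(max_workers=4) as pool:
        for _ in range(2):
            pool.submit(self.parse_one_page)
        for _ in range(2):
            pool.submit(self.parse_two_page)
    # the with-block exits only after all four workers return
    print('number:', self.number)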

Entry point


if __name__ == '__main__':
    start_time = time.time()
    spider = TencentSpider()
    spider.run()
    end_time = time.time()
    print('time:%.2f' % (end_time - start_time))   # elapsed time
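One small aside: time.perf_counter() is the clock recommended for measuring elapsed intervals like this, since time.time() can jump if the system clock is adjusted mid-run:

import time

start = time.perf_counter()
TencentSpider().run()
print('time:%.2f' % (time.perf_counter() - start))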

The thread worker template

For reference, this is the generic single-queue worker the spider is built from; note it assumes a single self.q and self.lock rather than the per-level pairs used above.

    def parse_html(self):
        """Thread worker: take a URL, request it, parse and handle the data"""
        while True:
            # acquire the lock before touching the queue
            self.lock.acquire()
            if not self.q.empty():
                url = self.q.get()
                # release it as soon as the URL is out
                self.lock.release()
                headers = {'User-Agent': UserAgent().random}
                html = requests.get(url=url, headers=headers)
                # ...parse html and handle the data here
            else:
                self.lock.release()
                break
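To see this template run end to end, here is a self-contained toy version with placeholder URLs (example.com is a stand-in; nothing is actually fetched):

from queue import Queue
from threading import Thread, Lock

q = Queue()
lock = Lock()
for n in range(5):
    q.put('https://example.com/page/%d' % n)   # placeholder URLs

def worker():
    while True:
        with lock:
            if q.empty():
                break
            url = q.get()
        print('would fetch', url)

threads = [Thread(target=worker) for _ in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()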

Full code

import time
import requests
from threading import Thread, Lock
from queue import Queue
from fake_useragent import UserAgent
from urllib import parse
class TencentSpider:
    def __init__(self):
        self.one_url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1644370587575&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword={}&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
        self.two_url = 'https://careers.tencent.com/tencentcareer/api/post/ByPostId?timestamp=1644370614815&postId={}&language=zh-cn'
        # two queues, one per page level
        self.one_q = Queue()
        self.two_q = Queue()
        # two locks, one per queue
        self.lock1 = Lock()
        self.lock2 = Lock()
        # counter for the number of detail pages scraped
        self.number = 0
    def get_html(self, url):
        """功能函数1:获取响应内容"""
        headers = {'User-Agent':UserAgent().random}
        html = requests.get(url=url, headers=headers).json()

        return html

    def url_in(self):
        """Put the level-1 (list page) URLs into the queue"""
        keyword = input('Enter a job category: ')
        keyword = parse.quote(keyword)
        total = self.get_total(keyword)
        for page in range(1, total + 1):
            url = self.one_url.format(keyword, page)
            self.one_q.put(url)

    def get_total(self, keyword):
        """Get the total number of list pages for a category"""
        url = self.one_url.format(keyword, 1)
        html = self.get_html(url=url)
        count = html['Data']['Count']
        total = count // 10 if count % 10 == 0 else count // 10 + 1

        return total

    def parse_one_page(self):
        """一级页面解析函数:提取postid,并拼接二级页面url地址,入队列"""
        while True:
            self.lock1.acquire()
            if not self.one_q.empty():
                one_url = self.one_q.get()
                self.lock1.release()
                one_html = self.get_html(url=one_url)
                # each list page carries up to 10 PostIds
                for one_job in one_html['Data']['Posts']:
                    post_id = one_job['PostId']
                    job_url = self.two_url.format(post_id)
                    # hand the detail-page URL to the level-2 queue
                    self.two_q.put(job_url)
            else:
                self.lock1.release()
                break



    def parse_two_page(self):
        """Level-2 parser: extract the detailed job information"""
        while True:
            self.lock2.acquire()
            try:
                # the timeout matters for level-2 queues: the queue may be
                # momentarily empty while level-1 threads are still producing
                two_url = self.two_q.get(timeout=1)
                self.lock2.release()
            except Exception:
                self.lock2.release()
                break
            two_html = self.get_html(url=two_url)
            item = {}
            item['name'] = two_html['Data']['RecruitPostName']
            item['type'] = two_html['Data']['CategoryName']
            item['address'] = two_html['Data']['LocationName']
            item['duty'] = two_html['Data']['Responsibility']
            item['require'] = two_html['Data']['Requirement']
            item['time'] = two_html['Data']['LastUpdataTime']
            print(item)
            self.lock2.acquire()
            self.number += 1
            self.lock2.release()

    def run(self):
        # enqueue the list-page URLs first
        self.url_in()
        # then spin up the worker threads
        t1_list = []
        t2_list = []
        for i in range(2):  # two level-1 workers
            t1 = Thread(target=self.parse_one_page)
            t1_list.append(t1)
            t1.start()

        for i in range(2):  # two level-2 workers
            t2 = Thread(target=self.parse_two_page)
            t2_list.append(t2)
            t2.start()

        for t1 in t1_list:
            t1.join()

        for t2 in t2_list:
            t2.join()

        print('number:', self.number)  # how many detail pages were scraped

if __name__ == '__main__':
    start_time = time.time()
    spider = TencentSpider()
    spider.run()
    end_time = time.time()
    print('time:%.2f' % (end_time - start_time))   # elapsed time
