多进程爬虫


import pymongo
import requests
import time
from multiprocessing import Process, JoinableQueue as Queue

'''
可与多线程爬虫对比着看:https://blog.csdn.net/world_in_world/article/details/130035166
'''


class TengXun:
    """Multi-process crawler that stores a Tencent Video channel listing in MongoDB.

    The work is organised as a pipeline of processes connected by three
    ``JoinableQueue`` objects::

        payload_data -> payload_queue -> get_data (x3) -> json_queue
                     -> parse_data -> content_queue -> save_data

    Every consumer pairs each ``get()`` with exactly one ``task_done()``,
    even when handling the item fails, so that the ``join()`` calls in
    :meth:`main` can always complete.
    """

    # The Mongo client is a class attribute on purpose: the original author
    # notes that creating it in __init__ fails with
    # "TypeError: cannot pickle '_thread.lock' object" once the instance is
    # handed to a Process, whereas class attributes are not part of the
    # pickled instance state.
    # NOTE(review): this client is created before the worker processes are
    # started; PyMongo documents MongoClient as not fork-safe, so confirm the
    # behaviour under the "fork" start method.
    db = pymongo.MongoClient(host='localhost', port=27017)
    collection = db.python.tengxun

    # Seconds to wait for the HTTP endpoint before giving up on a request.
    # Without a timeout a stalled connection would block a worker forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        """Set up the request target, the headers and the three pipeline queues."""
        self.url = 'https://pbaccess.video.qq.com/trpc.vector_layout.page_view.PageService/getPage?video_appid=3000010'
        self.headers = {
            'user-agent': ''
        }
        self.payload_queue = Queue()  # request bodies waiting to be sent
        self.json_queue = Queue()     # decoded JSON responses waiting to be parsed
        self.content_queue = Queue()  # extracted items waiting to be stored

    def payload_data(self):
        """Producer: enqueue one request body per page index 1..9."""
        for i in range(1, 10):
            data = {
                "page_context": {
                    "page_index": f"{i}"
                },
                "page_params": {
                    "page_id": "channel_list_second_page",
                    "page_type": "operation",
                    "channel_id": "100119",
                    "filter_params": "sort=75",
                    "page": f"{i}"
                },
                "page_bypass_params": {
                    "params": {
                        "page_id": "channel_list_second_page",
                        "page_type": "operation",
                        "channel_id": "100119",
                        "filter_params": "sort=75",
                        "page": f"{i}",
                        "caller_id": "3000010",
                        "platform_id": "2",
                        "data_mode": "default",
                        "user_mode": "default"
                    },
                    "scene": "operation",
                    "abtest_bypass_id": "2d0504a2a3250904"
                }
            }
            self.payload_queue.put(data)

    def get_data(self):
        """Worker loop: POST each queued payload and enqueue the decoded JSON.

        Runs forever; it is meant to be the target of a daemon process.
        A failed request is reported and skipped instead of killing the
        worker.
        """
        while True:
            data = self.payload_queue.get()
            try:
                response = requests.post(
                    self.url,
                    headers=self.headers,
                    json=data,
                    timeout=self.REQUEST_TIMEOUT,
                )
                # The result is enqueued *before* task_done() below, so by
                # the time payload_queue.join() returns every response is
                # already counted on json_queue.
                self.json_queue.put(response.json())
            except (requests.RequestException, ValueError) as exc:
                # ValueError covers a non-JSON body on older requests versions.
                print(f'request failed, payload skipped: {exc!r}')
            finally:
                # Always acknowledge the item; otherwise payload_queue.join()
                # in main() would block forever after a single failure.
                self.payload_queue.task_done()

    def parse_data(self):
        """Worker loop: extract one item per card from each queued response.

        Runs forever; it is meant to be the target of a daemon process.
        A response that does not have the expected structure is reported and
        skipped.
        """
        while True:
            result = self.json_queue.get()
            try:
                cards = result["data"]["CardList"][0]["children_list"]["list"]["cards"]
                for one in cards:
                    params = one['params']
                    item = {
                        'second_title': params['second_title'],
                        # '空' is the placeholder stored when the field is absent.
                        'series_name': params.get('series_name', '空'),
                        'timelong': params.get('timelong', '空'),
                        'title': params['title'],
                    }
                    self.content_queue.put(item)
            except (KeyError, IndexError, TypeError) as exc:
                print(f'unexpected response structure, response skipped: {exc!r}')
            finally:
                # Acknowledge even on failure so json_queue.join() can return.
                self.json_queue.task_done()

    def save_data(self):
        """Worker loop: insert each queued item into the Mongo collection.

        Runs forever; it is meant to be the target of a daemon process.
        """
        while True:
            item = self.content_queue.get()
            try:
                TengXun.collection.insert_one(item)
                print('插入数据成功')
            except pymongo.errors.PyMongoError as exc:
                print(f'insert failed, item skipped: {exc!r}')
            finally:
                # Acknowledge even on failure so content_queue.join() can return.
                self.content_queue.task_done()

    def main(self):
        """Start the pipeline, wait until every item is processed, then close Mongo.

        One producer, three fetchers, one parser and one saver are started as
        daemon processes, so the endless worker loops are terminated
        automatically when the main process exits.
        """
        producer = Process(target=self.payload_data)
        workers = [Process(target=self.get_data) for _ in range(3)]
        workers.append(Process(target=self.parse_data))
        workers.append(Process(target=self.save_data))

        for p in [producer, *workers]:
            p.daemon = True
            p.start()

        # Wait for the producer to finish instead of sleeping for a fixed
        # time: once it has exited, every payload has been counted on
        # payload_queue, so the joins below cannot return early no matter how
        # slowly the processes started.
        producer.join()

        # Join in pipeline order. Each stage enqueues its output before
        # acknowledging its input, so when one queue is drained the next one
        # already accounts for all work derived from it.
        for q in (self.payload_queue, self.json_queue, self.content_queue):
            q.join()
        TengXun.db.close()

if __name__ == '__main__':
    # Script entry point: build the crawler and run the whole pipeline once.
    # The guard keeps child processes that re-import this module from
    # starting a crawl of their own.
    TengXun().main()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值