import pymongo
import requests
import time
from multiprocessing import Process, JoinableQueue as Queue
'''
Compare with the multithreaded-crawler version of this scraper:
https://blog.csdn.net/world_in_world/article/details/130035166
'''
class TengXun:
    """Multi-process crawler for Tencent Video channel listings.

    Work flows through three JoinableQueues:
    payload_queue (request bodies) -> json_queue (raw API responses)
    -> content_queue (parsed items) -> MongoDB.
    """

    # Defined as class attributes because pickling an instance that holds the
    # client (i.e. creating it in __init__) raises
    # "TypeError: cannot pickle '_thread.lock' object" when Process starts.
    # NOTE(review): the client is created before the worker processes fork;
    # pymongo documents MongoClient as not fork-safe — consider constructing
    # the client inside save_data() (which runs in its own process) instead.
    db = pymongo.MongoClient(host='localhost', port=27017)
    collection = db.python.tengxun

    def __init__(self):
        self.url = 'https://pbaccess.video.qq.com/trpc.vector_layout.page_view.PageService/getPage?video_appid=3000010'
        self.headers = {
            # TODO: supply a real browser User-Agent; an empty value may be
            # rejected or throttled by the server.
            'user-agent': ''
        }
        # Pipeline queues; JoinableQueue so main() can join() until drained.
        self.payload_queue = Queue()
        self.json_queue = Queue()
        self.content_queue = Queue()

    def payload_data(self):
        """Producer: enqueue one POST payload per result page (pages 1..9)."""
        for i in range(1, 10):
            data = {
                "page_context": {
                    "page_index": f"{i}"
                },
                "page_params": {
                    "page_id": "channel_list_second_page",
                    "page_type": "operation",
                    "channel_id": "100119",
                    "filter_params": "sort=75",
                    "page": f"{i}"
                },
                "page_bypass_params": {
                    "params": {
                        "page_id": "channel_list_second_page",
                        "page_type": "operation",
                        "channel_id": "100119",
                        "filter_params": "sort=75",
                        "page": f"{i}",
                        "caller_id": "3000010",
                        "platform_id": "2",
                        "data_mode": "default",
                        "user_mode": "default"
                    },
                    "scene": "operation",
                    "abtest_bypass_id": "2d0504a2a3250904"
                }
            }
            self.payload_queue.put(data)

    def get_data(self):
        """Worker: POST each payload, enqueue the decoded JSON response."""
        while True:
            data = self.payload_queue.get()
            # timeout so a stalled connection cannot hang this worker forever
            response = requests.post(self.url, headers=self.headers,
                                     json=data, timeout=10)
            # fail loudly on HTTP errors instead of feeding an error page
            # into the JSON decoder
            response.raise_for_status()
            self.json_queue.put(response.json())
            self.payload_queue.task_done()

    def parse_data(self):
        """Worker: extract the fields of interest from each API response."""
        while True:
            result = self.json_queue.get()
            for one in result["data"]["CardList"][0]["children_list"]["list"]["cards"]:
                params = one['params']
                # Use .get() consistently: a missing field becomes '空'
                # instead of a KeyError that would kill this worker process.
                item = {
                    'second_title': params.get('second_title', '空'),
                    'series_name': params.get('series_name', '空'),
                    'timelong': params.get('timelong', '空'),
                    'title': params.get('title', '空'),
                }
                self.content_queue.put(item)
            self.json_queue.task_done()

    def save_data(self):
        """Worker: persist each parsed item to MongoDB."""
        while True:
            item = self.content_queue.get()
            TengXun.collection.insert_one(item)
            print('插入数据成功')
            self.content_queue.task_done()

    def main(self):
        """Start all daemon worker processes and wait for the queues to drain."""
        process_list = [Process(target=self.payload_data)]
        for _ in range(3):
            process_list.append(Process(target=self.get_data))
        process_list.append(Process(target=self.parse_data))
        process_list.append(Process(target=self.save_data))
        for p in process_list:
            p.daemon = True  # workers must die with the main process
            p.start()
        # Starting processes is slow (slower than threads); without this pause
        # the main process can reach the join()s while every queue is still
        # empty (count == 0) and exit before any work has been enqueued.
        time.sleep(7)
        for q in [self.payload_queue, self.json_queue, self.content_queue]:
            q.join()
        TengXun.db.close()
if __name__ == '__main__':
    # Script entry point: build the crawler and run the full pipeline.
    TengXun().main()
# Multi-process crawler
# First published 2023-04-08 21:49:49