import requests
import threading
import pymongo
from queue import Queue
import redis
import hashlib
class AiQiYiSpider:
    """Multi-threaded spider for iQiyi's paged recommend-list API.

    Pipeline (producer/consumer, one Queue per stage):
        getUrl -> urlQueue -> getInfo (x5) -> jsonQueue -> parseInfo
        -> saveQueue -> saveInfo (Redis-set dedup, then MongoDB insert).

    main() blocks on Queue.join() for every stage so daemon worker
    threads finish their work before the process exits.
    """

    def __init__(self):
        # Paged list endpoint; '{}' is filled with the page number.
        self.url = 'https://pcw-api.iqiyi.com/search/recommend/list?channel_id=2&data_type=1&mode=11&page_id={}&ret_num=48&session=c42d9802a7254fba165c4034687ec5cf&three_category_id=15;must'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Referer': 'https://list.iqiyi.com/www/2/15-------------11-1-1-iqiyi--.html?s_source=PCW_SC'
        }
        # NOTE(review): assumes MongoDB and Redis are reachable on localhost
        # with default ports — confirm for the deployment environment.
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        self.connection = self.client['py_spider']['Thread_AiQiYi']
        self.redisCli = redis.Redis()
        self.urlQueue = Queue()   # URLs waiting to be fetched
        self.jsonQueue = Queue()  # decoded JSON responses waiting to be parsed
        self.saveQueue = Queue()  # parsed items waiting to be deduped/stored

    @staticmethod
    def getMd5(item):
        """Return the hex MD5 digest of str(item); used as the Redis dedup key."""
        return hashlib.md5(str(item).encode('utf-8')).hexdigest()

    def getUrl(self, pages):
        """Enqueue list-page URLs for pages 1..pages inclusive.

        Bug fix: the original ``range(1, pages)`` silently dropped the
        last page, so asking for 10 pages fetched only 9.
        """
        for page in range(1, pages + 1):
            self.urlQueue.put(self.url.format(page))

    def getInfo(self):
        """Worker: fetch each queued URL and push the decoded JSON downstream.

        Each URL is marked done in a ``finally`` block — without that, one
        failed request would kill the thread before ``task_done()`` and
        ``urlQueue.join()`` in main() would block forever.
        """
        fetched = 0
        while True:
            url = self.urlQueue.get()
            try:
                # timeout added: a hung connection must not stall the pipeline.
                response = requests.get(url, headers=self.headers, timeout=10).json()
                self.jsonQueue.put(response)
                fetched += 1
                print("已经抓取完成{}页\n".format(fetched))
            except Exception as e:
                # Best-effort: log and drop the page rather than deadlock.
                print("抓取失败\n", e)
            finally:
                self.urlQueue.task_done()

    def parseInfo(self):
        """Worker: extract title/playUrl/description from each JSON response."""
        parsed = 0
        while True:
            response = self.jsonQueue.get()
            try:
                for data in response['data']['list']:
                    item = dict()
                    item['title'] = data['title']
                    item['playUrl'] = data['playUrl']
                    item['description'] = data['description']
                    self.saveQueue.put(item)
                parsed += 1
                print("已经解析完成{}页\n".format(parsed))
            except Exception as e:
                # Unexpected payload shape: log and still mark the task done.
                print("解析失败\n", e)
            finally:
                self.jsonQueue.task_done()

    def saveInfo(self):
        """Worker: skip items already seen (Redis set), insert the rest into MongoDB.

        The MD5 is computed BEFORE insert_one because pymongo mutates the
        item in place (adds ``_id``), which would change the digest.
        """
        while True:
            item = self.saveQueue.get()
            try:
                md5Info = self.getMd5(item)
                # sadd returns 1 only when the digest was not in the set yet.
                if self.redisCli.sadd('movie:filter', md5Info):
                    try:
                        self.connection.insert_one(item)
                        print("插入成功\n", item)
                    except Exception as e:
                        print("插入失败\n", e)
                else:
                    print("数据重复!")
            finally:
                self.saveQueue.task_done()

    def main(self):
        """Fill the URL queue, start the worker threads, and wait for all queues to drain."""
        # Produce all URLs up front, in the main thread. The original ran
        # getUrl in a daemon thread, so urlQueue.join() below could return
        # before a single URL was enqueued and the process exited early.
        self.getUrl(10)
        threads = []
        # Five fetcher threads to overlap network I/O.
        for _ in range(5):
            threads.append(threading.Thread(target=self.getInfo))
        threads.append(threading.Thread(target=self.parseInfo))
        threads.append(threading.Thread(target=self.saveInfo))
        for thread in threads:
            # Daemon threads let the process exit once the queues drain;
            # the join() calls below are the real completion barrier.
            thread.daemon = True
            thread.start()
        # Block until every stage has called task_done() for every item.
        for q in (self.urlQueue, self.jsonQueue, self.saveQueue):
            q.join()
        print("主线程退出")
if __name__ == '__main__':
    # Entry point: build the spider and run the threaded crawl pipeline.
    crawler = AiQiYiSpider()
    crawler.main()
# Multi-threaded scraper for iQiyi video metadata.
# First published 2023-12-10 17:16:18.