# Import third-party libraries
from bs4 import BeautifulSoup
from spider.extra import utils
from multiprocessing import Process,Value,JoinableQueue,Manager,Pool
import threading
import argparse
from fake_useragent import UserAgent
import time,random,requests
# Define the spider worker process
class SpiderWork(Process):
def __init__(self, q, res_q, complete_num, begin_time, proxy_list, proxytime):
    """Worker process that pulls URLs from *q* and reports via *res_q*.

    Parameters (shared state handed in by the spawning process):
        q            -- JoinableQueue of URLs to crawl
        res_q        -- queue that receives crawl results
        complete_num -- shared counter of completed tasks
                        (presumably a multiprocessing.Value; TODO confirm
                        against the spawning code)
        begin_time   -- timestamp of when the whole crawl started
        proxy_list   -- shared list of proxies for outgoing requests
        proxytime    -- proxy-related time threshold; exact semantics live
                        in crawl_url (not fully visible here -- verify)
    """
    super().__init__()
    # Task/result plumbing shared with the parent process.
    self.q = q
    self.res_q = res_q
    # Shared crawl bookkeeping.
    self.complete_num = complete_num
    self.proxy_list = proxy_list
    self.proxytime = proxytime
    # begin_time is the crawl-wide start; init_time is this worker's own
    # construction time, compared against proxytime in crawl_url.
    self.begin_time = begin_time
    self.init_time = time.time()
def run(self):
    """Worker loop: consume URLs from the task queue forever.

    Each dequeued URL is handed to crawl_url.  The task is acknowledged
    with task_done() even when crawling raises, so a parent blocked in
    q.join() can never deadlock on a failed URL.
    """
    while True:
        url = self.q.get()
        try:
            self.crawl_url(url)
        finally:
            # Original code skipped task_done() when crawl_url raised,
            # leaving the queue's unfinished-task count permanently high
            # and hanging any q.join() in the parent.
            self.q.task_done()
def crawl_url(self,url):
if time.time() - self.init_time > self.pr