import requests
import bs4
import threading
import time
import queue  # thread-safe queue shared by the worker threads

start = time.time()
class myThread(threading.Thread):
    """Worker thread that keeps crawling until the shared queue is drained."""

    def __init__(self, q):
        threading.Thread.__init__(self)
        self.q = q

    def run(self):
        # q.empty() can race with the other workers, so crawler() itself
        # tolerates a queue that empties between this check and the get
        while not self.q.empty():
            crawler(self.q)
base_url = ''  # root of the target site; fill in before running
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
}
def crawler(q):
    try:
        structure, url = q.get(timeout=2)
    except queue.Empty:
        # Another worker drained the queue first; nothing left to do
        return
    try:
        res = requests.get(base_url + url, headers=headers)
        soup = bs4.BeautifulSoup(res.content.decode('gb18030'), 'lxml')
        # TODO: process the parsed page here
    except Exception as e:
        print('failed to fetch %s: %s' % (url, e))
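# Illustrative only: a sketch of what the TODO above might do with the parsed
# page. The tag and attribute choices are assumptions for demonstration, not
# part of the original code.
def parse_page(soup):
    title = soup.title.string if soup.title else ''
    # Collect every hyperlink found on the page
    links = [a.get('href') for a in soup.find_all('a') if a.get('href')]
    return title, links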
def generate(filename):
    """Read tasks from a file: each line is "<section label> <url>",
    returned as a list of (label, url) tuples."""
    link_list = []
    with open(filename, 'r', encoding='utf8') as f1:
        for line in f1:
            parts = line.split()
            if not parts:
                continue  # skip blank lines
            link_list.append((' '.join(parts[:-1]), parts[-1]))
    return link_list
def mp(link_list, num):
    '''
    :link_list: list of (label, url) tasks
    :num: number of worker threads
    '''
    # Queue shared by all workers; queue.Queue is already thread-safe,
    # so no extra lock is needed while filling it
    workQueue = queue.Queue(len(link_list))
    for word in link_list:
        workQueue.put(word)
    # Spawn the worker threads
    threads = []
    for i in range(num):
        thread = myThread(workQueue)
        thread.start()
        threads.append(thread)
    # Wait for every worker to finish; each one exits on its own
    # once the queue is drained
    for t in threads:
        t.join()
    print('***********', time.time() - start)
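# A minimal driver sketch, assuming a task file named 'links.txt' (a
# hypothetical name) in the format generate() expects, and an arbitrary
# example pool of 8 worker threads.
if __name__ == '__main__':
    tasks = generate('links.txt')
    mp(tasks, 8)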