import requests
import bs4
import threading
import time
import queue  # thread-safe queue shared by the worker threads

start = time.time()
class myThread(threading.Thread):
    """Worker thread that keeps crawling until the shared queue is drained."""

    def __init__(self, q):
        threading.Thread.__init__(self)
        self.q = q

    def run(self):
        # q.empty() can race with the other workers, so crawler() itself
        # tolerates a queue that empties between this check and the get
        while not self.q.empty():
            crawler(self.q)
base_url = ''  # root of the target site; fill in before running
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36',
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
}
def crawler(q):
    try:
        structure, url = q.get(timeout=2)
    except queue.Empty:
        # Another worker drained the queue first; nothing left to do
        return
    try:
        res = requests.get(base_url + url, headers=headers)
        soup = bs4.BeautifulSoup(res.content.decode('gb18030'), 'lxml')
        # TODO: process the parsed page here
    except Exception as e:
        print('failed to fetch %s: %s' % (url, e))
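# Illustrative only: a sketch of what the TODO above might do with the parsed
# page. The tag and attribute choices are assumptions for demonstration, not
# part of the original code.
def parse_page(soup):
    title = soup.title.string if soup.title else ''
    # Collect every hyperlink found on the page
    links = [a.get('href') for a in soup.find_all('a') if a.get('href')]
    return title, links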
def generate(filename):
    """Read tasks from a file: each line is "<section label> <url>",
    returned as a list of (label, url) tuples."""
    link_list = []
    with open(filename, 'r', encoding='utf8') as f1:
        for line in f1:
            parts = line.split()
            if not parts:
                continue  # skip blank lines
            link_list.append((' '.join(parts[:-1]), parts[-1]))
    return link_list
def mp(link_list, num):
    '''
    :link_list: list of (label, url) tasks
    :num: number of worker threads
    '''
    # Queue shared by all workers; queue.Queue is already thread-safe,
    # so no extra lock is needed while filling it
    workQueue = queue.Queue(len(link_list))
    for word in link_list:
        workQueue.put(word)
    # Spawn the worker threads
    threads = []
    for i in range(num):
        thread = myThread(workQueue)
        thread.start()
        threads.append(thread)
    # Wait for every worker to finish; each one exits on its own
    # once the queue is drained
    for t in threads:
        t.join()
    print('***********', time.time() - start)
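# A minimal driver sketch, assuming a task file named 'links.txt' (a
# hypothetical name) in the format generate() expects, and an arbitrary
# example pool of 8 worker threads.
if __name__ == '__main__':
    tasks = generate('links.txt')
    mp(tasks, 8)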