A Multi-threaded Concurrent Web Crawler

import requests
import bs4
import queue
import threading
import time


start = time.time()


class myThread(threading.Thread):
    """Worker thread: keeps pulling tasks off the shared queue until it is drained."""
    def __init__(self, q):
        threading.Thread.__init__(self)
        self.q = q

    def run(self):
        # exit once the shared task queue is empty
        while not self.q.empty():
            crawler(self.q)


base_url = ''
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
    "Connection": "keep-alive",
    "Pragma": "no-cache",
}
def crawler(q):
    """Fetch and parse a single page taken from the task queue."""
    try:
        structure, url = q.get(timeout=2)
    except queue.Empty:
        # another worker drained the queue first
        return
    try:
        res = requests.get(base_url + url, headers=headers)
        soup = bs4.BeautifulSoup(res.content.decode('gb18030'), 'lxml')
        # TODO: process the parsed page
    except Exception:
        pass

def generate(filename):
    """Read the task file: each line holds whitespace-separated structure fields
    followed by a URL as the last field."""
    link_list = []
    with open(filename, 'r', encoding='utf8') as f1:
        for line in f1:
            fields = line.split()
            link_list.append((' '.join(fields[:-1]), fields[-1]))
    return link_list

def mp(link_list, num):
    '''
        :link_list: list of (structure, url) tasks
        :num: number of worker threads
    '''
    # thread-safe task queue shared by all workers
    workQueue = queue.Queue(len(link_list))
    # fill the queue before any worker starts, so no extra locking is needed
    for word in link_list:
        workQueue.put(word)

    # spawn the worker threads
    threads = []
    for i in range(num):
        thread = myThread(workQueue)
        thread.start()
        threads.append(thread)

    # wait for all threads to finish; each exits once the queue is drained
    for t in threads:
        t.join()
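
# The listing above defines generate() and mp() but never invokes them, so the
# timing print below has nothing to measure on its own. What follows is a minimal
# driver sketch: the task-file name 'links.txt' and the thread count of 8 are
# assumptions added for illustration, not part of the original post.
link_list = generate('links.txt')   # each line: structure fields followed by a URL
mp(link_list, 8)                    # crawl with 8 worker threads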
print('***********', time.time() - start)   # total elapsed time for the whole run

 
