[Crawler Tutorial] Multithreaded Crawlers (05)

Table of Contents

5.1 How It Works

5.1.1 Schematic Diagram

5.1.2 The Queue Object

5.2 Queue join() and Thread join()

5.3 Thread Pools

5.4 Custom Threads


5.1 How It Works

5.1.1 Schematic Diagram

5.1.2 The Queue Object

queue is a standard-library module in Python; you can import it directly with from queue import Queue. A queue is the most common way for threads to exchange data, and locking shared resources is an important part of working with threads. The Queue class is thread-safe (it does the locking internally), so whenever it fits your use case, a queue is the recommended choice. Its main operations are listed below; a short usage sketch follows the list.

  • q = Queue(3)  # create a queue; the argument is the maximum queue size
  • q.put(3, block=False)  # block defaults to True (blocking); with block=False, raises queue.Full if the queue is full
  • q.put(3, timeout=3)  # set a timeout; if the queue is still full after 3 seconds, raises queue.Full
  • q.get(timeout=1)  # take an item with a 1-second timeout; if the queue is still empty after 1 second, raises queue.Empty
  • q.get(block=False)  # block defaults to True (blocking); with block=False, raises queue.Empty if the queue is empty
  • q.join()  # block until the number of unfinished tasks in q drops to 0
  • q.maxsize  # maximum capacity of the queue
  • q.qsize()  # number of items currently in the queue
  • q.get_nowait()  # equivalent to q.get(block=False)
  • q.full()  # whether the queue is full
  • q.empty()  # whether the queue is empty
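A minimal sketch that exercises these calls (standard library only; the queue size and the values put on the queue are illustrative):

from queue import Queue, Full, Empty

q = Queue(3)                 # a queue that holds at most 3 items
q.put(1)
q.put(2)
q.put(3)
print(q.full(), q.qsize())   # True 3

try:
    q.put(4, block=False)    # the queue is full, so this raises queue.Full
except Full:
    print('queue is full')

print(q.get_nowait())        # same as q.get(block=False) -> 1

q.get()
q.get()
try:
    q.get(timeout=1)         # still empty after 1 second, so this raises queue.Empty
except Empty:
    print('queue is empty')

print(q.empty(), q.maxsize)  # True 3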

5.2 Queue join() and Thread join()

The example below demonstrates both: Queue.join() blocks until every item taken from the queue has been marked done with task_done(), while Thread.join() waits for each worker thread itself to finish.

import threading
from queue import Queue, Empty

dataQueue = Queue(100)
exitFlag = False

class MyThread(threading.Thread):
    def __init__(self, q):
        super().__init__()
        self.queue = q

    def run(self):
        global exitFlag
        while True:
            if exitFlag:
                print('++++++++++++++++++++++++++exit')
                break
            try:
                # Non-blocking get: raises queue.Empty if there is nothing to take.
                print('------------------------', self.queue.get(False))
                self.queue.task_done()  # mark the item as processed
            except Empty:
                pass

def main():
    for i in range(100):
        dataQueue.put(i)

    threads = []
    for i in range(5):
        thread = MyThread(dataQueue)
        threads.append(thread)
        thread.start()

    # Queue join: block until every queued item has been taken out and marked done.
    # Without this call, exitFlag would be set immediately and the workers could
    # quit before the queue is drained.
    dataQueue.join()

    global exitFlag
    exitFlag = True
    print('exit ------------------------------------------------')

    # Thread join: wait for every worker thread to finish.
    for t in threads:
        t.join()

if __name__ == '__main__':
    main()

5.3 Thread Pools

concurrent.futures.ThreadPoolExecutor manages a fixed pool of worker threads. submit() schedules a callable and returns a Future object whose done(), cancel(), and result() methods let you inspect and control the task.

from concurrent.futures import ThreadPoolExecutor
import threading

num = 0                       # shared counter
num_lock = threading.Lock()   # protects num, since several workers update it

def task(url):
    global num
    with num_lock:            # avoid a race condition on the shared counter
        num += 1
        print(num, url)

def get_imgs_url(urls):
    executor = ThreadPoolExecutor(max_workers=2)
    for url in urls:
        task1 = executor.submit(task, url)  # submit(fn, *args): pass url directly, not wrapped in a tuple
        task1.done()            # True only if the task has already finished
        if not task1.cancel():  # cancel() succeeds only if the call has not started running yet
            task1.result()      # result() blocks until the task finishes and returns its value
    executor.shutdown(wait=True)

if __name__ == '__main__':
    urls = ['https://www.baidu.com/']  # an iterable of URLs, not a single string
    get_imgs_url(urls)
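A more typical usage pattern, shown here as a rough sketch (the fetch helper and the URL list are illustrative assumptions, not part of the original example), submits all tasks first and then collects results as they complete:

from concurrent.futures import ThreadPoolExecutor, as_completed
import requests

def fetch(url):
    # Hypothetical helper: download a page and report its size.
    resp = requests.get(url, timeout=5)
    return url, len(resp.text)

if __name__ == '__main__':
    urls = ['https://www.baidu.com/', 'https://www.dushu.com/']
    with ThreadPoolExecutor(max_workers=2) as executor:
        futures = [executor.submit(fetch, url) for url in urls]
        for future in as_completed(futures):  # yields each Future as soon as it finishes
            url, size = future.result()
            print(url, size)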

5.4 Custom Threads

Producer-consumer pattern: crawl threads produce page HTML into a queue, and parse threads consume it, extract book information, and write the results to a file. A generic sketch of the pattern is shown below, followed by the full crawler (novels.py and config.py).
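As a minimal sketch of the pattern (standard library only; the queue size, sentinel value, and thread count are assumptions for illustration), the producer puts work items on a queue and the consumers take them off until a sentinel tells them to stop:

import threading
from queue import Queue

q = Queue(10)

def producer():
    for i in range(10):
        q.put(i)              # produce work items

def consumer():
    while True:
        item = q.get()        # blocks until an item is available
        if item is None:      # sentinel value: no more work
            break
        print('consumed', item)
        q.task_done()         # mark the item as processed

threads = [threading.Thread(target=consumer) for _ in range(2)]
for t in threads:
    t.start()
producer()
q.join()                      # wait until every produced item is processed
for _ in threads:
    q.put(None)               # one sentinel per consumer so all of them exit
for t in threads:
    t.join()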

novels.py

import requests
import time
import threading
from queue import Empty
from bs4 import BeautifulSoup
from config import BASE_URL, QUEUE_TASK, DENTY_TIME, QUEUE_PARSE, HEADERS, EXIT_FLAG

class CrawlThread(threading.Thread):
    def __init__(self):
        super().__init__()
        self.q_task=QUEUE_TASK
        self.q_parse=QUEUE_PARSE
    def run(self) -> None:
        super().run()
        self.spider()


    def spider(self):
        while True:
            if self.q_task.empty():
                print('Crawl thread {} is exiting'.format(threading.current_thread().name))
                break
            taskId = self.q_task.get()  # claim a page number from the task queue
            response = requests.get(url=BASE_URL % (taskId), headers=HEADERS)  # fetch the page
            response.encoding = 'utf-8'  # decode the response as UTF-8
            html = response.text  # the page content
            self.q_parse.put((html, taskId))  # hand the page number and its HTML to the parse queue
            self.q_task.task_done()  # tell the task queue this item is done
            print('Crawl thread {} finished crawling page {}'.format(threading.current_thread().name, taskId))

def crawl():
    for i in range(1, 11):
        QUEUE_TASK.put(i)
    for i in range(2):  # start two crawl threads
        CrawlThread().start()
    time.sleep(DENTY_TIME)

class ParseThread(threading.Thread):
    def __init__(self,lock,fp):
        super().__init__()
        self.lock=lock
        self.fp=fp
        self.q_parse=QUEUE_PARSE
    def run(self) -> None:
        super().run()
        self.parse()


    def parse(self):
        while True:
            if EXIT_FLAG:
                print('Parse thread {} is exiting'.format(threading.current_thread().name))
                break
            try:
                html, taskId = self.q_parse.get(block=False)
            except Empty:
                continue  # nothing to parse yet; check the exit flag again
            soup = BeautifulSoup(html, 'lxml')
            books = soup.select('div[class="bookslist"] > ul > li')
            print('------------{}-----------'.format(len(books)))
            for book in books:
                with self.lock:  # serialize writes to the shared file handle
                    book_url = book.find('img').attrs['src']
                    book_title = book.select('h3 a')[0]['title']
                    book_author = book.select('p')[0].get_text()
                    book_describe = book.select('p')[1].get_text()
                    self.fp.write('%s\t%s\t%s\t%s\n' % (book_url, book_title, book_author, book_describe))
            self.q_parse.task_done()
            print('Parse thread {} finished parsing page {}'.format(threading.current_thread().name, taskId))

def parse(fp):
    lock= threading.Lock()
    for i in range(2):
        ParseThread(lock,fp).start()

if __name__ == '__main__':
    crawl()
    fp = open('./book.txt', 'a', encoding='utf-8')
    parse(fp)
    QUEUE_TASK.join()   # wait until every page has been crawled
    QUEUE_PARSE.join()  # wait until every page has been parsed
    EXIT_FLAG = True    # signal the parse threads to exit before closing the file
    fp.close()
    print('Finished')

config.py

from queue import Queue
HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
    "Accept-Encoding": "gzip, deflate, br",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Cookie": "Hm_lvt_8008bbd51b8bc504162e1a61c3741a9d=1572674620; Hm_lpvt_8008bbd51b8bc504162e1a61c3741a9d=1572674667",
    "Host": "www.dushu.com",
    "Pragma": "no-cache",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Mobile Safari/537.36",
}
EXIT_FLAG=False
BASE_URL='https://www.dushu.com/book/1175_%d.html'
NUMBER=10
QUEUE_TASK=Queue(NUMBER)
QUEUE_PARSE=Queue(NUMBER)
DENTY_TIME=1

 
