目录
5.1工作原理
5.1.1原理示意图
5.1.2队列对象
queue是python中的标准库,可以直接from queue import Queue引用;队列是线程间最常用的交换数据的形式。对于资源,加锁是个重要的环节。Queue,是线程安全的,因此在满足使用条件下,建议使用队列
- q=Queue(3) # 创建队列,参数为队列的大小
- q.put(3,block=False) # block 默认为 True(阻塞);设为 False 时,如果队列已满则立即抛出 queue.Full 异常
- q.put(3,timeout=3) # 设置超时时间,如果在3秒之后队列还是满的,抛出满的异常
- q.get(timeout=1) # 从队列中取出数据,设置超时时间为1秒,如果一秒之后队列还是为空,那么抛出空的异常
- q.get(block=False) # 默认block为True阻塞状态,如果block设置为False,如果队列为空,那么抛出空的异常
- q.join() # 阻塞,直到队列q中的任务数降为0
- q.maxsize # 队列最大容量数
- q.qsize() # 队列中已存在的任务数
- q.get_nowait() # 非阻塞取数据,等价于 q.get(block=False),队列为空时抛出空的异常
- q.full() # 判断队列是否满了
- q.empty() # 判断队列是否为空
5.2队列锁和线程锁
import threading
from queue import Queue
dataQueue = Queue(100)
exitFlag = False
class MyThread(threading.Thread):
def __init__(self,q):
super().__init__()
self.queue = q
def run(self):
super().run()
global exitFlag
while True:
if exitFlag:
print('++++++++++++++++++++++++++exit')
break
try:
print('------------------------',self.queue.get(False))
self.queue.task_done()
except:
pass
def main():
for i in range(100):
dataQueue.put(i)
threads = []
for i in range(5):
thread = MyThread(dataQueue)
threads.append(thread)
thread.start()
# 队列锁
# dataQueue.join()
global exitFlag
exitFlag = True
print('exit ------------------------------------------------')
# 线程锁
for t in threads:
t.join()
if __name__ == '__main__':
main()
5.3线程池
from concurrent.futures import ThreadPoolExecutor
def task(url):
global num
num += 1
print(num)
def get_imgs_url(urls):
executor = ThreadPoolExecutor(max_workers=2)
for url in urls:
task1 = executor.submit(task,(url,))
task1.done() # 判定某个任务是否完成
task1.cancel() # cancel方法用于取消某个任务,该任务没有放入线程池中才能取消成功
task1.result() # result方法可以获取task的执行结果
executor.submit(task,(url,))
if __name__ == '__main__':
urls='https://www.baidu.com/'
get_imgs_url(urls)
5.4自定义线程
生产者消费者模式
novels.py
import requests
import time
import threading
from threading import Thread
from bs4 import BeautifulSoup
from config import BASE_URL, QUEUE_TASK, DENTY_TIME, QUEUE_PARSE, HEADERS, EXIT_FLAG
class CrawlThread(threading.Thread):
def __init__(self):
super().__init__()
self.q_task=QUEUE_TASK
self.q_parse=QUEUE_PARSE
def run(self) -> None:
super().run()
self.spider()
def spider(self):
while True:
if self.q_task.empty():
print('爬虫线程:{}线程结束执行'.format(threading.current_thread().getName()))
break
taskId=self.q_task.get() # 线程为自己领取任务
response=requests.get(url=BASE_URL%(taskId),headers=HEADERS) # 执行任务
response.encoding='utf-8' # 对任务进行解码
html=response.text # 获取到任务内容
self.q_parse.put((html,taskId)) # 将任务编号和对应的任务内容存在队列种
self.q_task.task_done() # 告诉队列,任务完成
print('爬虫线程{}完成第{}页爬取任务完成'.format(threading.current_thread().getName(),taskId))
def crawl():
for i in range(1,11):
QUEUE_TASK.put(i)
for i in range(2): # 产生两个线程
CrawlThread().start()
time.sleep(DENTY_TIME)
class ParseThread(threading.Thread):
def __init__(self,lock,fp):
super().__init__()
self.lock=lock
self.fp=fp
self.q_parse=QUEUE_PARSE
def run(self) -> None:
super().__init__()
self.parse()
def parse(self):
while True:
if EXIT_FLAG:
print('解析线程:{}线程结束执行'.format(threading.current_thread().getName()))
break
try:
html,taskId=self.q_parse.get(block=False)
soup=BeautifulSoup(html,'lxml')
books = soup.select('div[class="bookslist"] > ul > li')
print('------------{}-----------'.format(len(books)))
for book in books:
self.lock.acquire()
book_url = book.find('img').attrs['src']
book_title = book.select('h3 a')[0]['title']
book_author = book.select('p')[0].get_text()
book_describe = book.select('p')[1].get_text()
fp.write('%s\t%s\t%s\t%s\n' % (book_url, book_title, book_author, book_describe))
self.lock.release()
self.q_parse.task_done()
print('解析线程{}完成第{}页解析任务完成'.format(threading.current_thread().getName(),taskId))
except:
pass
def parse(fp):
lock= threading.Lock()
for i in range(2):
ParseThread(lock,fp).start()
if __name__ == '__main__':
crawl()
fp = open('./book.txt', 'a', encoding='utf-8')
parse(fp)
QUEUE_TASK.join()
QUEUE_PARSE.join()
fp.close()
EXIT_FLAG=True
print('代码执行结束')
config.py
from queue import Queue
HEADERS={
"Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
"Accept-Encoding":"gzip, deflate, br",
"Accept-Language":"zh-CN,zh;q=0.9,en;q=0.8",
"Cache-Control":"no-cache",
"Connection":"keep-alive",
"Cookie":"Hm_lvt_8008bbd51b8bc504162e1a61c3741a9d=1572674620; Hm_lpvt_8008bbd51b8bc504162e1a61c3741a9d=1572674667",
"Host":"www.dushu.com",
"Pragma":"no-cache",
"Upgrade-Insecure-Requests":"1",
"User-Agent":"Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Mobile Safari/537.36",
}
EXIT_FLAG=False
BASE_URL='https://www.dushu.com/book/1175_%d.html'
NUMBER=10
QUEUE_TASK=Queue(NUMBER)
QUEUE_PARSE=Queue(NUMBER)
DENTY_TIME=1