多线程网络爬虫
本文用「多进程 + 线程池」实现一个图片爬虫:一个进程作为生产者抓取图片地址放入队列,另一个进程作为消费者从队列取地址并用线程池并发下载。下面直接上代码:
import os
# 进程 队列
from multiprocessing import Process, Queue
# 线程池
from concurrent.futures import ThreadPoolExecutor

import requests
from lxml import etree
# 获取文件下载路径
def get_img_src(q):
# url = "https://www.veer.com/topic/11751/"
for i in range (1, 10):
url = f"https://www.veer.com/topic/11751/?page={i}"
rsp = requests.get (url)
tree = etree.HTML (rsp.text)
href_list = tree.xpath ('//div/article/section/section/a/@href')
# print(href_list)
for href in href_list:
rsp2 = requests.get (href)
tree = etree.HTML (rsp2.text)
image = tree.xpath ('//div/main/section[1]/section[1]/section/figure[1]/div/img/@src')
# print(image)
image_update = "https:" + "".join (image)
# print (image_update)
q.put (image_update) # 向队列添加数据
print (f'{image_update},塞进队列')
# 传入为空 结束
q.put (' ')
def download(url):
print ("---------------start------------" + url)
name = url.split ("/")[-1]
resp = requests.get (url)
with open ('./img/' + name, mode='wb+') as f:
f.write (resp.content)
print ("---------------end------------" + url)
# 下载图片--给个线程池
def download_img(q):
# 做一个线程池做缓冲
with ThreadPoolExecutor (30) as t:
while 1:
src = q.get () # 从队列中获取数据,如果没有数据就会阻塞
if src == "":
break
t.submit (download, src)
if __name__ == '__main__':
q = Queue ()
p1 = Process (target=get_img_src, args=(q,))
p2 = Process (target=download_img, args=(q,))
p1.start ()
p2.start ()
# 如果有想要代码和了解更多的可以加 Eternity-taiko-66