import requests
from lxml import etree
from queue import Queue
import re
from urllib.request import urlretrieve
import threading
import time
class ImgSpider(object):
def __init__(self):
self.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
}
self.url_list_q = Queue(100)
for i in range(2,100):
self.url_list_q.put("https://www.hahamx.cn/pic/new/{}".format(i))
self.html_q = Queue(100)
self.img_tit_url = Queue(8000)
self.img_tit_src = Queue(8000)
def get_list_html_q(self):#从列表页url队列中提取列表页url获取列表页html源码填充到列表页html源码队列
while True:
url = self.url_list_q.get()
html = requests.get(url, headers=self.headers)
time.sleep(1)
if html.status_code == 200:
self.html_q.put(html.text)
self.url_list_q.task_done()
def parse_list_html_q(self):#从列表页源码队列中提取源码获取到图片页标题和url填充到队列
while True:
text = self.html_q.get()
html = etree.HTML(text)
titles = html.xpath("//p[@class='word-wrap joke-main-content-text']/text()")
img_urls = html.xpath("//div[@class='joke-main-content clearfix']//a[@target='_blank']/@href")
img_urls = list(map(lambda url: "https://www.hahamx.cn" + url, img_urls))
t_i = list(map(lambda a,b: (a,b), titles, img_urls))
for i in t_i:
self.img_tit_url.put(i)
self.html_q.task_done()
def parse_img_page(self):#从图片页标题和url队列中获取到图片的src填充到src队列
while True:
title, url = self.img_tit_url.get()
text = requests.get(url, headers=self.headers).text
time.sleep(1)
html = etree.HTML(text)
img_src = "https:" + html.xpath("//div[@class='joke-main-content clearfix']//img[@class='joke-main-content-img lazy']/@data-original")[0]
self.img_tit_src.put((title, img_src))
self.img_tit_url.task_done()
def save_img(self):
i = 0
while True:
title, src = self.img_tit_src.get()
title = re.sub(r"[^\u4e00-\u9fa5]+", "", title)
end_name = re.findall(r"_\d+\.([a-z]+)", src)[0]
if title == "分享图片":
title = "分享图片{}".format(i)
file_name = "./data/imgs/" + title + "." + end_name
i += 1
else:
file_name = "./data/imgs/" + title.strip() + "." + end_name
urlretrieve(src, file_name)
time.sleep(1)
print(title + "." + end_name + "写入成功...")
self.img_tit_src.task_done()
def run(self):
thread_list = []
#创建3个线程执行获取列表页html源码的函数
for i in range(3):
t_get_list_html = threading.Thread(target=self.get_list_html_q)
thread_list.append(t_get_list_html)
#创建1个线程来解析列表页源码
t_parse_list_html = threading.Thread(target=self.parse_list_html_q)
thread_list.append(t_parse_list_html)
#创建5个线程解析图片页源码
for i in range(5):
t_parse_img_page = threading.Thread(target=self.parse_img_page)
thread_list.append(t_parse_img_page)
#创建5个线程来保存图片到本地
for i in range(5):
t_save = threading.Thread(target=self.save_img)
thread_list.append(t_save)
for t in thread_list:
t.setDaemon(True)
t.start()
q_list = [self.html_q, self.img_tit_url, self.img_tit_src]
time.sleep(5)
for q in q_list:
q.join()
print("主线程结束...")
isp = ImgSpider()
isp.run()
threading + Queue多线程爬虫下载图片
最新推荐文章于 2020-12-05 12:12:30 发布