1. 单线程版
import requests
from lxml import etree
import time
import re
class HaiBaoSpider():
    """Single-threaded poster-image spider for 818ps.com.

    Crawls the poster list pages starting at ``start_url``, follows the
    "next page" link until it runs out, and saves every poster image under
    ./data/海报图片爬虫/.
    """

    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
        }
        self.start_url = "https://818ps.com/muban/haibao.html?user_source=r44926&bd_vid=9395005922550959355&sdclkid=b5gpA5fs15eG15eG"

    def get_html(self, url):
        """Fetch *url* and return the UTF-8 decoded body, or None on a non-200 response."""
        response = requests.get(url, headers=self.headers)
        if response.status_code == 200:
            return response.content.decode("utf-8")
        return None

    def parse_page(self, html_str, flag=1):
        """Parse one list page.

        Returns ``(data, next_url)`` where ``data`` is a list of
        ``{"img_name": ..., "img_url": ...}`` dicts and ``next_url`` is the
        absolute URL of the next list page, or None on the last page so the
        caller's loop can terminate.

        flag=0 is used for the first page, where the next-page link is the
        first ``li.toppage`` anchor; on later pages the first anchor points
        back to the previous page, so the second one is used.
        """
        html = etree.HTML(html_str)
        imgs = html.xpath("//img[@class='lazy']")
        data = []
        for img in imgs:
            item = {}
            item["img_name"] = img.xpath("./@alt")[0]
            item["img_url"] = "https:" + img.xpath("./@img-original")[0]
            data.append(item)
        # BUGFIX: the original indexed the xpath result unconditionally and
        # raised IndexError on the last page; return None instead so run()
        # finishes cleanly.
        hrefs = html.xpath("//li[@class='toppage']/a/@href")
        idx = 0 if flag == 0 else 1
        next_url = ("https://818ps.com" + hrefs[idx]) if len(hrefs) > idx else None
        print(next_url)
        return data, next_url

    def save_img(self, title, url):
        """Download *url* and write it to disk, named from the CJK characters of *title*."""
        # Keep only CJK characters so the file name is filesystem-safe.
        title = re.sub(r"[^\u4E00-\u9FFF]", "", title)
        # The extension is the ".xxx" right before the query marker,
        # e.g. ".jpg?x-oss..." or ".png!...".
        # BUGFIX: original pattern [\?|!] also matched a literal '|'
        # (inside a character class '|' is not alternation); use [?!].
        match = re.search(r"\.[a-z]{3}[?!]", url)
        if match is None:
            # BUGFIX: original called .group() unconditionally, raising
            # AttributeError for URLs without a recognizable extension.
            print("无法识别图片后缀,跳过: " + url)
            return
        end_name = match.group()[:4]
        file_name = title + end_name
        # BUGFIX: context manager closes the handle even if the download raises.
        with open("./data/海报图片爬虫/" + file_name, "wb") as fp:
            fp.write(requests.get(url).content)
        print(file_name + "写入成功...")

    def run(self):
        """Crawl from start_url, following next-page links until exhausted."""
        html_str = self.get_html(self.start_url)
        data, next_url = self.parse_page(html_str, flag=0)
        for d in data:
            self.save_img(d["img_name"], d["img_url"])
        while next_url:
            html_str = self.get_html(next_url)
            if html_str is None:
                # BUGFIX: get_html returns None on a failed request; the
                # original passed that straight into etree.HTML and crashed.
                break
            data, next_url = self.parse_page(html_str)
            for d in data:
                self.save_img(d["img_name"], d["img_url"])
if __name__ == '__main__':
    # Entry point: build the spider and start the crawl.
    spider = HaiBaoSpider()
    spider.run()
2. 多线程版
import requests
from lxml import etree
import time
import re
import threading
import queue
class HaiBaoSpider():
    """Multi-threaded poster-image spider for 818ps.com.

    A pipeline of three bounded queues connects daemon worker threads:

        list_url        -- list-page URLs waiting to be fetched
        html_text       -- fetched list-page HTML waiting to be parsed
        img_urlAndTitle -- {"img_name", "img_url"} items waiting to be saved

    run() starts the workers and then joins the queues, returning once all
    queued work has been processed.
    """

    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36"
        }
        self.start_url = "https://818ps.com/muban/haibao.html?user_source=r44926&bd_vid=9395005922550959355&sdclkid=b5gpA5fs15eG15eG"
        # Bounded queues provide back-pressure between pipeline stages.
        self.list_url = queue.Queue(300)
        self.html_text = queue.Queue(300)
        self.img_urlAndTitle = queue.Queue(2000)

    def get_html(self, url):
        """Fetch *url* and return the UTF-8 decoded body, or None on a non-200 response."""
        response = requests.get(url, headers=self.headers)
        if response.status_code == 200:
            return response.content.decode("utf-8")
        return None

    def get_next_url(self, html_str, flag=1):
        """Return the absolute next-page URL, or None when there is no next page.

        flag=0: first page -- the next link is the first li.toppage anchor.
        flag=1: later pages -- the first anchor points back, the second is "next".
        """
        html = etree.HTML(html_str)
        hrefs = html.xpath("//li[@class='toppage']/a/@href")
        idx = 0 if flag == 0 else 1
        # BUGFIX: the original indexed unconditionally, so the producer thread
        # died with IndexError on the last page and the queue joins in run()
        # never completed (the crawl had to be killed manually).
        if len(hrefs) > idx:
            return "https://818ps.com" + hrefs[idx]
        return None

    def get_list_url(self):
        """Producer: walk the pagination chain, queueing every list-page URL.

        Must run in exactly ONE thread -- multiple producers would each crawl
        the full chain from start_url and queue every page multiple times.
        BUGFIX: also queues start_url itself; the original never parsed the
        first page's images (the single-threaded version does).
        """
        self.list_url.put(self.start_url)
        html_str = self.get_html(self.start_url)
        next_url = self.get_next_url(html_str, flag=0)
        while next_url:
            self.list_url.put(next_url)
            html_str = self.get_html(next_url)
            if html_str is None:
                break  # failed fetch: stop the pagination walk
            next_url = self.get_next_url(html_str, flag=1)

    def get_list_html(self):
        """Worker: fetch each queued list-page URL and queue its HTML."""
        while True:
            url = self.list_url.get()
            list_html = self.get_html(url)
            # BUGFIX: skip failed fetches instead of queueing None, which
            # would crash the parser stage at etree.HTML(None).
            if list_html is not None:
                self.html_text.put(list_html)
            self.list_url.task_done()

    def get_img_urlAndTitle(self):
        """Worker: parse queued list-page HTML into image name/URL items."""
        while True:
            html_text = self.html_text.get()
            html = etree.HTML(html_text)
            for img in html.xpath("//img[@class='lazy']"):
                item = {
                    "img_name": img.xpath("./@alt")[0],
                    "img_url": "https:" + img.xpath("./@img-original")[0],
                }
                self.img_urlAndTitle.put(item)
            self.html_text.task_done()

    def save_imgs(self):
        """Worker: download each queued image and write it to disk."""
        while True:
            img_item = self.img_urlAndTitle.get()
            # Keep only CJK characters so the file name is filesystem-safe.
            title = re.sub(r"[^\u4E00-\u9FFF]", "", img_item["img_name"])
            # Extension is the ".xxx" right before the query marker.
            # BUGFIX: [?!] instead of [\?|!] (which also matched '|'), and
            # guard the match -- the original .group() on None raised
            # AttributeError and silently killed this worker thread.
            match = re.search(r"\.[a-z]{3}[?!]", img_item["img_url"])
            if match is None:
                self.img_urlAndTitle.task_done()
                continue
            file_name = title + match.group()[:4]
            # BUGFIX: context manager closes the handle even if the download raises.
            with open("./data/海报图片爬虫/" + file_name, "wb") as fp:
                fp.write(requests.get(img_item["img_url"]).content)
            print(file_name + "写入成功...")
            self.img_urlAndTitle.task_done()

    def run(self):
        """Start all worker threads and block until every queue is drained."""
        thread_list = []
        # BUGFIX: exactly ONE pagination producer. The original started 3,
        # each crawling the entire chain, so every page was queued and every
        # image downloaded three times.
        thread_list.append(threading.Thread(target=self.get_list_url))
        # 5 threads fetch list-page HTML (network-bound).
        for _ in range(5):
            thread_list.append(threading.Thread(target=self.get_list_html))
        # Parsing needs no network requests, so a single thread suffices.
        thread_list.append(threading.Thread(target=self.get_img_urlAndTitle))
        # 10 threads download and save images (network-bound).
        for _ in range(10):
            thread_list.append(threading.Thread(target=self.save_imgs))
        for t in thread_list:
            # BUGFIX: setDaemon() is deprecated (Python 3.10+); daemon threads
            # die with the main thread once the queue joins below return.
            t.daemon = True
            t.start()
        # Give the producer a moment to seed the queues before joining,
        # otherwise join() could return immediately on still-empty queues.
        time.sleep(1)
        # Block until every queued work item has been processed.
        self.list_url.join()
        self.html_text.join()
        self.img_urlAndTitle.join()
if __name__ == '__main__':
    # Entry point: build the spider and start the threaded crawl.
    spider = HaiBaoSpider()
    spider.run()
爬取结果如下:
卡通立夏节气动态海报.png写入成功…
卡通风格预防接种停诊通知宣传海报.jpg写入成功…
创意扁平风青年节动态海报.png写入成功…
废弃口罩处理方式垃圾分类黄色卡通手机海报.jpg写入成功…
废弃口罩处理方式垃圾分类橙色卡通手机海报.jpg写入成功…
行动表达爱母亲节对话手绘海报.jpg写入成功…
简约风旅游出行横板海报.jpg写入成功…
母亲节花式晒单宣传手机海报.jpg写入成功…
简约创意五月你好日签海报.png写入成功…
简约创意五四节超市促销横版海报.jpg写入成功…
手绘风五一音乐趴乐器演奏竖版海报.jpg写入成功…
简约风医护人员招聘手机海报.jpg写入成功…
Process finished with exit code -1
写入图片如下: