总共爬取了前一百页,耗时大概四五分钟。
import os
import re
import threading
import urllib
from queue import Empty, Queue

import requests
# Module-level lock object; nothing in the visible code acquires it.
gLock = threading.Lock()

# HTTP request headers carrying a desktop Chrome User-Agent string.
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/80.0.3987.132 Safari/537.36'
    ),
}
class producer(threading.Thread):
    """Thread that turns listing-page URLs into image download jobs.

    Each instance repeatedly takes a page URL from ``page_url_queue``,
    downloads the page, extracts every sticker image found on it and puts one
    ``(img_url, file_name, series)`` tuple per image on ``img_url_queue``.
    The thread ends as soon as ``page_url_queue`` has no URL left to hand out.
    """

    # Seconds to wait for a listing page before giving up on it.
    request_timeout = 10
    # Series (folder) name used when a title does not name a series.
    default_series = '其它系列'

    def __init__(self, page_url_queue, img_url_queue, *args, **kwargs):
        """Store both queues; extra arguments go to ``threading.Thread``."""
        super(producer, self).__init__(*args, **kwargs)
        self.page_url_queue = page_url_queue
        self.img_url_queue = img_url_queue

    def run(self):
        """Process page URLs until the page queue is exhausted."""
        while True:
            # A non-blocking get avoids the check-then-get race: with
            # ``empty()`` followed by a blocking ``get()``, another thread can
            # take the last URL in between and leave this thread blocked
            # forever.
            try:
                page_url = self.page_url_queue.get_nowait()
            except Empty:
                break
            self.parser_url(page_url)

    def parser_url(self, url):
        """Download the page at *url* and queue one job per image on it.

        A page that cannot be fetched is skipped so that one network error
        does not terminate the whole thread.
        """
        try:
            resp = requests.get(
                url=url, headers=HEADERS, timeout=self.request_timeout)
        except requests.RequestException:
            return
        for job in self._extract_jobs(resp.text):
            self.img_url_queue.put(job)

    @classmethod
    def _extract_jobs(cls, text):
        """Return the ``(img_url, file_name, series)`` tuples found in *text*.

        *text* is the HTML of one listing page. Image URLs and titles are
        collected by two independent regular expressions and paired by
        position; pairing stops at the shorter list, so a page where the two
        counts differ yields fewer jobs instead of raising ``IndexError``.
        """
        # Source link of every sticker image on the page.
        img_urls = re.findall(
            r'<img\sclass="ui\simage\slazy".*?src="(.*?)">', text, re.S)
        # Title the site assigns to every sticker image.
        titles = re.findall(
            r'<img\sclass="ui\simage\slazy".*?title="(.*?)"\salt', text, re.S)

        jobs = []
        for img_url, title in zip(img_urls, titles):
            # Drop punctuation that is unwanted in a file name.
            pure_title = re.sub(r"[\??\.,。!\!]", '', title).strip()
            # The text after the first "-" names the series; images without a
            # (non-empty) series all go to the default series.
            series = ''
            if '-' in pure_title:
                series = pure_title.split("-")[1].strip()
            if not series:
                series = cls.default_series
            # Reuse the extension of the source link (.jpg, .gif, ...).
            tail = os.path.splitext(img_url)[1]
            jobs.append((img_url, pure_title + tail, series))
        return jobs
class consumer(threading.Thread):
    """Thread that downloads queued images into one folder per series.

    Each instance repeatedly takes an ``(img_url, file_name, series)`` tuple
    from ``img_url_queue``, downloads the image and writes it to
    ``<image_root>/<series>/<file_name>``. The thread ends when no job arrives
    within ``queue_timeout`` seconds while ``page_url_queue`` is empty.
    """

    # Folder that receives one sub-folder per series.
    image_root = 'image'
    # Seconds to wait for a job before checking whether the work is finished.
    queue_timeout = 3
    # Seconds to wait for the image server before giving up on a download.
    request_timeout = 10

    def __init__(self, page_url_queue, img_url_queue, *args, **kwargs):
        """Store both queues; extra arguments go to ``threading.Thread``."""
        super(consumer, self).__init__(*args, **kwargs)
        self.page_url_queue = page_url_queue
        self.img_url_queue = img_url_queue

    def run(self):
        """Download queued images until the work has run out."""
        while True:
            # A bounded wait replaces the blocking get(): with several
            # consumers, all but the one that takes the last job would
            # otherwise block forever and keep the process alive.
            try:
                img_url, file_name, series_this = self.img_url_queue.get(
                    timeout=self.queue_timeout)
            except Empty:
                # No job arrived in time. An empty page queue is taken as the
                # sign that no further jobs will be produced (heuristic: a
                # producer still busy with its last page is given
                # ``queue_timeout`` seconds to deliver).
                if self.page_url_queue.empty():
                    break
                continue

            try:
                r = requests.get(img_url, timeout=self.request_timeout)
            except requests.RequestException:
                # A failed download skips this image, not the whole thread.
                continue
            if r.status_code != 200:
                continue

            try:
                self._save_image(r.content, series_this, file_name)
            except OSError:
                # Names containing characters the file system rejects cannot
                # be saved; skip this image.
                continue

    def _save_image(self, content, series, file_name):
        """Write *content* to ``<image_root>/<series>/<file_name>``.

        The series folder (and ``image_root``) is created when missing.
        ``exist_ok=True`` makes this safe when several threads create the same
        folder at once. Raises ``OSError`` when the folder or file cannot be
        created.
        """
        folder = os.path.join(self.image_root, series)
        os.makedirs(folder, exist_ok=True)
        with open(os.path.join(folder, file_name), 'wb') as fh:
            fh.write(content)
def main(first_page=1, last_page=100, producer_count=5, consumer_count=5):
    """Crawl the listing pages and download every image found on them.

    Args:
        first_page: Number of the first listing page to crawl.
        last_page: Number of the last listing page to crawl (inclusive).
        producer_count: Number of page-parsing threads to start.
        consumer_count: Number of image-downloading threads to start.

    The call returns once every worker thread has finished.
    """
    page_urls = [
        'https://37yzy.com/bq/%d.html' % page
        for page in range(first_page, last_page + 1)
    ]

    # The page queue is filled before any worker starts, so it must be able
    # to hold every URL; a smaller bound would block the put() calls forever.
    page_url_queue = Queue(len(page_urls))
    img_url_queue = Queue(1000)
    for page_url in page_urls:
        page_url_queue.put(page_url)

    # The downloaders store files below ./image; create it up front so that
    # creating a series sub-folder cannot fail because the parent is missing.
    os.makedirs('image', exist_ok=True)

    workers = []
    for _ in range(producer_count):
        workers.append(producer(page_url_queue, img_url_queue))
    for _ in range(consumer_count):
        workers.append(consumer(page_url_queue, img_url_queue))

    # Producers were appended first, so they are also started first.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()


if __name__ == '__main__':
    main()
总共做了大概四五次修改。第一次由于把url和file_name两个值直接作为两个参数放入img_url_queue队列,导致程序不能运行,提示ValueError: too many values to unpack (expected 2)。这时应该把url和file_name用括号组成一个元组,写成img_url_queue.put((url, file_name))传递给队列,取出的时候再用两个变量对应接收即可。
第二次添加代码用来判断表情包所属的系列,如果网站上没有指明,则全部归入“其它系列”(因为后面要根据所得的系列把图片放入对应的文件夹)。
第三次添加代码用来判断并创建表情包类型对应的文件夹。