思路:1. 创建一个队列
2. 把“被爬URL”中所有图片的链接放到队列中,get_photo_urls
3. 创建多个线程,调用get_photos
4. get_photos从队列中取图片链接,把图片保存到本地
def get_photo_urls(q, origin_url):
    """Producer: scrape *origin_url* and enqueue every image URL found.

    Scans the page HTML for src="..." attributes and puts each URL that
    ends in .gif/.jpg/.png into queue *q* for the worker threads.

    :param q: queue.Queue the worker threads will consume from
    :param origin_url: page whose <img src> links are collected
    """
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36'
    }
    # timeout so a stalled server cannot hang the producer forever
    r = requests.get(origin_url, headers=headers, timeout=10)
    for photo_url in re.finditer(r'src="(.*?)"', r.text):
        url = photo_url.group(1)
        print(url)
        # endswith accepts a tuple of suffixes: one call instead of an `or` chain
        if url.endswith((".gif", ".jpg", ".png")):
            q.put(url)
def get_photos(q, lock):
    """Worker: drain image URLs from *q* and save large images to D:\\test.

    Each image is downloaded once; only images larger than 0.4 MB are kept.
    Files are named with a sequential number taken from the shared module
    global ``photo_count``, which is protected by *lock*.

    :param q: queue.Queue of image URLs filled by get_photo_urls
    :param lock: threading.Lock guarding the shared photo_count counter
    """
    global photo_count
    while True:
        # get_nowait() avoids the empty()/get() race: with the original
        # check-then-act, two workers could both see a non-empty queue and
        # one of them would then block forever on get().
        try:
            photo_url = q.get_nowait()
        except queue.Empty:
            break
        # Download once and reuse the bytes — the original fetched every
        # image twice (once for the size check, again via urlretrieve).
        content = requests.get(photo_url, timeout=10).content
        # only keep images larger than 0.4 MB
        if len(content) > 0.4 * 1024 * 1024:
            print(photo_url)
            # Take the counter value *inside* the lock; reading the global
            # after releasing the lock let two threads write the same file.
            with lock:
                photo_count += 1
                count = photo_count
            # save the image under a sequential name, keeping its extension
            with open("D:\\test\\%s." % count + photo_url.split(".")[-1], "wb") as f:
                f.write(content)
if __name__ == "__main__":
origin_url = 'http://www.win4000.com/wallpaper.html'
#队列不存在同时抢占同一资源的情况,所以获取数据不用加锁
q = queue.Queue()
get_photo_urls(q, origin_url)
tasks = []
photo_count= 0
#线程锁,避免同时抢占同一个资源
lock = threading.Lock()
#创建10个线程
for i in range(10):
tasks.append(threading.Thread(target=get_photos, args=(q,lock)))
#线程设置到就绪状态
for task in tasks:
task.start()
#等待所有线程都执行完毕,主程序才开始往下执行
for task in tasks:
task.join()
执行结果:抓取图片成功