Python多线程爬取图虫网图片
直接上代码
import requests
import re
from urllib import request
import os
import threading
import queue
# Browser-like request headers sent with every search-page request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/89.0.4389.128 Safari/537.36'
    ),
}
class Producer(threading.Thread):
    """Worker thread that turns search-result pages into download jobs.

    Each producer repeatedly takes a page URL from ``page_queue``, fetches
    it, extracts the image ids with ``imageurls`` and puts one job dict
    (``image_url`` / ``image_path``) per image onto ``image_queue``.

    Args:
        page_queue: queue of search-page URLs to fetch.
        image_queue: queue that receives the download jobs.
        title: search keyword; used as the sub-directory name and as the
            file-name prefix.
        i: fallback page label used in file names when the page number
            cannot be read from the URL.
        save_root: directory under which the ``title`` sub-directory is
            created. Change it to wherever images should be saved.
        *args, **kwargs: forwarded to ``threading.Thread``.
    """

    def __init__(self, page_queue, image_queue, title, i, *args,
                 save_root='E:/图虫创意', **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.image_queue = image_queue
        self.title = title
        self.i = i
        self.save_root = save_root

    @staticmethod
    def _page_label(url, default):
        """Return the ``page`` query value of *url*, or ``str(default)``."""
        match = re.search(r'[?&]page=(\d+)', url)
        return match.group(1) if match else str(default)

    def run(self):
        """Drain ``page_queue``, queueing one download job per image found."""
        dirpath = os.path.join(self.save_root, self.title)
        while True:
            # get_nowait() instead of empty()+get(): another producer may
            # take the last URL between the two calls, which would leave
            # this thread blocked in get() forever.
            try:
                url = self.page_queue.get_nowait()
            except queue.Empty:
                break
            print(url)
            # Label files with the page actually being processed; every
            # producer is constructed with the same ``i``, so using it
            # directly makes images from different pages overwrite each other.
            page_label = self._page_label(url, self.i)
            try:
                response = requests.get(url, headers=headers, timeout=30)
            except requests.RequestException as exc:
                # One unreachable page should not kill the whole worker.
                print(url + ' 请求失败: %s' % exc)
                continue
            image_ids = imageurls(response.content.decode('utf8'))
            # exist_ok avoids the exists()/mkdir() race between producers,
            # and makedirs also creates a missing save_root.
            os.makedirs(dirpath, exist_ok=True)
            for x, image_id in enumerate(image_ids):
                image_url = 'https://weiliicimg9.pstatp.com/weili/sm/' + image_id + '.jpeg'
                file_name = self.title + '_' + page_label + '_%d.jpg' % x
                self.image_queue.put({
                    'image_url': image_url,
                    'image_path': os.path.join(dirpath, file_name),
                })
class Consumer(threading.Thread):
    """Worker thread that downloads the images described by queued jobs.

    Each job is a dict with the keys ``image_url`` and ``image_path``.

    Args:
        image_queue: queue of download jobs.
        timeout: seconds to wait for a new job before the thread assumes
            the work is finished and exits.
        *args, **kwargs: forwarded to ``threading.Thread``.
    """

    def __init__(self, image_queue, *args, timeout=10, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.image_queue = image_queue
        self.timeout = timeout

    def run(self):
        """Download queued images until no job arrives within ``timeout``."""
        while True:
            # Only an idle queue ends the loop; any other error used to be
            # swallowed by a bare ``except`` and silently killed the worker.
            try:
                image_obj = self.image_queue.get(timeout=self.timeout)
            except queue.Empty:
                break
            imageurl = image_obj.get('image_url')
            imagepath = image_obj.get('image_path')
            try:
                request.urlretrieve(imageurl, imagepath)
            except Exception:
                # Best effort: report the failure and move on to the next
                # job. format() keeps this safe when the path is missing.
                print('{}下载失败!'.format(imagepath))
            else:
                print(imagepath + "下载完成!")
def imageurls(req):
    """Extract the numeric image ids embedded in a search-result page.

    Every ``"key":"digits"`` pair found in *req* contributes its digit
    string. Only the quoted value is captured, so digits that are part of
    the key name (e.g. ``"id2"``) are not mistaken for ids.

    Args:
        req: decoded text of the page.

    Returns:
        List of id strings in order of appearance (empty if none found).
    """
    return re.findall(r'"\w+":"(\d+)"', req)
def main():
    """Prompt for a keyword and page count, then crawl with worker threads.

    Builds one search URL per page, starts 3 ``Producer`` and 5 ``Consumer``
    threads, and waits for all of them to finish.
    """
    from urllib.parse import quote

    title = input('请输入关键字:')
    try:
        page = int(input('请输入爬取页数:'))
    except ValueError:
        print('页数必须是整数!')
        return
    # With zero (or negative) pages nothing is queued, and the loop variable
    # formerly passed to Producer was never bound (NameError).
    if page <= 0:
        print('页数必须大于0!')
        return

    page_queue = queue.Queue(page)
    image_queue = queue.Queue(1000)
    # Percent-encode the keyword so characters such as '&' or '#' cannot
    # break the query string.
    term = quote(title, safe='')
    for page_index in range(page):
        url = 'https://stock.tuchong.com/search?page={}&platform=image&size=100&sortBy=0&term='.format(page_index) + term
        page_queue.put(url)

    # Same value the producers have always received: the last page index.
    last_page_index = page - 1
    workers = []
    for x in range(3):
        th = Producer(page_queue, image_queue, title, last_page_index, name="生产者%d号" % x)
        th.start()
        workers.append(th)
    for x in range(5):
        th = Consumer(image_queue, name="消费者%d号" % x)
        th.start()
        workers.append(th)
    # Wait for the workers so main() returns only when the crawl is done.
    for th in workers:
        th.join()


if __name__ == '__main__':
    main()
该博客展示了如何使用Python进行多线程爬取图虫网站上的图片。代码中定义了Producer和Consumer两个线程类,Producer负责从网页中获取图片URL并放入队列,Consumer则从队列中取出URL并下载图片。用户输入关键字和爬取页数,程序将按照指定条件下载相应页面的图片。
2万+

被折叠的 条评论
为什么被折叠?



