使用 urllib、re、queue、threading、bs4、requests，通过多线程加队列爬取图片并保存到本地。
import os
import re
import socket
import threading
from queue import Empty, Queue
from urllib import request

import requests
from bs4 import BeautifulSoup as bs
url = 'http://www.ccppg.cn/books/ts/index_2.html'
socket.setdefaulttimeout(5)
class GetPic(threading.Thread):
def __init__(self, queue):
threading.Thread.__init__(self)
self.queue = queue
def run(self):
while not self.queue.empty():
url = self.queue.get_nowait()
self.spider(url)
def spider(self, url):
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36',
'Referer': 'http://www.ccppg.cn/'
}
r = requests.get(url=url, headers=headers)
r.encoding = 'GBK'
soup = bs(r.text, 'html.parser')
y = soup.find_all('img', src=re.compile(r'.jpg'))
for i in y:
if 'height="159"' in str(i):
t = re.findall('src="(.*?)"', str(i))
for tt in t:
name = tt.split('/')[-1]
dirname = 'pic'
filepath = dirname + '/' + name
if not os.path.exists(dirname):
os.mkdir(dirname)
try:
opener = request.build_opener()
opener.addheaders = [('User-Agent',
'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.75 Safari/537.36')]
request.install_opener(opener)
request.urlretrieve(url=tt, filename=filepath + name)
except socket.timeout:
count = 1
while count <= 5:
try:
request.urlretrieve(url=tt, filename=filepath + name)
break
except socket.timeout:
err_info = 'Reloading for %d time' % count if count == 1 else 'Reloading for %d times' % count
print(err_info)
count += 1
if count > 5:
print("download job failed!")
def main():
queue = Queue(500)
for i in range(2, 18):
queue.put('http://www.ccppg.cn/books/ts/index_' + str(i) + '.html')
threads = []
thread_count = 50
for i in range(thread_count):
threads.append(GetPic(queue))
for t in threads:
t.start()
t.join()
if __name__ == '__main__':
main()
执行结果：