The images are all pretty wholesome; a sample is below:
Synchronous crawler
import requests
from lxml import etree
from urllib import request
import os
import re


def parse_page(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    # text = response.text  # if parsing fails, decode manually as below
    text = response.content.decode('utf-8')
    html = etree.HTML(text)
    imgs = html.xpath("//div[@class='TypeList']/ul/li//img")
    names = html.xpath("//div[@class='TypeList']/ul/li//span")
    imgurls = []
    filenames = []
    for img in imgs:
        img_url = img.get('src')
        # Percent-encode Chinese characters in the URL; urlretrieve cannot
        # request a URL that still contains raw non-ASCII characters.
        img_url = request.quote(img_url, safe=';/?:@&=+$,', encoding='utf-8')
        imgurls.append(img_url)
    for index, name in enumerate(names):
        filename = name.text
        filename = re.sub(r'[\s]', '_', filename)
        suffix = os.path.splitext(imgurls[index])[1]
        filenames.append(filename + suffix)
    img_download(imgurls, filenames)


def img_download(imgurls, filenames):
    for index, imgurl in enumerate(imgurls):
        request.urlretrieve(imgurl, 'images/' + filenames[index])
        print('%s downloaded' % filenames[index])


def main():
    os.makedirs('images', exist_ok=True)  # make sure the target directory exists
    for i in range(1, 10):
        url = 'http://www.umei.cc/tupiandaquan/shuaigetupian/%d.htm' % i
        parse_page(url)
        break  # only crawl the first page for now


if __name__ == '__main__':
    main()
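About the commented-out response.text line: requests guesses the page encoding from the HTTP response headers, and when that guess is wrong the parsed text comes out garbled, which is why the bytes are decoded manually. A small sketch of the standard requests attributes involved (assuming the site is reachable):

import requests

resp = requests.get('http://www.umei.cc/tupiandaquan/shuaigetupian/1.htm')
print(resp.encoding)           # encoding guessed from the HTTP headers
print(resp.apparent_encoding)  # encoding detected from the response body

# Overriding the guess makes resp.text decode with the detected encoding,
# which for this site is roughly equivalent to resp.content.decode('utf-8').
resp.encoding = resp.apparent_encoding
text = resp.text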
Upgrading to an asynchronous (multithreaded) crawler: page URLs go into a page queue, Producer threads parse each page and push (image URL, filename) pairs into an image queue, and Consumer threads take from that queue and download.
import requests
from lxml import etree
from queue import Queue
from urllib import request
import os
import re
import threading
import socket

socket.setdefaulttimeout(20)  # global timeout so a stalled download raises socket.timeout


class Producer(threading.Thread):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
    }

    def __init__(self, page_queue, img_queue):
        super(Producer, self).__init__()
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        flag = 0  # an initially empty page queue does not count as "done"; keep waiting
        while True:
            if self.page_queue.empty() and flag == 1:
                print('Page queue is empty, stopping URL production')
                break
            if self.page_queue.empty() and flag == 0:
                print('Initial page queue is empty, waiting for it to be filled')
                continue
            url = self.page_queue.get()
            self.parse_page(url)
            flag = 1

    def parse_page(self, url):
        response = requests.get(url, headers=self.headers)
        # text = response.text  # if parsing fails, decode manually as below
        text = response.content.decode('utf-8')
        html = etree.HTML(text)
        imgs = html.xpath("//div[@class='TypeList']/ul/li//img")
        names = html.xpath("//div[@class='TypeList']/ul/li//div[@class='ListTit']")
        imgurls = []
        for img in imgs:
            img_url = img.get('src')
            imgurls.append(img_url)
        for index, name in enumerate(names):
            filename = name.text
            filename = re.sub(r'[\s]', '_', filename)
            suffix = os.path.splitext(imgurls[index])[1]
            self.img_queue.put((imgurls[index], filename + suffix))


class Consumer(threading.Thread):
    def __init__(self, page_queue, img_queue):
        super(Consumer, self).__init__()
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            if self.page_queue.empty() and self.img_queue.empty():
                print('Both the img queue and the page queue are empty, stopping downloads')
                break
            self.img_download(self.img_queue)

    def img_download(self, img_queue):
        img_url, filename = img_queue.get()
        print('##################################################')
        print('loading ------ downloading image:', filename)
        try:
            print(img_url + filename + '\n')
            # Percent-encode Chinese characters so urlretrieve can request the URL
            img_url = request.quote(img_url, safe=';/?:@&=+$,', encoding='utf-8')
            request.urlretrieve(img_url, 'images/' + filename)
        except socket.timeout:
            print('%s first download attempt failed:' % filename)
            count = 1
            while count <= 5:
                print('Retrying, attempt %s' % count)
                try:
                    request.urlretrieve(img_url, 'images/' + filename)
                    break
                except socket.timeout:
                    print('Reloading for %d times【warning】' % count)
                    count += 1
            if count > 5:
                print('%s all retries failed【failed】' % filename)
            else:
                print('%s retry download finished【info】' % filename)
        print('%s ------ download finished' % filename)
        print('************************************************************************************\n\n')


def main():
    os.makedirs('images', exist_ok=True)  # make sure the target directory exists
    page_queue = Queue(100)
    img_queue = Queue(1000)
    for i in range(1, 26):
        url = 'http://www.umei.cc/p/gaoqing/cn/%s.htm' % i
        page_queue.put(url)
    for x in range(3):
        t = Producer(page_queue, img_queue)  # page-parsing threads
        t.start()
    for x in range(3):
        t = Consumer(page_queue, img_queue)  # image-downloading threads
        t.start()
    # str = 'http://i1.whymtj.com/uploads/tu/201901/10002/caodq91_看图王.jpg'
    # a = request.quote(str, safe=';/?:@&=+$,', encoding='utf-8')
    # print(a)


if __name__ == '__main__':
    main()
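One thing worth noting about the exit checks above: a Consumer can find both queues momentarily empty while a Producer is still parsing a page, or block forever on img_queue.get() after the last image is done. A common alternative is a sentinel-based shutdown; the sketch below reuses the Producer and Consumer classes above and assumes Consumer.run is changed to break when it receives None from the queue:

def main():
    # Sketch of a cleaner shutdown (assumption: Consumer.run checks for None)
    os.makedirs('images', exist_ok=True)
    page_queue = Queue(100)
    img_queue = Queue(1000)
    for i in range(1, 26):
        page_queue.put('http://www.umei.cc/p/gaoqing/cn/%s.htm' % i)

    producers = [Producer(page_queue, img_queue) for _ in range(3)]
    consumers = [Consumer(page_queue, img_queue) for _ in range(3)]
    for t in producers + consumers:
        t.start()
    for t in producers:
        t.join()             # all pages parsed, img_queue fully populated
    for _ in consumers:
        img_queue.put(None)  # one sentinel per consumer tells it to stop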
The fix for Chinese characters in URLs was inspired by this post: https://blog.csdn.net/mouday/article/details/80278938
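For the record, what that fix actually does: quote percent-encodes the non-ASCII characters while the characters in the safe set keep the URL delimiters intact, so urlretrieve can then fetch the link. A quick sketch using the URL commented out in main() above:

from urllib import request

url = 'http://i1.whymtj.com/uploads/tu/201901/10002/caodq91_看图王.jpg'
encoded = request.quote(url, safe=';/?:@&=+$,', encoding='utf-8')
print(encoded)
# -> http://i1.whymtj.com/uploads/tu/201901/10002/caodq91_%E7%9C%8B%E5%9B%BE%E7%8E%8B.jpg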
Hmm, I'd better jot this down in my little notebook.