一般下载(单线程版本)
import requests
from lxml import etree
import os
import re
from urllib.request import urlretrieve
# Request headers shared by the single-threaded downloader.
# Fix: the referer key was misspelled 'reference' — the real HTTP header is
# 'Referer', and doutula's anti-hotlink check needs it to be spelled correctly.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36',
    'Referer': 'https://www.doutula.com/photo/list/',
    # NOTE(review): this session cookie is hard-coded and will expire — confirm
    # whether the site still serves the listing without it.
    'cookie': 'Hm_lvt_2fc12699c699441729d4b335ce117f40=1615008627; BAIDU_SSP_lcr=https://www.baidu.com/link?url=UQoesURrEsUM4NvE5ZacHkR-o392OBjm7Vf1Kmexgz6f1t95skqOdvaJhf-yr-ST&wd=&eqid=d489b93900083bf50000000560431370; _agep=1615008629; _agfp=71fff58d61b1da1742311be98d81dade; _agtk=c790afa4b042c24f31f6047911d8f716; Hm_lpvt_2fc12699c699441729d4b335ce117f40=1615008678'
}
def parse_page(url):
    """Fetch one doutula listing page and download every non-gif meme image.

    url: full URL of a listing page (e.g. .../photo/list/?page=1).
    Images are saved under ./images/, named from the <img> alt text (with
    filename-hostile punctuation stripped) or a per-call 'image<N>' fallback.
    """
    response = requests.get(url, headers=headers)
    html = etree.HTML(response.text)
    # Lazy-loaded <img> tags carry the real URL in @data-original; skip gifs.
    imgs = html.xpath('//div[@class="page-content text-center"]//img[@class!="gif"]')
    # urlretrieve raises FileNotFoundError if the target directory is missing.
    os.makedirs('images', exist_ok=True)
    unnamed = 0
    for img in imgs:
        srcs = img.xpath(".//@data-original")
        if not srcs:
            continue  # no lazy-load URL on this node — nothing to download
        img_url = srcs[0]
        suffix = os.path.splitext(img_url)[1]
        alts = img.xpath(".//@alt")
        # Strip punctuation that is awkward or illegal in file names.
        alt = re.sub(r'[,。??,/\\·]', '', alts[0]) if alts else ''
        if alt == '':
            unnamed += 1
            alt = 'image%d' % unnamed
        # NOTE(review): urlretrieve sends no custom headers, so the site may
        # reject hotlinked requests — confirm; requests.get + write would fix it.
        urlretrieve(img_url, 'images/' + alt + suffix)
def spider():
    """Walk the listing pages (currently only page 1) and scrape each one."""
    base_url = "http://www.doutula.com/photo/list/?page=%d"
    for page in range(1, 2):
        parse_page(base_url % page)
# Entry point of the single-threaded downloader script.
if __name__ == '__main__':
    spider()
多线程下载图片(生产者-消费者模式)
import os
import re
import threading
from queue import Empty, Queue
from urllib import request

import requests
from lxml import etree
# Module-level counter used to generate fallback names ('image<N>') for
# images whose alt text is empty after punctuation stripping; it is shared
# by every Producer thread.
i = 0
class Producer(threading.Thread):
    """Producer thread: takes listing-page URLs off page_queue, scrapes the
    image URLs/names from each page and pushes (img_url, img_name) tuples
    onto img_queue for the consumers."""

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36'
    }
    # Serializes updates of the module-level counter `i`, which was previously
    # incremented by several producer threads without any synchronization.
    _counter_lock = threading.Lock()

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            # get_nowait + Empty closes the check-then-get race: with several
            # producers, empty() could be False here yet the blocking get()
            # hang forever once a sibling thread grabbed the last URL.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            self.parse_page(url)

    def parse_page(self, url):
        """Scrape one listing page and enqueue every non-gif image found."""
        response = requests.get(url, headers=self.headers)
        html = etree.HTML(response.text)
        imgs = html.xpath('//div[@class="page-content text-center"]//img[@class!="gif"]')
        global i
        for img in imgs:
            srcs = img.xpath(".//@data-original")
            if not srcs:
                continue  # node has no lazy-load URL — skip instead of IndexError
            img_url = srcs[0]
            suffix = os.path.splitext(img_url)[1]
            alts = img.xpath(".//@alt")
            # Strip punctuation that is awkward or illegal in file names.
            alt = re.sub(r'[,。??,/\\·]', '', alts[0]) if alts else ''
            if alt == '':
                with self._counter_lock:
                    i = i + 1
                    alt = 'image%d' % i
            self.img_queue.put((img_url, alt + suffix))
class Consumer(threading.Thread):
    """Consumer thread: takes (url, filename) tuples off img_queue and reports
    completion. The actual download call is left commented out, as in the
    original script."""

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.img_queue = img_queue

    def run(self):
        while True:
            try:
                # Bounded wait instead of empty()-then-blocking-get(): the old
                # pattern either returned while producers were still mid-page,
                # or hung forever once a sibling consumer took the last item.
                url, filename = self.img_queue.get(timeout=2)
            except Empty:
                # Nothing arrived within the timeout; if the producers have
                # also drained the page queue, all work is finished.
                # NOTE(review): a producer could still be parsing its last
                # page here — the timeout retry narrows but does not fully
                # close that window.
                if self.page_queue.empty():
                    return
                continue
            # request.urlretrieve(url, 'images/'+filename)
            print(filename + ' 下载完成!')
def spider():
    """Queue up the listing pages and run producer/consumer worker threads.

    Fixes vs. the original: the page queue is sized from the page count
    (the original warned in a comment that the two had to match by hand),
    and the threads are joined so spider() only returns when all work is done.
    """
    page_count = 5
    # Sized to hold every URL so the put() calls below can never block.
    page_queue = Queue(page_count)
    img_queue = Queue(40)
    for page in range(1, page_count + 1):
        page_queue.put("http://www.doutula.com/photo/list/?page=%d" % page)
    workers = []
    for _ in range(4):
        workers.append(Producer(page_queue, img_queue))
    for _ in range(4):
        workers.append(Consumer(page_queue, img_queue))
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()
# Entry point of the multi-threaded downloader script.
if __name__ == '__main__':
    spider()
对于消费者线程的退出逻辑还存在一些问题:网上很多代码在设置的页数比较大时仍然存在这个问题,
有待解决;如果你有好的方法欢迎留言。