import os
import time
from urllib import request

import requests
from lxml import etree
# Browser-like User-Agent sent with the listing-page requests.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36"}
# Listing-page URL template; ``{}`` is replaced with the page number.
BASE_RUL = "https://www.doutula.com/article/list/?page={}"


def parse_html(url):
    """Fetch one listing page and download every non-GIF image it references.

    The page is requested with the module-level ``headers``, parsed with
    lxml, and each matching ``<img>`` element's ``data-original`` attribute
    is downloaded into the ``image/`` directory under the last path
    component of its URL.

    Args:
        url: Address of the listing page to scrape.
    """
    response = requests.get(url, headers=headers)
    tree = etree.HTML(response.text)
    images = tree.xpath("//div[@class='col-sm-9 center-wrap']/a//img[@class!='gif']")

    # urlretrieve does not create missing directories, so make sure the
    # target directory exists before the first download.
    os.makedirs("image", exist_ok=True)

    for img in images:
        data_url = img.get("data-original")
        if not data_url:
            # Element carries no "data-original" attribute: nothing to fetch
            # (and os.path.split(None) would raise TypeError).
            continue
        print(data_url)
        name = os.path.split(data_url)
        if not name[1]:
            # URL ends with "/": there is no file name to save under.
            continue
        print(name[1])
        request.urlretrieve(data_url, "image/" + name[1])


def main(low, hight):
    """Scrape listing pages ``low`` (inclusive) through ``hight`` (exclusive).

    Args:
        low: First page number to scrape.
        hight: Page number at which to stop (not scraped itself).
    """
    for i in range(low, hight):
        parse_html(BASE_RUL.format(i))


if __name__ == '__main__':
    main(1, 10)
# Multi-threaded version
import threading
from queue import Queue
# Browser-like User-Agent sent with the listing-page requests.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36"}
# Listing-page URL template; ``{}`` is replaced with the page number.
BASE_RUL = "https://www.doutula.com/article/list/?page={}"
# Bounded hand-off queue between the page parser (producer) and the
# download workers (consumers). A ``None`` item tells a worker to stop.
que = Queue(20)


def parse_h(url):
    """Fetch one listing page and enqueue the URL of each non-GIF image.

    Args:
        url: Address of the listing page to scrape.
    """
    response = requests.get(url, headers=headers)
    tree = etree.HTML(response.text)
    for img in tree.xpath("//div[@class='col-sm-9 center-wrap']/a//img[@class!='gif']"):
        data_url = img.get("data-original")
        # Skip elements without the attribute: None is reserved as the
        # worker stop signal and is not a downloadable URL anyway.
        if data_url:
            que.put(data_url)


def get_url(low=1, hights=2):
    """Produce image URLs for pages ``low`` (inclusive) to ``hights`` (exclusive).

    Args:
        low: First page number to scrape.
        hights: Page number at which to stop (not scraped itself).
    """
    for i in range(low, hights):
        print(BASE_RUL.format(i))
        parse_h(BASE_RUL.format(i))


def download():
    """Worker loop: take image URLs from ``que`` and save them under ``image/``.

    Blocks on ``que.get()`` until an item arrives. A ``None`` item ends the
    loop. Every item taken, including the stop signal and items whose
    download failed, is acknowledged with ``task_done()`` so that
    ``que.join()`` cannot hang on a failed download.
    """
    while True:
        data_url = que.get()
        try:
            if data_url is None:
                print("{} exit".format(threading.current_thread()))
                break
            print(data_url)
            name = os.path.split(data_url)
            print(name[1])
            # urlretrieve does not create missing directories.
            os.makedirs("image", exist_ok=True)
            request.urlretrieve(data_url, "image/" + name[1])
        except (OSError, ValueError) as exc:
            # Network/file errors (URLError is an OSError) and malformed
            # URLs: report and keep the worker alive for the next item.
            print("failed to download {}: {}".format(data_url, exc))
        finally:
            que.task_done()


def main():
    """Run one producer thread and five download workers to completion."""
    producer = threading.Thread(target=get_url, daemon=True)
    producer.start()

    workers = [threading.Thread(target=download, daemon=True) for _ in range(5)]
    for worker in workers:
        worker.start()

    # Wait until the producer has finished enqueueing ...
    producer.join()
    # ... and every enqueued URL has been processed.
    que.join()

    # Ask each worker to stop, then wait for them to exit.
    for _ in workers:
        que.put(None)
    for worker in workers:
        worker.join()


if __name__ == '__main__':
    curent = time.time()
    main()
    print(time.time() - curent)