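# >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
# For learning and reference only.
# Description: first extract the image URLs to be downloaded, then hand each
# one to urllib.request.urlretrieve, which automatically calls the schedule
# function to display the current download progress.
# schedule takes three parameters: blocknum (number of data blocks downloaded
# so far), blocksize (size of one data block) and totalsize (size of the
# remote file).
# <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<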
import os
import re
import urllib
import urllib.parse
import urllib.request

import requests
from lxml import etree
class TianTangTuPian(object):
    """Download the images listed on one ivsky.com gallery index page.

    The workflow (see ``run``) is: fetch the index page, extract a title and
    an absolute image URL for every list entry, then download each image
    into ``IMAGE_DIR`` while printing the download progress.
    """

    # Browser-like User-Agent sent with the index-page request.
    HEADERS = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36",
    }
    # Directory (relative to the working directory) that receives the images.
    IMAGE_DIR = './Image'
    # Characters that are path separators or otherwise unsafe in file names.
    _UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')

    def __init__(self):
        pass

    def get_request(self, url, timeout=10):
        """Fetch ``url`` with the class headers and return the response.

        Args:
            url: Address of the page to request.
            timeout: Seconds to wait for the server before ``requests``
                gives up; without it a stalled server blocks forever.

        Returns:
            The ``requests`` response object.
        """
        return requests.get(url=url, headers=self.HEADERS, timeout=timeout)

    def parse_html_1(self, response):
        """Extract image titles and absolute image URLs from an index page.

        Args:
            response: Response whose ``content`` is the page HTML and whose
                ``url`` is used as the base for relative image links.

        Returns:
            A list of dicts with the keys ``'title'`` (the image ``alt``
            text, or None when absent) and ``'href'`` (absolute image URL).
            Entries without an image ``src`` are left out.
        """
        html_ = etree.HTML(response.content.decode())
        item_list = []
        for li in html_.xpath("//ul[contains(@class,'ali')]/li"):
            src = li.xpath("./div/a/img/@src")
            # Joining an empty/missing src would just return the page URL,
            # which would later be saved as a bogus ".jpg"; skip the entry.
            if not src or not src[0]:
                continue
            title = li.xpath('./div/a/img/@alt')
            item_list.append({
                'title': title[0] if title else None,
                'href': urllib.parse.urljoin(response.url, src[0]),
            })
        return item_list

    @staticmethod
    def _progress_percent(blocknum, blocksize, totalsize):
        """Return the download progress in percent, capped at 100.

        Returns None when ``totalsize`` is not positive, i.e. the total
        size is zero or unknown and no percentage can be computed.
        """
        if totalsize <= 0:
            return None
        return min(100.0, 100.0 * blocknum * blocksize / totalsize)

    @classmethod
    def _safe_filename(cls, title):
        """Return ``title`` reduced to a string usable as a file name.

        Unsafe characters are replaced by ``_`` and surrounding spaces and
        dots are removed; a missing title yields an empty string.
        """
        if not title:
            return ''
        return cls._UNSAFE_FILENAME_CHARS.sub('_', str(title)).strip(' .')

    def schedule(self, blocknum, blocksize, totalsize):
        """Report hook for ``urlretrieve``: print the progress percentage.

        Args:
            blocknum: Number of blocks transferred so far.
            blocksize: Size of one block in bytes.
            totalsize: Total size of the remote file; may be 0 or -1 when
                the server does not report a usable size, in which case
                nothing is printed.
        """
        per = self._progress_percent(blocknum, blocksize, totalsize)
        if per is not None:
            print("当前下载进度:%d" % per)

    def save_image(self, item_list):
        """Download every item's image into ``IMAGE_DIR``.

        Args:
            item_list: Dicts with ``'title'`` and ``'href'`` keys as
                produced by ``parse_html_1``. Items without an ``'href'``
                are skipped.

        Raises:
            Whatever ``urllib.request.urlretrieve`` raises for a failed
            download; errors are not suppressed.
        """
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self.IMAGE_DIR, exist_ok=True)
        for index, item in enumerate(item_list, start=1):
            href = item.get('href')
            if not href:
                continue
            # Fall back to a numbered name so untitled images do not all
            # overwrite the same file.
            name = self._safe_filename(item.get('title')) or 'image_{}'.format(index)
            urllib.request.urlretrieve(
                url=href,
                filename='{}/{}.jpg'.format(self.IMAGE_DIR, name),
                reporthook=self.schedule,
            )
            print("写入{}成功".format(name))

    def run(self):
        """Fetch the first gallery index page and download its images."""
        first_url = "https://www.ivsky.com/tupian/ziranfengguang/index_1.html"
        response_ = self.get_request(url=first_url)
        item_list = self.parse_html_1(response_)
        self.save_image(item_list)
# Script entry point: build the scraper and run it once when executed directly.
if __name__ == "__main__":
    TianTangTuPian().run()