# -*- encoding=UTF-8 -*-
"""Download every image referenced by an ivsky.com gallery page.

author: vfast
name: spider
date: 2021/6/24
"""
try:  # Python 3
    from urllib.parse import urljoin
    from urllib.request import urlretrieve
except ImportError:  # Python 2
    from urllib import urlretrieve
    from urlparse import urljoin

PAGE_URL = 'http://www.ivsky.com/tupian/ziranfengguang/'
REQUEST_TIMEOUT = 10  # seconds; without a timeout requests.get can block forever

user_agent = 'Mozilla/5.0 (X11; Linux x86_64; rv:78.0) Gecko/20100101 Firefox/78.0'
headers = {'User-Agent': user_agent}


def Schedule(blocknum, blocksize, totalsize):
    """Print the current download progress as an integer percentage.

    Has the ``reporthook`` signature expected by ``urlretrieve``.

    :param blocknum: number of data blocks downloaded so far
    :param blocksize: size of one data block
    :param totalsize: size of the remote file; ``urlretrieve`` may pass a
        non-positive value when the size is unknown or the file is empty
    :return: None
    """
    if totalsize <= 0:
        # No usable total: report 0 instead of dividing by zero or printing
        # a negative percentage.
        per = 0
    else:
        # The last block is usually only partly filled, so cap at 100.
        per = min(100.0, 100.0 * blocknum * blocksize / totalsize)
    # Single parenthesised argument: prints identically on Python 2 and 3.
    print('当前下载进度:%d' % per)


def main():
    """Fetch the gallery page and save each image as ``img<N>.jpg``.

    :return: None
    """
    # Imported here so that Schedule can be imported and reused without the
    # third-party scraping dependencies being installed.
    import requests
    from lxml import etree

    r = requests.get(PAGE_URL, headers=headers, timeout=REQUEST_TIMEOUT)
    r.raise_for_status()  # do not try to parse an HTTP error page
    html = etree.HTML(r.text)
    img_urls = html.xpath('.//img/@src')
    for index, img_url in enumerate(img_urls):
        # src attributes may be relative or protocol-relative; resolve them
        # against the URL the page was actually served from.
        urlretrieve(urljoin(r.url, img_url), 'img' + str(index) + '.jpg', Schedule)


if __name__ == '__main__':
    main()
# 来源:Python爬虫开发与项目实战 (Source: "Python Web Crawler Development and Project Practice")