图片的懒加载
使用xpath获取图片:
代码:
import os
import urllib.request
import urllib.parse
from lxml import etree
def hanlderequest(url, page):
    """Open listing page number ``page`` and return the HTTP response.

    Args:
        url: Address of the first listing page.
        page: Page number to fetch. Page 1 uses ``url`` unchanged; every
            other page gets ``_<page>`` inserted before the ``.html`` suffix.

    Returns:
        The response object produced by ``urllib.request.urlopen``. The
        caller is responsible for reading and closing it.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36 Edg/91.0.864.59'
    }
    if page != 1:
        # The first page has a different address from the others, so the
        # page marker is only spliced in for page 2 and beyond.
        suffix = '.html'
        # Strip the suffix only when it is really there instead of blindly
        # chopping five characters off an arbitrary URL.
        stem = url[:-len(suffix)] if url.endswith(suffix) else url
        url = stem + '_' + str(page) + suffix
    request = urllib.request.Request(headers=headers, url=url)
    return urllib.request.urlopen(request)
def parse_data(data):
    """Return the image addresses found in one listing page.

    Args:
        data: HTML source of the listing page.

    Returns:
        The result of the XPath query: the ``src2`` attribute values of the
        images inside the ``container`` div.
    """
    # The page lazy-loads its images: the address is kept in ``src2`` and is
    # only turned into ``src`` by front-end code once the image is actually
    # shown to the user, so ``src2`` is the attribute to read here.
    return etree.HTML(data).xpath("//div[@id='container']/div/div/a/img/@src2")
def save_data(img_url):
    """Download every image in ``img_url`` into the local ``tupian`` folder.

    Args:
        img_url: Iterable of image addresses. Addresses without a scheme
            (e.g. ``//host/path/pic.jpg``) are fetched over ``http:``;
            addresses that already carry a scheme are used as they are.

    Returns:
        None. Each image is written to ``tupian/<file name>``, where the file
        name is the last component of the address's path.
    """
    images = list(img_url)
    if not images:
        # Nothing to download: do not create an empty folder.
        return
    dirpath = 'tupian'
    # Create the target folder once, up front; ``exist_ok`` avoids the
    # check-then-create race of ``os.path.exists`` followed by ``os.mkdir``.
    os.makedirs(dirpath, exist_ok=True)
    for img in images:
        parts = urllib.parse.urlsplit(img)
        # Only prepend the scheme when the address does not already have one.
        url = img if parts.scheme else 'http:' + img
        # Name the file after the URL path so a query string never ends up
        # in the file name.
        filename = os.path.basename(parts.path)
        filepath = os.path.join(dirpath, filename)
        # Fetch the image and store it at the computed location.
        urllib.request.urlretrieve(url=url, filename=filepath)
def main(start_page, end_page):
    """Scrape the listing pages ``start_page``..``end_page`` (inclusive).

    For each page the HTML is fetched, the image addresses are extracted
    and the images are saved to disk.

    Args:
        start_page: First page number to process.
        end_page: Last page number to process (included).
    """
    url = 'https://sc.chinaz.com/tupian/siwameinvtupian.html'
    for page in range(start_page, end_page + 1):
        # Fetch the page and read its body; the ``with`` block closes the
        # response explicitly instead of leaving it to garbage collection.
        with hanlderequest(url, page) as resp:
            data = resp.read().decode()
        # Parse the HTML into a list of image addresses.
        img_url = parse_data(data)
        # Download the images.
        save_data(img_url)
if __name__ == '__main__':
    # Ask for the first and last page numbers (in that order), convert each
    # answer to an integer as soon as it is entered, then run the scraper.
    start_page, end_page = (
        int(input(prompt)) for prompt in ('请输入开始页码:', '请输入结束页码:')
    )
    main(start_page, end_page)