爬虫学习之图片懒加载
完整代码
import os
from urllib.parse import urljoin

import requests
from lxml import etree
# Headers sent with every HTTP request. The header name has to be spelled
# "User-Agent" (with a hyphen); a "User_Agent" key is just an unknown custom
# header, so the browser-like agent string would never take effect.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
}

# Directory the downloaded images are written to.
dirName = 'Girllib'

# URL template for listing pages 2..N; the first page has its own URL.
url = 'https://sc.chinaz.com/tag_tupian/yazhou_%d.html'
FIRST_PAGE_URL = 'https://sc.chinaz.com/tag_tupian/yazhou.html'

# Number of consecutive listing pages to crawl (pages 1..PAGE_COUNT).
PAGE_COUNT = 6

# Seconds to wait for a server before giving up; requests waits forever
# when no timeout is passed.
REQUEST_TIMEOUT = 10

# Path separators and characters rejected in Windows file names.
_UNSAFE_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})


def page_url(page):
    """Return the listing-page URL for the 1-based page number *page*."""
    if page == 1:
        return FIRST_PAGE_URL
    return url % page


def safe_filename(title):
    """Turn a page title into a ``.jpg`` file name safe to create on disk.

    Path separators and other reserved characters are replaced with ``_`` so
    the title can neither escape ``dirName`` nor make ``open`` fail; an empty
    title falls back to ``untitled``.
    """
    cleaned = title.translate(_UNSAFE_CHARS).strip()
    return (cleaned or 'untitled') + '.jpg'


def fetch_tree(page_address):
    """Download *page_address* and return it parsed as an lxml HTML tree.

    Raises:
        requests.RequestException: on connection errors, timeouts, or an
            HTTP error status.
    """
    response = requests.get(url=page_address, headers=headers,
                            timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    response.encoding = 'utf-8'
    return etree.HTML(response.text)


def download_image(detail_url):
    """Save the full-size image linked from the detail page *detail_url*.

    Returns:
        The file name the image was stored under inside ``dirName``, or
        ``None`` when the page has no usable image link or title.

    Raises:
        requests.RequestException: if the page or the image cannot be fetched.
        OSError: if the image file cannot be written.
    """
    tree = fetch_tree(detail_url)
    if tree is None:
        # Defensive: nothing to query if the parser produced no document.
        return None
    hrefs = tree.xpath('//div[@class="imga"]//a/@href')
    titles = tree.xpath('//div[@class="imga"]//a/@title')
    if not hrefs or not titles:
        return None

    # urljoin copes with protocol-relative, root-relative and absolute hrefs,
    # unlike plain string concatenation.
    img_src = urljoin(detail_url, hrefs[0])
    file_name = safe_filename(titles[0])

    img_response = requests.get(url=img_src, headers=headers,
                                timeout=REQUEST_TIMEOUT)
    # Do not store an HTML error page under a .jpg name.
    img_response.raise_for_status()
    with open(os.path.join(dirName, file_name), 'wb') as fb:
        fb.write(img_response.content)
    return file_name


def main():
    """Crawl the listing pages and download every linked full-size image.

    Failures are reported and skipped so that one bad page or image does not
    abort the rest of the crawl.
    """
    os.makedirs(dirName, exist_ok=True)

    for page in range(1, PAGE_COUNT + 1):
        listing_url = page_url(page)
        try:
            tree = fetch_tree(listing_url)
        except requests.RequestException as e:
            print(e)
            continue
        if tree is None:
            continue

        # Each thumbnail links to a detail page that holds the large image.
        detail_hrefs = tree.xpath(
            '//div[@class="box picblock col3"]/div/a/@href')

        for href in detail_hrefs:
            detail_url = urljoin(listing_url, href)
            try:
                saved = download_image(detail_url)
            except (requests.RequestException, OSError) as e:
                print(e)
                continue
            if saved is None:
                print(detail_url, 'no image link found, skipped')
            else:
                print(saved, '保存成功!')


if __name__ == '__main__':
    main()