目标分析: 1、对主页发送请求 2、使用xpath解析数据 3、提取主页中每个详情页的url 4、for循环遍历对每个详情页的url发送请求 5、解析每个详情页的url数据 6、正则提取每个详情页的url数据中的图片地址 7、保存数据
1、对主页发送请求
import requests
from lxml import etree
import re

# Homepage URL of the gallery site
url = 'https://www.yituyu.com/w/4/'
# Pretend to be a real browser so the server does not reject the request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36'
}
# 1. Request the homepage; the timeout keeps the script from hanging forever
response = requests.get(url, headers=headers, timeout=10)
2、使用xpath解析数据
# 2. Parse the homepage with XPath.
# Bug fix: `html` was used below but never defined in this fragment —
# take the text of the step-1 response before feeding it to lxml.
html = response.text
data = etree.HTML(html)
# 3. Extract the URL of every detail page from the homepage
data_url = data.xpath('/html/body/div[3]/div[3]/ul/li/div/a/@href')
3、提取主页中每个详情页的url 4、for循环遍历对每个详情页的url发送请求 5、解析每个详情页的url数据
# 4. Request every detail page and 5. read its HTML.
for url_ in data_url:
    # xpath('@href') already yields plain strings, so the original
    # ''.join(url_) was a no-op and has been dropped.
    response_2 = requests.get(url_, headers=headers, timeout=10).text
    # 6. Regex-extract every image address from the detail page ...
    url_re = re.findall('<img class="lazy t0" data-src="(.*?)".*?', response_2, re.S)
    # ... and the matching image name (the alt text of the same tag)
    name_re = re.findall('<img class="lazy t0".*?alt="(.*?)".*?>.*?', response_2, re.S)
7、保存数据
# 7. Download every image and save it to disk.
i = 0  # bug fix: the counter was incremented below without ever being initialised
for name_, img_url in zip(name_re, url_re):
    # Fetch the raw image bytes
    resp_3 = requests.get(img_url, headers=headers, timeout=10).content
    i = i + 1
    # Append a running number so images sharing the same alt text do not clash
    name_1 = name_ + f'{i}'
    with open(f'./tupian/{name_1}.jpg', 'wb') as f:
        f.write(resp_3)
    print(name_1 + " ------下载完成")
完整代码:
"""Scrape a gallery homepage, follow every detail page, and download all images.

Steps: request homepage -> XPath out detail-page URLs -> fetch each detail
page -> regex out image URLs and names -> save each image to ./tupian/.
"""
import requests
from lxml import etree
import re
import os

# Homepage URL of the gallery site
url = 'https://www.yituyu.com/w/4/'
# Pretend to be a real browser so the server does not reject the request
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.84 Safari/537.36'
}

# Bug fix: create the output directory up front — the original script
# crashed with FileNotFoundError on open() if ./tupian did not exist.
os.makedirs('./tupian', exist_ok=True)

# 1. Request the homepage; the timeout keeps the script from hanging forever
response = requests.get(url, headers=headers, timeout=10)
html = response.text
# 2. Parse the homepage with XPath
data = etree.HTML(html)
# 3. Extract the URL of every detail page
data_url = data.xpath('/html/body/div[3]/div[3]/ul/li/div/a/@href')

# Compile the patterns once instead of re-parsing them for every detail page
img_src_pat = re.compile('<img class="lazy t0" data-src="(.*?)".*?', re.S)
img_alt_pat = re.compile('<img class="lazy t0".*?alt="(.*?)".*?>.*?', re.S)

# 4./5. Request every detail page and read its HTML.
# xpath('@href') yields plain strings, so no ''.join is needed.
for detail_url in data_url:
    detail_html = requests.get(detail_url, headers=headers, timeout=10).text
    # 6. Regex-extract image addresses and the matching names (alt text)
    url_re = img_src_pat.findall(detail_html)
    name_re = img_alt_pat.findall(detail_html)
    # 7. Save the data; the counter restarts for every detail page,
    # matching the original script's placement of `i = 0`.
    i = 0
    for name_, img_url in zip(name_re, url_re):
        resp_3 = requests.get(img_url, headers=headers, timeout=10).content
        i = i + 1
        # Append a running number so images sharing an alt text do not clash
        name_1 = name_ + f'{i}'
        with open(f'./tupian/{name_1}.jpg', 'wb') as f:
            f.write(resp_3)
        print(name_1 + " ------下载完成")