导入模块，设置请求头模拟浏览器
import os
from urllib.parse import urljoin

import requests
from lxml import etree
# Request headers sent with every HTTP call; the User-Agent makes the client
# identify itself as a desktop Chrome browser.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/92.0.4515.159 Safari/537.36'
    ),
}
发送请求，提取数据
# Fetch the listing page at url_2. A timeout is set because requests waits
# indefinitely by default, and raise_for_status() stops early on an HTTP error
# instead of parsing an error page that contains no wallpaper links.
list_resp = requests.get(url_2, headers=headers, timeout=10)
list_resp.raise_for_status()
# The body is decoded explicitly as GBK rather than relying on auto-detection.
response = list_resp.content.decode('gbk')
html = etree.HTML(response)
# Collect the href of every wallpaper entry in the <div class="list"> block.
data = html.xpath('//div[@class="list"]/ul/li/a/@href')
使用 for 循环提取图片详情页的 URL 和图片名
# Walk every wallpaper link found on the listing page and download the image
# behind it. Each wallpaper takes three requests: detail page -> download
# page -> image file.
for href in data:
    # urljoin copes with root-relative, relative and absolute hrefs alike,
    # whereas plain string concatenation only works for root-relative ones.
    new_url = urljoin('http://www.netbian.com', href)
    res = requests.get(new_url, headers=headers, timeout=10).content.decode('gbk')
    data_1 = etree.HTML(res)

    title = data_1.xpath('//div[@class="pic"]/p/a/img/@title')
    pic_url = data_1.xpath('//div[@class="pic"]/p/a/@href')
    if not title or not pic_url:
        # Not a wallpaper detail page (or the layout changed): skip it rather
        # than building an empty title / bogus URL.
        continue
    # Take the first match instead of concatenating every match.
    new_title = title[0]
    pic_str = pic_url[0]
    pic_url2 = urljoin('http://www.netbian.com', pic_str)

    # Request the download page that links to the actual image file.
    html_data = requests.get(pic_url2, headers=headers, timeout=10).content.decode('gbk')
    html_obj = etree.HTML(html_data)
    img_url = html_obj.xpath('//table/tr/td//a/@href')
    if not img_url:
        continue
    # Assumes the first link in the table is the image itself - TODO confirm
    # against the live page if the site layout changes.
    img_str = urljoin(pic_url2, img_url[0])

    img_resp = requests.get(img_str, headers=headers, timeout=10)
    if not img_resp.ok:
        # Do not hand an HTTP error page to the save step as image bytes.
        continue
    res_2 = img_resp.content
保存数据
# Write the downloaded bytes to ./彼岸壁纸/<title>.jpg. The folder is created
# on first use because open() does not create missing parent directories.
os.makedirs('./彼岸壁纸', exist_ok=True)
with open(f'./彼岸壁纸/{new_title}.jpg', 'wb') as f:
    f.write(res_2)
print('正在下载', new_title)
使用 for 循环遍历多页数据
# Ask how many listing pages to crawl, then derive the URL of each page.
page = int(input("请输入爬取的页数:"))
for i in range(1, page + 1):
    # The first page has no numeric suffix; later pages are index_<n>.htm.
    if i <= 1:
        url_2 = 'http://www.netbian.com/index.htm'
    else:
        url_2 = 'http://www.netbian.com/index_{}.htm'.format(i)
完整代码
import requests
from lxml import etree
import os
from urllib.parse import urljoin

# Site root used to resolve the hrefs scraped from the pages.
BASE_URL = 'http://www.netbian.com'
# Folder the wallpapers are written to.
SAVE_DIR = './彼岸壁纸'
# Seconds to wait for any single HTTP request; requests has no default timeout.
REQUEST_TIMEOUT = 10
# Characters that cannot appear in a file name on common file systems.
FORBIDDEN_FILENAME_CHARS = '\\/:*?"<>|'

# Request headers sent with every HTTP call; the User-Agent makes the client
# identify itself as a desktop Chrome browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36'
}


def build_page_url(page_no):
    """Return the listing-page URL for the 1-based page number *page_no*.

    The first page (and anything below it) maps to ``index.htm``; page ``n``
    maps to ``index_n.htm``.
    """
    if page_no <= 1:
        return BASE_URL + '/index.htm'
    return BASE_URL + '/index_{}.htm'.format(page_no)


def safe_filename(title):
    """Return *title* with path separators and reserved characters replaced.

    Each character in ``FORBIDDEN_FILENAME_CHARS`` becomes ``_`` and
    surrounding whitespace is stripped, so the result can be used as a single
    path component.
    """
    cleaned = ''.join(
        '_' if ch in FORBIDDEN_FILENAME_CHARS else ch for ch in title
    )
    return cleaned.strip()


def fetch_html(url):
    """Download *url* and return it parsed as an lxml HTML tree.

    Raises:
        requests.RequestException: on connection errors, timeouts or a
            non-2xx HTTP status.
    """
    resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    # Pages are decoded as GBK; undecodable bytes are replaced so that one
    # stray byte does not abort the whole crawl.
    return etree.HTML(resp.content.decode('gbk', errors='replace'))


def download_wallpaper(detail_href):
    """Download the wallpaper reachable from the detail-page link *detail_href*.

    Follows detail page -> download page -> image file and stores the image
    as ``<title>.jpg`` inside ``SAVE_DIR``.

    Returns:
        The wallpaper title on success, or ``None`` when the page does not
        contain the expected title / links.

    Raises:
        requests.RequestException: if any of the three requests fails.
    """
    # urljoin handles root-relative, relative and absolute hrefs alike.
    detail_url = urljoin(BASE_URL, detail_href)
    detail_tree = fetch_html(detail_url)
    titles = detail_tree.xpath('//div[@class="pic"]/p/a/img/@title')
    links = detail_tree.xpath('//div[@class="pic"]/p/a/@href')
    if not titles or not links:
        # Not a wallpaper detail page; nothing to download.
        return None
    new_title = titles[0]

    download_page_url = urljoin(detail_url, links[0])
    download_tree = fetch_html(download_page_url)
    img_urls = download_tree.xpath('//table/tr/td//a/@href')
    if not img_urls:
        return None
    # Assumes the first link in the table is the image itself - TODO confirm
    # against the live page if the site layout changes.
    img_url = urljoin(download_page_url, img_urls[0])

    img_resp = requests.get(img_url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Never store an HTTP error page under a .jpg name.
    img_resp.raise_for_status()

    # open() does not create missing parent directories.
    os.makedirs(SAVE_DIR, exist_ok=True)
    with open(f'{SAVE_DIR}/{safe_filename(new_title)}.jpg', 'wb') as f:
        f.write(img_resp.content)
    print('正在下载', new_title)
    return new_title


def main():
    """Prompt for a page count and download every wallpaper on those pages."""
    page = int(input("请输入爬取的页数:"))
    for i in range(1, page + 1):
        url_2 = build_page_url(i)
        try:
            list_tree = fetch_html(url_2)
        except requests.RequestException as exc:
            # A listing page that cannot be fetched should not stop the
            # remaining pages from being crawled.
            print('下载失败', url_2, exc)
            continue
        for href in list_tree.xpath('//div[@class="list"]/ul/li/a/@href'):
            try:
                download_wallpaper(href)
            except requests.RequestException as exc:
                # Skip just this wallpaper and carry on with the next one.
                print('下载失败', href, exc)


if __name__ == '__main__':
    main()