超清壁纸爬虫
概述
本文介绍如何使用python爬虫实现超清壁纸的下载。
思路:通过requests模块对网页发起url请求,使用xpath解析提取图片链接,将图片的二进制数据保存到当前目录下以搜索关键字命名的文件夹中。
提示:以下是实现源码,仅供参考。
一、超清壁纸搜索并下载
import os
import subprocess
import sys

# Bootstrap: try to import the third-party dependencies; on failure,
# install them with pip and import again.  `sys.executable -m pip` targets
# the interpreter actually running this script (os.popen with a bare "pip"
# may install into a different Python), and catching ImportError instead of
# a bare `except` avoids masking unrelated errors (e.g. KeyboardInterrupt).
try:
    from lxml import etree
    import requests
except ImportError:
    print("正在安装必需的数据库, 请稍等...")
    subprocess.run(
        [sys.executable, "-m", "pip", "install",
         "-i", "https://pypi.doubanio.com/simple/", "lxml", "requests"],
        check=False,  # best-effort: the re-import below fails loudly if this didn't work
    )
    from lxml import etree
    import requests
def meitu_search_engine(keyword='美女', page_num=1):
    """ 搜索美图, 并自动下载到当前目录 """
    # Search-result page for this keyword / result page number.
    url_1 = f"https://www.bizhizu.cn/search/{keyword}/{page_num}.html"
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/72.0.3626.81 Safari/537.36 SE 2.X MetaSr 1.0"
    }

    def _fetch(url, tries=5, timeout=3):
        # Bounded retry: the original used three copies of an unbounded
        # `while True` + bare `except`, which could spin forever on a dead
        # link.  Returns the response, or None when every attempt fails.
        for _ in range(tries):
            try:
                resp = requests.get(url=url, headers=headers, timeout=timeout)
                resp.encoding = 'utf-8'
                return resp
            except requests.RequestException:
                continue
        return None

    response = requests.get(url=url_1, headers=headers, timeout=10)  # fetch the search page
    response.encoding = 'utf-8'
    if response.status_code != 200:
        return  # nothing to do when the search page itself is unreachable
    print("\t网页访问成功!")
    tree_1 = etree.HTML(response.text)  # parse once, query with xpath below
    # Picture titles and the matching detail-page links.
    list_a = tree_1.xpath('//div[@class="imgcont"]/ul/li/a/text()')
    list_a_href = tree_1.xpath('//div[@class="imgcont"]/ul/li/a[2]/@href')

    list_meitu_source_code = []
    for link in list_a_href:
        # Step 1: detail page -> link to the full-size photo page.
        response_2 = _fetch(link)
        if response_2 is None:
            continue
        tree_2 = etree.HTML(response_2.text)
        list_link_2 = tree_2.xpath('//p[@class="text_con" and @id="photoDesc"]/a[1]/@href')
        print("\t爬取图片地址链接:", list_link_2)
        if not list_link_2:  # guard: original indexed [0] unconditionally
            continue
        # Step 2: photo page -> direct image URL.
        response_3 = _fetch(f"https://www.bizhizu.cn{list_link_2[0]}")
        if response_3 is None:
            continue
        tree_3 = etree.HTML(response_3.text)
        list_link_3 = tree_3.xpath('//div[@class="show-pages-imgshow"]/img/@src')
        if not list_link_3:
            continue
        # Step 3: download the image bytes.
        response_4 = _fetch(list_link_3[0])
        if response_4 is None:
            continue
        list_meitu_source_code.append(response_4.content)

    list_name = list_a
    print(f"\t爬取的照片数量 = {len(list_name)}")
    print(f"\t爬取的照片链接数量 = {len(list_meitu_source_code)}")
    # zip() stops at the shorter list, so a partially failed crawl still
    # writes what was downloaded.  Path separators are stripped from the
    # title so a name like "a/b" cannot escape the keyword folder.
    for name, image_bytes in zip(list_name, list_meitu_source_code):
        safe_name = name.replace('/', '_').replace('\\', '_')
        with open(f"./{keyword}/{safe_name}.jpg", mode='wb') as obj:
            obj.write(image_bytes)
    return
def check_floder(file_name):
    """Create the directory *file_name* if it does not already exist.

    Replaces the original listdir-then-mkdir probe (race-prone, bare
    `except`, and unable to create missing parents) with the idiomatic
    `os.makedirs(..., exist_ok=True)`.
    """
    os.makedirs(file_name, exist_ok=True)
    return
if __name__ == '__main__':
    while True:
        # Prompt until a non-empty keyword is entered.  The original forced
        # a ZeroDivisionError (`8/0`) inside a bare try/except to loop —
        # a plain condition check says the same thing directly.
        while True:
            keyword = input("请输入搜索关键字:").strip()
            if keyword:
                break
            print("\t您输入的不合法!")
        # Prompt until a positive integer page count is entered; only the
        # int() conversion can raise, so only ValueError is caught.
        while True:
            try:
                page_num = int(input("你打算取多少页图片啊?(>0):"))
            except ValueError:
                print("\t您输入的不合法!")
                continue
            if page_num > 0:
                break
            print("\t您输入的不合法!")
        check_floder(file_name=keyword)
        print('\t-------------------------------')
        for i in range(1, page_num + 1):
            print(f"\t当前爬取的是第{i}页")
            try:
                meitu_search_engine(keyword=keyword, page_num=i)
            except Exception:
                # Best-effort per page: one failed page should not abort
                # the remaining pages.
                print("\t没有搜到当前页的数据~")
        print('\t-------------------------------')
        select = input("\n\t您打算继续搜索更多的吗?(y/n):")
        # Case-insensitive check also accepts "no"/"nO", which the original
        # literal list ['n', 'N', 'NO', 'No'] missed.
        if select.strip().lower() in ('n', 'no'):
            break
        print()