本文纯粹为了技术学习,内容如有侵权,请告知!
目标URL:https://bing.ioliu.cn
爬取的图片
Python用到的主要模块
import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
代码实现
# coding: utf-8
import datetime
import time
import random
import requests
from lxml import etree
from concurrent.futures import ThreadPoolExecutor
# Pool of desktop-browser User-Agent strings; one is chosen at random per
# run so the scraper's requests look less uniform to the server.
user_agent = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
# Request headers with a randomly selected UA.
header = {"User-Agent": random.choice(user_agent)}
# Directory the images are saved into.
# Fix: the original used r"C:\Spider\bing\ " — a raw string cannot end with
# a backslash, so a trailing space was appended, which put a leading space
# in every saved file name. Escape the backslashes instead.
path = "C:\\Spider\\bing\\"
# Start the clock to time the whole job.
start = datetime.datetime.now()
# 爬取方法,传入多个页面的url
def get_images(url):
    """Download every wallpaper linked from one listing page of bing.ioliu.cn.

    Args:
        url: URL of one listing page, e.g. "https://bing.ioliu.cn/?p=1".

    Side effects:
        Saves each image as a .jpg file under the module-level ``path``,
        prints each file name, and sleeps 1-2 s between downloads.
    """
    # Fetch the listing page. A timeout is essential here: without one a
    # stalled connection would hang the worker thread forever.
    response = requests.get(url, headers=header, timeout=10)
    # Parse the HTML and extract the download links via XPath.
    html = etree.HTML(response.text)
    img_link = html.xpath('//a[@class="ctrl download"]/@href')
    # The hrefs are site-relative, so prefix the site root.
    img_link = ['https://bing.ioliu.cn' + x for x in img_link]
    # Download each image and write it to disk.
    for img in img_link:
        img_content = requests.get(img, headers=header, timeout=10).content
        # Build a file name from the URL with the query string stripped.
        # NOTE(review): [-6:-1] keeps 5 chars and drops the last character
        # of the id — if a 6-char suffix was intended, use [-6:]; kept
        # as-is to preserve the original naming scheme.
        img_name = img.split('?')[0][-6:-1] + '.jpg'
        # Print the name so progress is visible while scraping.
        print(img_name)
        with open(path + img_name, 'wb') as f:
            f.write(img_content)
        # Small random pause to be gentle with the server.
        time.sleep(random.randint(1, 2))
if __name__ == '__main__':
    # Listing pages follow the pattern https://bing.ioliu.cn/?p=<n>, so
    # pages 1 through 50 are enumerated directly (adjust the range to
    # scrape more or fewer pages).
    page_urls = ['https://bing.ioliu.cn/?p=' + str(page) for page in range(1, 51)]
    # Fan the pages out across a small thread pool so the network waits
    # overlap instead of running one page at a time.
    with ThreadPoolExecutor(max_workers=8) as pool:
        pool.map(get_images, page_urls)
    # Report the total wall-clock time for the whole run.
    delta = (datetime.datetime.now() - start).total_seconds()
    print(f'用时{delta}秒')
提示
代码只是实现简单的图片下载功能,目标URL具有反爬机制,代码中并没有反反爬功能,如若复制运行,请自行斟酌。