selenium爬取图片实例

最新推荐文章于 2024-04-07 11:47:37 发布

超哥--

最新推荐文章于 2024-04-07 11:47:37 发布

阅读量4.4k

点赞数 6

文章标签： python selenium

本文链接：https://blog.csdn.net/weixin_50835854/article/details/117170894

版权

项目场景：

最近基本学完了selenium自动化抓取框架，实践是检验真理的唯一标准，还是要在实战中进行提升。

问题描述：

此次没法在原来的爬虫基础上进行改进了，从requests到selenium的变化还是非常大的，要对我原来的爬虫进行大换血式的改造。有兴趣的可以看我之前从python爬取图片到增量式爬虫的代码。

解决方案：

import re
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import os

def hide():
    """Build the Chrome options used for every browser session in this script.

    Returns:
        An ``Options`` object with the four command-line switches below
        added in the listed order.
    """
    switches = (
        # Per the original author's note, Google's docs suggest this to dodge a bug.
        '--disable-gpu',
        # Hide scrollbars; the original author added this for some unusual pages.
        '--hide-scrollbars',
        # Do not load images in the browser, to speed up page loads.
        'blink-settings=imagesEnabled=false',
        # Run Chrome without a visible window.
        '--headless',
    )
    opts = Options()
    for switch in switches:
        opts.add_argument(switch)
    return opts

def Gethtml(url):
    """Crawl listing pages starting at *url* and download each linked picture.

    Each listing page is opened in its own Chrome session, which is closed
    as soon as the link addresses have been read. The last anchor matching
    ``div[class='slist'] li a`` is treated as the "next page" link, and
    crawling continues only while the second-to-last path segment of that
    link is ``4kmeinv``. When that check fails, the page is not processed
    and the crawl stops.

    Args:
        url: Address of the first listing page to visit.
    """
    visited = set()
    # Iterate instead of recursing: no recursion-depth limit, and no pile of
    # browser processes kept alive by outer stack frames.
    while url and url not in visited:
        visited.add(url)
        driver = webdriver.Chrome(options=hide())
        try:
            driver.get(url)
            # The string locator form works on Selenium 3 and 4; the
            # find_elements_by_* helpers were removed in Selenium 4.3.
            anchors = driver.find_elements(
                "css selector", "div[class='slist'] li a")
            # Read the hrefs now: elements are unusable after quit().
            hrefs = [str(a.get_attribute("href")) for a in anchors]
        finally:
            driver.quit()

        if not hrefs:
            return
        next_url = hrefs[-1]
        segments = next_url.split("/")
        if len(segments) < 2 or segments[-2] != "4kmeinv":
            return

        # NOTE(review): the slice drops the last TWO anchors, as the original
        # did - confirm against the page markup that both are non-picture links.
        for href in hrefs[:-2]:
            huoqvpicture(href)
        print("翻页")
        url = next_url

def huoqvpicture(url):
    """Open a picture detail page and download the image(s) it shows.

    Every ``<img>`` matching ``div[class='photo-pic'] a img`` is read for its
    ``src`` and ``title`` attributes; each pair is handed to ``GetPicture``.

    Args:
        url: Address of the detail page.
    """
    driver = webdriver.Chrome(options=hide())
    try:
        driver.get(url)
        # find_elements returns a list, so each element must be read
        # individually (the original called get_attribute on the list itself).
        images = driver.find_elements(
            "css selector", "div[class='photo-pic'] a img")
        targets = [
            (str(img.get_attribute("src")), str(img.get_attribute("title")))
            for img in images
        ]
    finally:
        # Always release the browser process, even if the page failed to load.
        driver.quit()

    for src, title in targets:
        # Print the picture name so crawl progress is visible.
        print(title)
        GetPicture(src, title)

def GetPicture(url,name):
    """Download *url* and save it as ``./<name>.jpg`` unless that file exists.

    Prints a status message for each outcome: saved, already present, or
    failed. Network and file-system errors are reported, not raised.

    Args:
        url: Address of the image file.
        name: File name (without extension) to save the image under.
    """
    root = "./"
    path = root + name + ".jpg"
    if os.path.exists(path):
        print("文件已存在")
        return
    try:
        os.makedirs(root, exist_ok=True)
        # A timeout keeps one stalled download from hanging the whole crawl.
        r = requests.get(url, timeout=30)
        # Do not save an HTTP error page under a .jpg name.
        r.raise_for_status()
        with open(path, 'wb') as f:
            f.write(r.content)
    except (requests.RequestException, OSError):
        print("爬取失败")
        return
    print("文件保存成功")

def main():
    """Start the crawl from the first page of the 4kmeinv listing."""
    start_page = "https://pic.netbian.com/4kmeinv/index.html"
    Gethtml(start_page)


main()

现在按函数分类来逐个分析一下。

import re
import requests
from selenium import webdriver 
from selenium.webdriver.chrome.options import Options
import os

导入需要的库

def hide():
    """Build the Chrome options used for every browser session in this script.

    Returns:
        An ``Options`` object with the four command-line switches below
        added in the listed order.
    """
    switches = (
        # Per the original author's note, Google's docs suggest this to dodge a bug.
        '--disable-gpu',
        # Hide scrollbars; the original author added this for some unusual pages.
        '--hide-scrollbars',
        # Do not load images in the browser, to speed up page loads.
        'blink-settings=imagesEnabled=false',
        # Run Chrome without a visible window.
        '--headless',
    )
    opts = Options()
    for switch in switches:
        opts.add_argument(switch)
    return opts

这个函数用来隐藏浏览器界面，selenium的原理就是在系统中启动一个浏览器进行拟人的操作，刚开始写selenium不建议隐藏，这样可以比较清晰地看到爬虫的爬取过程，等整个流程写完了再调用这个函数即可。

def Gethtml(url):
    """Crawl listing pages starting at *url* and download each linked picture.

    Each listing page is opened in its own Chrome session, which is closed
    as soon as the link addresses have been read. The last anchor matching
    ``div[class='slist'] li a`` is treated as the "next page" link, and
    crawling continues only while the second-to-last path segment of that
    link is ``4kmeinv``. When that check fails, the page is not processed
    and the crawl stops.

    Args:
        url: Address of the first listing page to visit.
    """
    visited = set()
    # Iterate instead of recursing: no recursion-depth limit, and no pile of
    # browser processes kept alive by outer stack frames.
    while url and url not in visited:
        visited.add(url)
        # Headless session; drop the options argument to watch the browser.
        driver = webdriver.Chrome(options=hide())
        try:
            driver.get(url)
            # The string locator form works on Selenium 3 and 4; the
            # find_elements_by_* helpers were removed in Selenium 4.3.
            anchors = driver.find_elements(
                "css selector", "div[class='slist'] li a")
            # Read the hrefs now: elements are unusable after quit().
            hrefs = [str(a.get_attribute("href")) for a in anchors]
        finally:
            driver.quit()

        if not hrefs:
            return
        # The last anchor is expected to be the "next page" link.
        next_url = hrefs[-1]
        segments = next_url.split("/")
        # Stop once the last link no longer points into the 4kmeinv section.
        if len(segments) < 2 or segments[-2] != "4kmeinv":
            return

        # NOTE(review): the slice drops the last TWO anchors, as the original
        # did - confirm against the page markup that both are non-picture links.
        for href in hrefs[:-2]:
            # Visit each detail page to fetch the full-resolution picture.
            huoqvpicture(href)
        print("翻页")
        # Continue with the next listing page.
        url = next_url

第一个函数的作用主要是获得高清照片对应的url，因为展示界面的图片清晰度都比较低。

def huoqvpicture(url):
    """Open a picture detail page and download the image(s) it shows.

    Every ``<img>`` matching ``div[class='photo-pic'] a img`` is read for its
    ``src`` and ``title`` attributes; each pair is handed to ``GetPicture``.

    Args:
        url: Address of the detail page.
    """
    driver = webdriver.Chrome(options=hide())
    try:
        driver.get(url)
        # Locate the <img> tag(s) holding the jpg. find_elements returns a
        # list, so each element must be read individually (the original
        # called get_attribute on the list itself).
        images = driver.find_elements(
            "css selector", "div[class='photo-pic'] a img")
        targets = [
            (str(img.get_attribute("src")), str(img.get_attribute("title")))
            for img in images
        ]
    finally:
        # Always release the browser process, even if the page failed to load.
        driver.quit()

    for src, title in targets:
        # Print the picture name so crawl progress is visible.
        print(title)
        # Pass the jpg address and the picture name on for saving.
        GetPicture(src, title)

第二个函数的作用是获取高清图片的jpg地址和名字

def GetPicture(url,name):
    """Download *url* and save it as ``./<name>.jpg`` unless that file exists.

    Prints a status message for each outcome: saved, already present, or
    failed. Network and file-system errors are reported, not raised.

    Args:
        url: Address of the image file.
        name: File name (without extension) to save the image under.
    """
    root = "./"
    path = root + name + ".jpg"
    if os.path.exists(path):
        print("文件已存在")
        return
    try:
        os.makedirs(root, exist_ok=True)
        # A timeout keeps one stalled download from hanging the whole crawl.
        r = requests.get(url, timeout=30)
        # Do not save an HTTP error page under a .jpg name.
        r.raise_for_status()
        with open(path, 'wb') as f:
            f.write(r.content)
    except (requests.RequestException, OSError):
        print("爬取失败")
        return
    print("文件保存成功")