Preface
By crawling ten thousand Google Images results, this article offers a quick hands-on introduction to the two major crawling tools, Selenium and Requests. The approach is to first use Selenium to locate the image links on the Google Images results page and save them locally, then use the requests library to download the images from those locally saved links.
I. Locating image links with selenium
1. Reading in the keywords for the images to crawl
Code for reading in the keyword CSV file:
def load_date(file_name):
    with open(file_name) as f:
        reader = csv.reader(f)
        header_row = next(reader)  # skip the header row
        content = []
        for row in reader:
            print(row)
            content.append(row)
    return content  # return the list of keyword rows from the file
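For reference, the keyword file (the 1.csv used in the full code below) is expected to have a header row followed by one keyword per row; a hypothetical call looks like this (the sample contents shown in the comments are my assumption, not taken from the original):

# Hypothetical layout of 1.csv: a header row, then one keyword per row
# keyword
# barber
# bakery
content = load_date(r'C:\Users\HP\Desktop\graduate student\1.csv')
print(content)  # e.g. [['barber'], ['bakery'], ...]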
2. Locating elements and extracting links
Initializing the webdriver object (excerpted; the options object is configured in the full code in section 3):
# Use a proxy IP
options.add_argument('--proxy-server=http://127.0.0.1:10809')
wd = webdriver.Chrome(service=Service(r'D:\Study\chromedriver.exe'), options=options)
wd.implicitly_wait(5)
First open the Google Images site and type in any keyword.
Using the browser's built-in developer tools, find the search input box via XPath or a CSS selector (we need to type each keyword into it to search for images), as well as the link of each image, as shown in the figure:
It is easy to see that the input box can be located with:
element = wd.find_element(By.XPATH, "//*[@id='REsRA']")
and each image can be located with:
elements = wd.find_elements(By.CSS_SELECTOR, "#islrg > div.islrc > div > a img")
While scrolling down you will notice that, without clicking "Show more results", Google Images displays 400 images; here we skip that button and simply grab 400 images per keyword. However, Selenium must not jump straight to the bottom of the page, or a large number of image links will be missing (their src comes back empty); instead, scroll down a little at a time and collect the links gradually. The code is as follows:
for i in range(32):
    # Get the initial page height
    # js = "return action=document.body.scrollHeight"
    # height = wd.execute_script(js)
    # print(height)  # height is 3512
    # Scroll the page down one step
    wd.execute_script('window.scrollBy(0, 1000)')
    sleep(5)
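As a variant, instead of hard-coding 32 iterations you can keep stepping until the viewport reaches the bottom of the page; the following is only a sketch of that idea (scroll_in_steps, step, pause, and max_rounds are my own names, not from the original code):

from time import sleep

def scroll_in_steps(wd, step=1000, pause=2, max_rounds=100):
    # Scroll down in small increments so lazily loaded images have time to
    # render, and stop once the viewport reaches the bottom of the page.
    for _ in range(max_rounds):
        wd.execute_script(f'window.scrollBy(0, {step})')
        sleep(pause)
        at_bottom = wd.execute_script(
            'return window.pageYOffset + window.innerHeight'
            ' >= document.body.scrollHeight - 1')
        if at_bottom:
            break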
During scraping, multithreading can speed things up:
def multi_thread(content):
    print("multi_thread begin")
    threads = []
    # Create one thread per keyword to fetch the image links of 25 keywords
    for i in range(min(25, len(content))):  # guard against fewer than 25 rows
        threads.append(
            Thread(target=craw, args=(",".join(content[i]),))
        )
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    print("multi_thread end")
Finally, save the links captured by Selenium to local .txt files.
3. Full code
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from time import sleep
import time
import csv
from threading import Thread


def load_date(file_name):
    with open(file_name) as f:
        reader = csv.reader(f)
        header_row = next(reader)  # skip the header row
        content = []
        for row in reader:
            print(row)
            content.append(row)
    return content


def multi_thread(content):
    print("multi_thread begin")
    threads = []
    # Create one thread per keyword to fetch the image links of 25 keywords
    for i in range(min(25, len(content))):  # guard against fewer than 25 rows
        threads.append(
            Thread(target=craw, args=(",".join(content[i]),))
        )
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    print("multi_thread end")


def craw(key_word):
    prefs = {
        'profile.default_content_setting_values': {
            'notifications': 2  # suppress Chrome notifications
        },
        'credentials_enable_service': False,  # disable Chrome's save-password prompt
        'profile.password_manager_enabled': False  # disable the built-in password manager
    }
    # Create a configuration object
    options = webdriver.ChromeOptions()
    options.add_experimental_option('prefs', prefs)
    options.add_experimental_option('excludeSwitches', ['enable-automation'])  # hide the "Chrome is being controlled by automated software" banner
    # Headless-mode options
    # options.add_argument('--no-sandbox')  # work around the "DevToolsActivePort file doesn't exist" error
    # options.add_argument('--disable-dev-shm-usage')
    # options.add_argument('--disable-gpu')  # Google's docs mention this flag to avoid a bug
    # options.add_argument('--headless')
    # Use a proxy IP
    options.add_argument('--proxy-server=http://127.0.0.1:10809')
    wd = webdriver.Chrome(service=Service(r'D:\Study\chromedriver.exe'), options=options)
    wd.implicitly_wait(5)
    url = "https://www.google.com/search?q=barber&sxsrf=ALiCzsYUXHmAB-W1-C92ipNPeH_WRVUb0A:1657352656150&source=lnms&tbm=isch&sa=X&ved=2ahUKEwiGiNnip-v4AhVOl2oFHbZEDr4Q_AUoAnoECAMQBA&biw=1536&bih=754&dpr=1.25"
    wd.get(url=url)
    sleep(5)
    element = wd.find_element(By.XPATH, "//*[@id='REsRA']")
    element.clear()
    element.send_keys(key_word + '\n')
    sleep(2)
    # print(wd.page_source)
    # Essential for crawling dynamic sites: keep scrolling down so that the data gets loaded and rendered
    for i in range(32):
        # Get the initial page height
        # js = "return action=document.body.scrollHeight"
        # height = wd.execute_script(js)
        # print(height)  # height is 3512
        # Scroll the page down one step
        wd.execute_script('window.scrollBy(0, 1000)')
        sleep(5)
    sleep(5)
    elements = wd.find_elements(By.CSS_SELECTOR, "#islrg > div.islrc > div > a img")
    print("elements's len:")
    print(len(elements))
    f = open(key_word + ".txt", "a", encoding="utf-8")
    for i in range(min(400, len(elements))):  # guard against fewer than 400 results
        print(i)
        href = elements[i].get_attribute('src')
        if href is None:
            print(href)
            continue
        print(href)
        f.write(href + '\n')
        print('\n')
        if i % 50 == 0:
            sleep(10)
    f.close()
    wd.close()


if __name__ == '__main__':
    file_name = r'C:\Users\HP\Desktop\graduate student\1.csv'
    content = load_date(file_name)
    sleep(2)
    # Convert the list to a string
    # str_content = ",".join(content[0])
    # print(str_content)
    start = time.time()
    multi_thread(content)
    end = time.time()
    print("cost:", end - start)
    # f = open(str_content+'.txt','a',encoding='utf-8')
    # for i in range(25):
    #     f.write(",".join(content[i])+'\n')
    # f.close()
The result is shown in the figure below:
Notice that not all of the saved links are image URLs; some are browser-cached data (base64-encoded data: URIs). These cannot be fetched with the requests library and have to be decoded instead; the details are in Part II.
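For a quick illustration, a data: link embeds the image bytes directly in base64 after the comma, so it is decoded rather than downloaded; a minimal sketch (the sample URI below is made up):

import base64

link = "data:image/jpeg;base64,/9j/4AAQSkZJRg=="  # hypothetical sample
if link.startswith("data:image"):
    payload = link.split(",")[1]  # the part after the comma is base64
    img_bytes = base64.b64decode(payload)
    print(len(img_bytes), "bytes decoded")  # these bytes would go into a .jpg file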
II. Downloading images with requests
1. Collecting the .txt files in the folder
Filter out the .txt files with the os module plus filter and a lambda:
# Read the .txt files in the folder
path = r"C:\Users\HP\PycharmProjects\untitled\Selenium\WeekTask"
# path1 = r"C:\Users\HP\PycharmProjects\untitled\Selenium\WeekTask\图库"
files = os.listdir(path)
print(files)
txt_files = list(filter(lambda x: x[-4:] == '.txt', files))
print(txt_files)
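Equivalently, the standard-library glob module can express the same filtering (shown only as an alternative):

import glob
import os

path = r"C:\Users\HP\PycharmProjects\untitled\Selenium\WeekTask"
# Same result as the filter/lambda approach above
txt_files = [os.path.basename(p) for p in glob.glob(os.path.join(path, '*.txt'))]
print(txt_files)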
2. Downloading the images
The code is as follows:
# Download the images and save them into per-keyword folders
def getpicture(file):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        'Referer': 'https://www.google.com/'
    }
    with open(file, 'r', encoding='utf-8') as fc:
        count = len(fc.readlines())
    print(count)
    # Create a folder named after the keyword if it does not exist yet
    folder = f'{path1}/{file.title().split(".")[0]}'
    if not os.path.exists(folder):
        os.makedirs(folder)
    i = 0
    with open(file, 'r', encoding='utf-8') as f:
        for url in f:
            url = url.strip()  # drop the trailing newline
            print(url)
            if not url:
                continue
            if "data:image/jpeg" in url:
                # Note: the data-URI prefix before the comma must be stripped
                url = url.split(',')[1]
                print(url)
                img = base64.b64decode(url)
                with open(os.path.join(folder, str(i) + '.jpg'), "wb") as fh:
                    fh.write(img)
                i += 1
                continue
            i += 1
            proxy = {
                "http": "http://127.0.0.1:10809",
                "https": "http://127.0.0.1:10809"
            }
            # proxy = proxy_random()
            response = requests.get(url=url, headers=headers, proxies=proxy, verify=False)
            with open(os.path.join(folder, str(i) + '.jpg'), "wb") as f1:
                f1.write(response.content)
            # Pause briefly after each image to avoid getting the IP banned
            sleep(0.25)
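One side note: requests.get is called with verify=False, so urllib3 emits an InsecureRequestWarning for every request; if that noise is unwanted, it can be silenced like this (optional):

import urllib3

# Suppress the InsecureRequestWarning triggered by verify=False
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)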
3. Full code and crawling results
import requests
import os
import linecache
import random
import time
from time import sleep
import base64
from threading import Thread


def multi_thread(content):
    print("multi_thread begin")
    threads = []
    # Create the threads, one per keyword file
    for i in range(min(25, len(content))):  # guard against fewer than 25 files
        threads.append(
            Thread(target=getpicture, args=("".join(content[i]),))
        )
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
    print("multi_thread end")


# Pick a random proxy IP
def proxy_random():
    a = random.randrange(1, 150)
    # Read line a from the file 1.txt
    proxy = linecache.getline(r'C:\Users\HP\PycharmProjects\untitled\Requests\1.txt', a).rstrip('\n')
    proxies = {
        "http": "http://" + proxy,
        "https": "http://" + proxy,
    }
    print(proxies)
    return proxies


# Download the images and save them into per-keyword folders
def getpicture(file):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
        'Referer': 'https://www.google.com/'
    }
    with open(file, 'r', encoding='utf-8') as fc:
        count = len(fc.readlines())
    print(count)
    # Create a folder named after the keyword if it does not exist yet
    folder = f'{path1}/{file.title().split(".")[0]}'
    if not os.path.exists(folder):
        os.makedirs(folder)
    i = 0
    with open(file, 'r', encoding='utf-8') as f:
        for url in f:
            url = url.strip()  # drop the trailing newline
            print(url)
            if not url:
                continue
            if "data:image/jpeg" in url:
                # Note: the data-URI prefix before the comma must be stripped
                url = url.split(',')[1]
                print(url)
                img = base64.b64decode(url)
                with open(os.path.join(folder, str(i) + '.jpg'), "wb") as fh:
                    fh.write(img)
                i += 1
                continue
            i += 1
            proxy = {
                "http": "http://127.0.0.1:10809",
                "https": "http://127.0.0.1:10809"
            }
            # proxy = proxy_random()
            response = requests.get(url=url, headers=headers, proxies=proxy, verify=False)
            with open(os.path.join(folder, str(i) + '.jpg'), "wb") as f1:
                f1.write(response.content)
            # Pause briefly after each image to avoid getting the IP banned
            sleep(0.25)


if __name__ == '__main__':
    # Read the .txt files in the folder
    path = r"C:\Users\HP\PycharmProjects\untitled\Selenium\WeekTask"
    path1 = r"C:\Users\HP\PycharmProjects\untitled\Selenium\WeekTask\图库"
    files = os.listdir(path)
    print(files)
    txt_files = list(filter(lambda x: x[-4:] == '.txt', files))
    print(txt_files)
    # Test on a single keyword file
    # getpicture(txt_files[0])
    start = time.time()
    multi_thread(txt_files)
    end = time.time()
    print("cost:", end - start)
The final results are as follows:
Summary
That wraps up today's content. This article used Selenium and Requests to crawl a large number of Google Images. The key points to remember: the image page takes time to load while scrolling, so simulating the scroll step by step with Selenium is essential, as is understanding the difference between ordinary image URLs and browser-cached (base64-encoded) images.