python selenium selenium-wire使用代理

前言

最近使用selenium、selenium-wire爬取数据,在使用代理时查阅很多资料,在使用过程中发现很多资料、博客中都是错误的用法,误导初学selenium使用代理的开发者

描述:

我这里使用的是Python 3.12.2   selenium==4.23.1   selenium-wire==5.1.0

1.selenium使用代理

1.1核心代码

注意这里是python selenium使用代理的方法(原生selenium),亲测selenium-wire不可以这么用,这么用使用代理是不生效的,有些博客上说selenium-wire使用下面的写法不报错,完全是误导大家,selenium-wire使用下面的写法,根本就没使用代理ip(没连接代理ip),又怎么能报错?

建议大家使用稳定的、支持https的代理ip,支持https的代理才能访问https的网站,不要使用免费代理,懂的都懂

写法一

# Proxy endpoint written as "host:port"; replace it with your own proxy.
ip_port = '117.86.185.68:8089'
# Resulting flag: --proxy-server=117.86.185.68:8089
options.add_argument("--proxy-server=" + ip_port)

写法二

# Proxy endpoint written as "host:port"; replace it with your own proxy.
ip_port = '117.86.185.68:8089'
# Resulting flag: --proxy-server=http://117.86.185.68:8089
options.add_argument("--proxy-server=http://" + ip_port)

1.2检验是否使用代理

# Page this article loads to verify the proxy: print what it returns so the
# reported address can be compared with the configured proxy.
url = "https://httpbin.org/ip"
try:
    driver.get(url)
    print(driver.page_source)
except Exception as err:
    print(err)
finally:
    # Always close the browser, whether or not the page loaded.
    driver.quit()

1.3完整代码

chrome浏览器(chromium), chrome-win里面包含chrome和chromedriver(个人整理的),浏览器版本和chromedriver版本一致 114.0.5735.90 ,如有需要可自行提取

链接:https://pan.baidu.com/s/1vv6AfmCBFx8QDA7RE2VrIg 
提取码:6666

import os

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Directory that contains this script. The bundled "chrome-win" folder is
# looked up next to the script so the example works no matter which directory
# it is launched from. When __file__ is undefined (for example in an
# interactive session) fall back to the current working directory.
try:
    current_path = os.path.dirname(os.path.abspath(__file__))
except NameError:
    current_path = os.getcwd()
# Folder holding the Chrome build
chrome_location = os.path.join(current_path, 'chrome-win')
# Full path to chrome.exe
browser_location = os.path.join(chrome_location, 'chrome.exe')
# Full path to chromedriver.exe
driver_location = os.path.join(chrome_location, 'chromedriver.exe')
# Service object pointing at the ChromeDriver executable
service = Service(driver_location)
# Chrome options
options = Options()
# options.add_argument("--headless")  # headless mode (no window)
# Disable image loading
# options.add_argument('blink-settings=imagesEnabled=false')
# Tell Selenium which Chrome binary to launch (full path to chrome.exe)
options.binary_location = browser_location
# Hide the "Chrome is being controlled by automated test software" banner
options.add_experimental_option("excludeSwitches", ['enable-automation'])
# Proxy endpoint written as "host:port"; replace it with your own proxy.
ip_port = '117.86.185.68:8089'
# Resulting flag: --proxy-server=117.86.185.68:8089
options.add_argument(f"--proxy-server={ip_port}")
# Create the Chrome driver
driver = webdriver.Chrome(service=service, options=options)
# driver.set_page_load_timeout(15)  # page-load timeout of 15 seconds
# driver.set_script_timeout(15)  # script timeout of 15 seconds
try:
    # Page this article loads to verify the proxy: print what it returns so
    # the reported address can be compared with the configured proxy.
    url = "https://httpbin.org/ip"
    driver.get(url)
    print(driver.page_source)
except Exception as e:
    print(e)
finally:
    # Always close the browser, whether or not the page loaded.
    driver.quit()

2.selenium-wire使用代理

2.1核心代码

方式一(浏览器启动前配置)

# Proxy settings handed to Selenium Wire before the browser starts.
# Replace the addresses with your own proxy host and port.
proxy_settings = {
    'http': 'http://180.127.3.147:8090',
    'https': 'https://180.127.3.147:8090',
    'no_proxy': 'localhost,127.0.0.1',
}
seleniumwire_options = {'proxy': proxy_settings}
# Create the Chrome driver with the Selenium Wire options attached.
driver = webdriver.Chrome(
    service=service,
    options=options,
    seleniumwire_options=seleniumwire_options,
)

方式二(动态切换)

# Create the Chrome driver first.
driver = webdriver.Chrome(options=options, service=service)
# Selenium Wire option two: assign the proxy on the running driver
# (dynamic switching). Replace the addresses with your own proxy.
driver.proxy = dict(
    http='http://180.127.3.147:8090',
    https='https://180.127.3.147:8090',
)

 2.2检验是否使用代理

# Page this article loads to verify the proxy: print what it returns so the
# reported address can be compared with the configured proxy.
url = "https://httpbin.org/ip"
try:
    driver.get(url)
    print(driver.page_source)
except Exception as err:
    print(err)
finally:
    # Always close the browser, whether or not the page loaded.
    driver.quit()

2.3完整代码 

import os

from seleniumwire import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Directory that contains this script. The bundled "chrome-win" folder is
# looked up next to the script so the example works no matter which directory
# it is launched from. When __file__ is undefined (for example in an
# interactive session) fall back to the current working directory.
try:
    current_path = os.path.dirname(os.path.abspath(__file__))
except NameError:
    current_path = os.getcwd()
# Folder holding the Chrome build
chrome_location = os.path.join(current_path, 'chrome-win')
# Full path to chrome.exe
browser_location = os.path.join(chrome_location, 'chrome.exe')
# Full path to chromedriver.exe
driver_location = os.path.join(chrome_location, 'chromedriver.exe')
# Service object pointing at the ChromeDriver executable
service = Service(driver_location)
# Chrome options
options = Options()
# options.add_argument("--headless")  # headless mode (no window)
# Disable image loading
# options.add_argument('blink-settings=imagesEnabled=false')
# Tell Selenium which Chrome binary to launch (full path to chrome.exe)
options.binary_location = browser_location
# Hide the "Chrome is being controlled by automated test software" banner
options.add_experimental_option("excludeSwitches", ['enable-automation'])
# # Selenium Wire option one (configure before the browser starts)
# seleniumwire_options = {
#     'proxy': {
#         'http': 'http://180.127.3.147:8090',  # use your own proxy host and port
#         'https': 'https://180.127.3.147:8090',  # use your own proxy host and port
#         'no_proxy': 'localhost,127.0.0.1'
#     }
# }
# Create the Chrome driver
driver = webdriver.Chrome(service=service, options=options)
# Selenium Wire option two: assign the proxy on the running driver
# (dynamic switching). Replace the addresses with your own proxy.
driver.proxy = {
    'http': 'http://180.127.3.147:8090',
    'https': 'https://180.127.3.147:8090'
}
# driver.set_page_load_timeout(15)  # page-load timeout of 15 seconds
# driver.set_script_timeout(15)  # script timeout of 15 seconds
try:
    # Page this article loads to verify the proxy: print what it returns so
    # the reported address can be compared with the configured proxy.
    url = "https://httpbin.org/ip"
    driver.get(url)
    print(driver.page_source)
except Exception as e:
    print(e)
finally:
    # Always close the browser, whether or not the page loaded.
    driver.quit()

3.使用代理插件Selenium-Chrome-HTTP-Private-Proxy

3.1创建插件的方法(此方法可以封装在工具类里使用)

import string
import zipfile


# 创建chrome浏览器插件的方法
def create_proxyauth_extension(proxy_host, proxy_port, proxy_username, proxy_password, scheme='http', plugin_path=None):
    """Write a zipped Chrome extension that sets a fixed proxy and answers auth prompts.

    The archive contains a ``manifest.json`` and a ``background.js``. The
    script configures a single fixed proxy server through
    ``chrome.proxy.settings`` and registers a blocking ``onAuthRequired``
    listener that returns the given credentials.

    All text values are embedded as escaped JavaScript string literals, so
    quotes or backslashes in the host, user name or password cannot break the
    generated script.

    Args:
        proxy_host (str): Domain or IP address, e.g. ``proxy.domain.com``.
        proxy_port (int | str): Proxy port; must be convertible with ``int()``.
        proxy_username (str): Auth user name (may be empty).
        proxy_password (str): Auth password (may be empty).
        scheme (str): Proxy scheme, default ``'http'``.
        plugin_path (str | None): Where to write the zip file. Defaults to
            ``'Selenium-Chrome-HTTP-Private-Proxy.zip'`` (a relative path).

    Returns:
        str: The path the extension archive was written to.

    Raises:
        ValueError: If ``proxy_port`` is not a valid integer.
    """
    # Function-scope import keeps this helper self-contained when copied around.
    import json

    if plugin_path is None:
        plugin_path = 'Selenium-Chrome-HTTP-Private-Proxy.zip'

    def js_string(value):
        # A JSON string literal is also a valid, fully escaped JS string literal.
        return json.dumps(str(value))

    # NOTE(review): this is a Manifest V2 extension; confirm the target Chrome
    # version still loads MV2 extensions.
    manifest_json = """
    {
        "version": "1.0.0",
        "manifest_version": 2,
        "name": "Chrome Proxy",
        "permissions": [
            "proxy",
            "tabs",
            "unlimitedStorage",
            "storage",
            "<all_urls>",
            "webRequest",
            "webRequestBlocking"
        ],
        "background": {
            "scripts": ["background.js"]
        },
        "minimum_chrome_version":"22.0.0"
    }
    """
    # The placeholders below are replaced with already-quoted literals, which
    # is why the template itself carries no quotes around them.
    background_js = string.Template(
        """
        var config = {
                mode: "fixed_servers",
                rules: {
                  singleProxy: {
                    scheme: ${scheme},
                    host: ${host},
                    port: parseInt(${port})
                  },
                  bypassList: ["foobar.com"]
                }
              };
        chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});
        function callbackFn(details) {
            return {
                authCredentials: {
                    username: ${username},
                    password: ${password}
                }
            };
        }
        chrome.webRequest.onAuthRequired.addListener(
                    callbackFn,
                    {urls: ["<all_urls>"]},
                    ['blocking']
        );
        """
    ).substitute(
        host=js_string(proxy_host),
        # int() rejects a malformed port here instead of emitting broken JS.
        port=int(proxy_port),
        username=js_string(proxy_username),
        password=js_string(proxy_password),
        scheme=js_string(scheme),
    )
    with zipfile.ZipFile(plugin_path, 'w') as zp:
        zp.writestr("manifest.json", manifest_json)
        zp.writestr("background.js", background_js)

    return plugin_path

 3.2核心代码

# Proxy host, port, user name, password.
proxy_config = ["125.112.183.182", "8090", "", ""]
host, port, username, password = proxy_config
# Build the proxy extension archive and keep the path it was written to.
proxyauth_plugin_path = create_proxyauth_extension(
    proxy_host=host,
    proxy_port=port,
    proxy_username=username,
    proxy_password=password,
)
# 浏览器添加扩展插件
options.add_extension(proxyauth_plugin_path)
driver = webdriver.Chrome(service=service, options=options)

3.3 完整代码

我这里为方便演示,创建浏览器插件的方法(create_proxyauth_extension)就写在一起了,建议将create_proxyauth_extension方法封装到一个工具类中调用,可以提高代码的可读性和整洁性

selenium和selenium-wire使用浏览器代理插件的用法是一样的

import os
import string
import zipfile

# from seleniumwire import webdriver
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# 创建chrome浏览器插件的方法
def create_proxyauth_extension(proxy_host, proxy_port, proxy_username, proxy_password, scheme='http', plugin_path=None):
    """Write a zipped Chrome extension that sets a fixed proxy and answers auth prompts.

    The archive contains a ``manifest.json`` and a ``background.js``. The
    script configures a single fixed proxy server through
    ``chrome.proxy.settings`` and registers a blocking ``onAuthRequired``
    listener that returns the given credentials.

    All text values are embedded as escaped JavaScript string literals, so
    quotes or backslashes in the host, user name or password cannot break the
    generated script.

    Args:
        proxy_host (str): Domain or IP address, e.g. ``proxy.domain.com``.
        proxy_port (int | str): Proxy port; must be convertible with ``int()``.
        proxy_username (str): Auth user name (may be empty).
        proxy_password (str): Auth password (may be empty).
        scheme (str): Proxy scheme, default ``'http'``.
        plugin_path (str | None): Where to write the zip file. Defaults to
            ``'Selenium-Chrome-HTTP-Private-Proxy.zip'`` (a relative path).

    Returns:
        str: The path the extension archive was written to.

    Raises:
        ValueError: If ``proxy_port`` is not a valid integer.
    """
    # Function-scope import keeps this helper self-contained when copied around.
    import json

    if plugin_path is None:
        plugin_path = 'Selenium-Chrome-HTTP-Private-Proxy.zip'

    def js_string(value):
        # A JSON string literal is also a valid, fully escaped JS string literal.
        return json.dumps(str(value))

    # NOTE(review): this is a Manifest V2 extension; confirm the target Chrome
    # version still loads MV2 extensions.
    manifest_json = """
    {
        "version": "1.0.0",
        "manifest_version": 2,
        "name": "Chrome Proxy",
        "permissions": [
            "proxy",
            "tabs",
            "unlimitedStorage",
            "storage",
            "<all_urls>",
            "webRequest",
            "webRequestBlocking"
        ],
        "background": {
            "scripts": ["background.js"]
        },
        "minimum_chrome_version":"22.0.0"
    }
    """
    # The placeholders below are replaced with already-quoted literals, which
    # is why the template itself carries no quotes around them.
    background_js = string.Template(
        """
        var config = {
                mode: "fixed_servers",
                rules: {
                  singleProxy: {
                    scheme: ${scheme},
                    host: ${host},
                    port: parseInt(${port})
                  },
                  bypassList: ["foobar.com"]
                }
              };
        chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});
        function callbackFn(details) {
            return {
                authCredentials: {
                    username: ${username},
                    password: ${password}
                }
            };
        }
        chrome.webRequest.onAuthRequired.addListener(
                    callbackFn,
                    {urls: ["<all_urls>"]},
                    ['blocking']
        );
        """
    ).substitute(
        host=js_string(proxy_host),
        # int() rejects a malformed port here instead of emitting broken JS.
        port=int(proxy_port),
        username=js_string(proxy_username),
        password=js_string(proxy_password),
        scheme=js_string(scheme),
    )
    with zipfile.ZipFile(plugin_path, 'w') as zp:
        zp.writestr("manifest.json", manifest_json)
        zp.writestr("background.js", background_js)

    return plugin_path


# Directory that contains this script. The bundled "chrome-win" folder is
# looked up next to the script so the example works no matter which directory
# it is launched from. When __file__ is undefined (for example in an
# interactive session) fall back to the current working directory.
try:
    current_path = os.path.dirname(os.path.abspath(__file__))
except NameError:
    current_path = os.getcwd()
# Folder holding the Chrome build
chrome_location = os.path.join(current_path, 'chrome-win')
# Full path to chrome.exe
browser_location = os.path.join(chrome_location, 'chrome.exe')
# Full path to chromedriver.exe
driver_location = os.path.join(chrome_location, 'chromedriver.exe')
# Service object pointing at the ChromeDriver executable
service = Service(driver_location)
# Chrome options
options = Options()
# Tell Selenium which Chrome binary to launch
options.binary_location = browser_location
# Hide the "Chrome is being controlled by automated test software" banner
options.add_experimental_option("excludeSwitches", ['enable-automation'])

# Proxy host, port, user name, password. Fill in the credentials if the proxy
# requires them; otherwise leave them empty.
proxy_config = ["125.112.183.182", "8090", "", ""]
proxyauth_plugin_path = create_proxyauth_extension(
        proxy_host=proxy_config[0],
        proxy_port=proxy_config[1],
        proxy_username=proxy_config[2],
        proxy_password=proxy_config[3]
    )
# Load the generated proxy extension into the browser
options.add_extension(proxyauth_plugin_path)
driver = webdriver.Chrome(service=service, options=options)
# driver.set_page_load_timeout(10)
# driver.set_script_timeout(8)
try:
    # Page this article loads to verify the proxy: print what it returns so
    # the reported address can be compared with the configured proxy.
    url = "https://httpbin.org/ip"
    driver.get(url)
    print(driver.page_source)
except Exception as e:
    print(e)
finally:
    # Always close the browser, whether or not the page loaded.
    driver.quit()

 

4.加上代理后(关闭webrtc)

其实,仅仅加上代理可能等于白加,因为网页可以通过一种叫WebRTC的技术识别出你本地电脑的真实IP。

4.1使用完代理后查看是否关闭了WebRTC 

访问:https://browserleaks.com/ip 查看是否关闭了webrtc

# Open the page the article uses to check whether WebRTC still exposes the
# local IP address.
url = "https://browserleaks.com/ip"
driver.get(url)

 

 此时WebRTC显示出本地电脑的真实ip,说明未关闭WebRTC

 4.2 selenium/seleniumwire关闭WebRTC 

# Chrome preferences this article sets to turn off WebRTC.
preferences = {"webrtc.ip_handling_policy": "disable_non_proxied_udp"}
for flag in ("webrtc.multiple_routes_enabled", "webrtc.nonproxied_udp_enabled"):
    preferences[flag] = False
options.add_experimental_option("prefs", preferences)
# The preferences must be registered on the options before the driver is created.
driver = webdriver.Chrome(options=options, service=service)

 关闭后访问上面链接检测

源代码地址:https://gitee.com/jxzcode_admin/flask-project.git

参考资料 

https://pypi.org/project/selenium-wire/#socks

https://blog.csdn.net/zwq912318834/article/details/78626739

https://www.cnblogs.com/roystime/p/6935543.html

https://zhuanlan.zhihu.com/p/296853352

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值