前言
最近使用selenium、selenium-wire爬取数据,在配置代理时查阅了很多资料,实际使用后发现不少资料、博客给出的都是错误的用法,容易误导初学selenium代理用法的开发者。
描述:
我这里使用的是Python 3.12.2 selenium==4.23.1 selenium-wire==5.1.0
1.selenium使用代理
1.1核心代码
注意这里是python selenium使用代理的方法(原生selenium),亲测selenium-wire不可以这么用,这么用使用代理是不生效的,有些博客上说selenium-wire使用下面的写法不报错,完全是误导大家,selenium-wire使用下面的写法,根本就没使用代理ip(没连接代理ip),又怎么能报错?
建议大家使用稳定的、支持https的代理ip,支持https的代理才能访问https的网站,不要使用免费代理,懂的都懂
写法一
ip_port = '117.86.185.68:8089'  # replace with the ip and port of your own proxy
options.add_argument(f"--proxy-server={ip_port}")  # same as: options.add_argument("--proxy-server=117.86.185.68:8089")
写法二
ip_port = '117.86.185.68:8089'  # replace with the ip and port of your own proxy
options.add_argument(f"--proxy-server=http://{ip_port}")  # same as: options.add_argument("--proxy-server=http://117.86.185.68:8089")
1.2检验是否使用代理
# Load the IP check page and print what the browser received, so the
# address shown can be compared with the configured proxy.
check_url = "https://httpbin.org/ip"
try:
    driver.get(check_url)
    print(driver.page_source)
except Exception as exc:
    print(exc)
finally:
    # Close the browser whether or not the page load succeeded.
    driver.quit()
1.3完整代码
chrome浏览器(chromium), chrome-win里面包含chrome和chromedriver(个人整理的),浏览器版本和chromedriver版本一致 114.0.5735.90 ,如有需要可自行提取
链接:https://pan.baidu.com/s/1vv6AfmCBFx8QDA7RE2VrIg
提取码:6666
import os
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
# Base directory: the current working directory, so run the script from the
# folder that contains "chrome-win".
current_path = os.getcwd()
# Folder holding the Chrome browser and its chromedriver.
chrome_location = os.path.join(current_path, 'chrome-win')
# Full paths of chrome.exe and chromedriver.exe.
browser_location = os.path.join(chrome_location, 'chrome.exe')
driver_location = os.path.join(chrome_location, 'chromedriver.exe')

# Chrome options.
options = Options()
# options.add_argument("--headless")  # headless mode (no window)
# options.add_argument('blink-settings=imagesEnabled=false')  # disable images
# Point selenium at the Chrome binary (full path of chrome.exe).
options.binary_location = browser_location
# Hide the "Chrome is being controlled by automated test software" banner.
options.add_experimental_option("excludeSwitches", ['enable-automation'])
# Proxy to use; replace with the ip and port of your own proxy.
ip_port = '117.86.185.68:8089'
options.add_argument(f"--proxy-server={ip_port}")

# Service object that carries the chromedriver path.
service = Service(driver_location)
# Create the Chrome driver.
driver = webdriver.Chrome(service=service, options=options)
# driver.set_page_load_timeout(15)  # page-load timeout: 15 seconds
# driver.set_script_timeout(15)  # script timeout: 15 seconds

# Load the IP check page and print what the browser received.
check_url = "https://httpbin.org/ip"
try:
    driver.get(check_url)
    print(driver.page_source)
except Exception as exc:
    print(exc)
finally:
    # Close the browser whether or not the page load succeeded.
    driver.quit()
2.selenium-wire使用代理
2.1核心代码
方式一(浏览器启动前配置)
# Proxy used by selenium-wire, configured before the browser starts.
# Replace the addresses with the ip and port of your own proxy.
upstream_proxy = {
    'http': 'http://180.127.3.147:8090',
    'https': 'https://180.127.3.147:8090',
    'no_proxy': 'localhost,127.0.0.1',
}
seleniumwire_options = {'proxy': upstream_proxy}

# Create the Chrome driver with the selenium-wire options applied.
driver = webdriver.Chrome(
    service=service,
    options=options,
    seleniumwire_options=seleniumwire_options,
)
方式二(动态切换)
# Create the Chrome driver first, without any proxy options ...
driver = webdriver.Chrome(service=service, options=options)

# ... then set the proxy on the running driver (selenium-wire option 2,
# dynamic switching).
switched_proxy = {
    'http': 'http://180.127.3.147:8090',
    'https': 'https://180.127.3.147:8090',
}
driver.proxy = switched_proxy
2.2检验是否使用代理
# Load the IP check page and print what the browser received, so the
# address shown can be compared with the configured proxy.
check_url = "https://httpbin.org/ip"
try:
    driver.get(check_url)
    print(driver.page_source)
except Exception as exc:
    print(exc)
finally:
    # Close the browser whether or not the page load succeeded.
    driver.quit()
2.3完整代码
import os
from seleniumwire import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
# Base directory: the current working directory, so run the script from the
# folder that contains "chrome-win".
current_path = os.getcwd()
# Folder holding the Chrome browser and its chromedriver.
chrome_location = os.path.join(current_path, 'chrome-win')
# Full paths of chrome.exe and chromedriver.exe.
browser_location = os.path.join(chrome_location, 'chrome.exe')
driver_location = os.path.join(chrome_location, 'chromedriver.exe')

# Chrome options.
options = Options()
# options.add_argument("--headless")  # headless mode (no window)
# options.add_argument('blink-settings=imagesEnabled=false')  # disable images
# Point selenium at the Chrome binary (full path of chrome.exe).
options.binary_location = browser_location
# Hide the "Chrome is being controlled by automated test software" banner.
options.add_experimental_option("excludeSwitches", ['enable-automation'])

# selenium-wire proxy, option 1 (pass seleniumwire_options when creating
# the driver):
# seleniumwire_options = {
#     'proxy': {
#         'http': 'http://180.127.3.147:8090',  # use your own proxy ip and port
#         'https': 'https://180.127.3.147:8090',  # use your own proxy ip and port
#         'no_proxy': 'localhost,127.0.0.1'
#     }
# }

# Service object that carries the chromedriver path.
service = Service(driver_location)
# Create the Chrome driver.
driver = webdriver.Chrome(service=service, options=options)

# selenium-wire proxy, option 2 (dynamic switching): set the proxy on the
# running driver.
switched_proxy = {
    'http': 'http://180.127.3.147:8090',
    'https': 'https://180.127.3.147:8090',
}
driver.proxy = switched_proxy
# driver.set_page_load_timeout(15)  # page-load timeout: 15 seconds
# driver.set_script_timeout(15)  # script timeout: 15 seconds

# Load the IP check page and print what the browser received.
check_url = "https://httpbin.org/ip"
try:
    driver.get(check_url)
    print(driver.page_source)
except Exception as exc:
    print(exc)
finally:
    # Close the browser whether or not the page load succeeded.
    driver.quit()
3.使用代理插件Selenium-Chrome-HTTP-Private-Proxy
3.1创建插件的方法(此方法可以封装在工具类里使用)
import string
import zipfile
# 创建chrome浏览器插件的方法
def create_proxyauth_extension(proxy_host, proxy_port, proxy_username, proxy_password, scheme='http', plugin_path=None):
    """Write a Chrome extension (zip) that sends traffic through a proxy.

    The archive holds a ``manifest.json`` and a ``background.js``. The
    background script installs a fixed proxy server through
    ``chrome.proxy.settings.set`` and registers a blocking
    ``chrome.webRequest.onAuthRequired`` listener that answers with the given
    credentials.

    Every value is embedded in the script as a JSON-encoded literal, so
    quotes, backslashes or newlines in the host, user name or password cannot
    break the generated JavaScript or inject code into it.

    Args:
        proxy_host (str): domain or ip address, e.g. ``proxy.domain.com``.
        proxy_port (int | str): proxy port; any value accepted by ``int()``.
        proxy_username (str): auth user name (may be empty).
        proxy_password (str): auth password (may be empty).
        scheme (str): proxy scheme, default ``http``.
        plugin_path (str | None): where to write the zip file. Defaults to
            the relative path ``Selenium-Chrome-HTTP-Private-Proxy.zip``.

    Returns:
        str: ``plugin_path``, the path of the zip file that was written.

    Raises:
        ValueError: if ``proxy_port`` is not a valid integer string.
        TypeError: if ``proxy_port`` is of a type ``int()`` rejects.
    """
    # Imported locally so the helper stays self-contained when it is copied
    # into a utility module.
    import json

    # Validate the port before anything is written to disk.
    port_number = int(proxy_port)

    if plugin_path is None:
        plugin_path = 'Selenium-Chrome-HTTP-Private-Proxy.zip'

    # NOTE(review): this is a Manifest V2 extension; confirm that the Chrome
    # build in use still loads MV2 extensions.
    manifest_json = """
{
    "version": "1.0.0",
    "manifest_version": 2,
    "name": "Chrome Proxy",
    "permissions": [
        "proxy",
        "tabs",
        "unlimitedStorage",
        "storage",
        "<all_urls>",
        "webRequest",
        "webRequestBlocking"
    ],
    "background": {
        "scripts": ["background.js"]
    },
    "minimum_chrome_version":"22.0.0"
}
"""

    # The placeholders are replaced by complete JavaScript literals (the
    # json.dumps output already carries its own quotes), hence no quotes
    # around them in the template.
    background_js = string.Template(
        """
var config = {
    mode: "fixed_servers",
    rules: {
        singleProxy: {
            scheme: ${scheme},
            host: ${host},
            port: ${port}
        },
        bypassList: ["foobar.com"]
    }
};
chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});
function callbackFn(details) {
    return {
        authCredentials: {
            username: ${username},
            password: ${password}
        }
    };
}
chrome.webRequest.onAuthRequired.addListener(
    callbackFn,
    {urls: ["<all_urls>"]},
    ['blocking']
);
"""
    ).substitute(
        host=json.dumps(str(proxy_host)),
        port=json.dumps(port_number),
        username=json.dumps(str(proxy_username)),
        password=json.dumps(str(proxy_password)),
        scheme=json.dumps(str(scheme)),
    )

    with zipfile.ZipFile(plugin_path, 'w') as zp:
        zp.writestr("manifest.json", manifest_json)
        zp.writestr("background.js", background_js)
    return plugin_path
3.2核心代码
# Proxy ip, port, user name and password (the last two are left empty here).
proxy_config = ["125.112.183.182", "8090", "", ""]
host, port, username, password = proxy_config
# Build the proxy extension zip and keep its path.
proxyauth_plugin_path = create_proxyauth_extension(
    proxy_host=host,
    proxy_port=port,
    proxy_username=username,
    proxy_password=password,
)
# Add the extension to the browser options, then create the driver.
options.add_extension(proxyauth_plugin_path)
driver = webdriver.Chrome(service=service, options=options)
3.3 完整代码
我这里为方便演示,把创建浏览器插件的方法(create_proxyauth_extension)和调用代码写在了一起,建议将create_proxyauth_extension方法封装到一个工具类中再调用,可以提高代码的可读性和整洁性。
selenium和selenium-wire使用浏览器代理插件的用法是一样的。
import os
import string
import zipfile
# from seleniumwire import webdriver
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
# 创建chrome浏览器插件的方法
def create_proxyauth_extension(proxy_host, proxy_port, proxy_username, proxy_password, scheme='http', plugin_path=None):
    """Write a Chrome extension (zip) that sends traffic through a proxy.

    The archive holds a ``manifest.json`` and a ``background.js``. The
    background script installs a fixed proxy server through
    ``chrome.proxy.settings.set`` and registers a blocking
    ``chrome.webRequest.onAuthRequired`` listener that answers with the given
    credentials.

    Every value is embedded in the script as a JSON-encoded literal, so
    quotes, backslashes or newlines in the host, user name or password cannot
    break the generated JavaScript or inject code into it.

    Args:
        proxy_host (str): domain or ip address, e.g. ``proxy.domain.com``.
        proxy_port (int | str): proxy port; any value accepted by ``int()``.
        proxy_username (str): auth user name (may be empty).
        proxy_password (str): auth password (may be empty).
        scheme (str): proxy scheme, default ``http``.
        plugin_path (str | None): where to write the zip file. Defaults to
            the relative path ``Selenium-Chrome-HTTP-Private-Proxy.zip``.

    Returns:
        str: ``plugin_path``, the path of the zip file that was written.

    Raises:
        ValueError: if ``proxy_port`` is not a valid integer string.
        TypeError: if ``proxy_port`` is of a type ``int()`` rejects.
    """
    # Imported locally so the helper stays self-contained when it is copied
    # into a utility module.
    import json

    # Validate the port before anything is written to disk.
    port_number = int(proxy_port)

    if plugin_path is None:
        plugin_path = 'Selenium-Chrome-HTTP-Private-Proxy.zip'

    # NOTE(review): this is a Manifest V2 extension; confirm that the Chrome
    # build in use still loads MV2 extensions.
    manifest_json = """
{
    "version": "1.0.0",
    "manifest_version": 2,
    "name": "Chrome Proxy",
    "permissions": [
        "proxy",
        "tabs",
        "unlimitedStorage",
        "storage",
        "<all_urls>",
        "webRequest",
        "webRequestBlocking"
    ],
    "background": {
        "scripts": ["background.js"]
    },
    "minimum_chrome_version":"22.0.0"
}
"""

    # The placeholders are replaced by complete JavaScript literals (the
    # json.dumps output already carries its own quotes), hence no quotes
    # around them in the template.
    background_js = string.Template(
        """
var config = {
    mode: "fixed_servers",
    rules: {
        singleProxy: {
            scheme: ${scheme},
            host: ${host},
            port: ${port}
        },
        bypassList: ["foobar.com"]
    }
};
chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});
function callbackFn(details) {
    return {
        authCredentials: {
            username: ${username},
            password: ${password}
        }
    };
}
chrome.webRequest.onAuthRequired.addListener(
    callbackFn,
    {urls: ["<all_urls>"]},
    ['blocking']
);
"""
    ).substitute(
        host=json.dumps(str(proxy_host)),
        port=json.dumps(port_number),
        username=json.dumps(str(proxy_username)),
        password=json.dumps(str(proxy_password)),
        scheme=json.dumps(str(scheme)),
    )

    with zipfile.ZipFile(plugin_path, 'w') as zp:
        zp.writestr("manifest.json", manifest_json)
        zp.writestr("background.js", background_js)
    return plugin_path
# Base directory: the current working directory, so run the script from the
# folder that contains "chrome-win".
current_path = os.getcwd()
# Folder holding the Chrome browser and its chromedriver.
chrome_location = os.path.join(current_path, 'chrome-win')
# Full paths of chrome.exe and chromedriver.exe.
browser_location = os.path.join(chrome_location, 'chrome.exe')
driver_location = os.path.join(chrome_location, 'chromedriver.exe')

# Chrome options.
options = Options()
# Point selenium at the Chrome binary.
options.binary_location = browser_location
# Hide the "Chrome is being controlled by automated test software" banner.
options.add_experimental_option("excludeSwitches", ['enable-automation'])

# Proxy ip, port, user name and password. Fill in the credentials if the
# proxy has them, otherwise leave them empty.
proxy_config = ["125.112.183.182", "8090", "", ""]
host, port, username, password = proxy_config
proxyauth_plugin_path = create_proxyauth_extension(
    proxy_host=host,
    proxy_port=port,
    proxy_username=username,
    proxy_password=password,
)
# Add the generated extension to the browser.
options.add_extension(proxyauth_plugin_path)

# Service object that carries the chromedriver path.
service = Service(driver_location)
driver = webdriver.Chrome(service=service, options=options)
# driver.set_page_load_timeout(10)
# driver.set_script_timeout(8)

# Load the IP check page and print what the browser received.
check_url = "https://httpbin.org/ip"
try:
    driver.get(check_url)
    print(driver.page_source)
except Exception as exc:
    print(exc)
finally:
    # Close the browser whether or not the page load succeeded.
    driver.quit()
4.加上代理后(关闭webrtc)
如果不关闭WebRTC,即使加上了代理也可能等于白加,因为网页可以通过一种叫WebRTC的技术识别出你本地电脑的真实IP。
4.1使用完代理后查看是否关闭了WebRTC
访问:https://browserleaks.com/ip 查看是否关闭了webrtc
# Open the browserleaks IP page to see which address WebRTC reports.
leak_check_url = "https://browserleaks.com/ip"
driver.get(leak_check_url)
此时WebRTC显示出本地电脑的真实ip,说明未关闭WebRTC
4.2 selenium/seleniumwire关闭WebRTC
# Chrome preferences used to switch WebRTC off for non-proxied traffic.
webrtc_prefs = {
    "webrtc.ip_handling_policy": "disable_non_proxied_udp",
    "webrtc.multiple_routes_enabled": False,
    "webrtc.nonproxied_udp_enabled": False,
}
options.add_experimental_option("prefs", webrtc_prefs)
# The preferences must be set before the driver is created.
driver = webdriver.Chrome(service=service, options=options)
关闭后访问上面链接检测
源代码地址:https://gitee.com/jxzcode_admin/flask-project.git
参考资料
https://pypi.org/project/selenium-wire/#socks
https://blog.csdn.net/zwq912318834/article/details/78626739