From:https://blog.csdn.net/qq_42196922/article/details/89400988
多加一行代码,突破淘宝模拟登录滑块:http://www.imooc.com/article/285729
爬虫自动化:https://www.jianshu.com/p/b3b92f327374
selenium 跳过 webdriver 检测并模拟登录淘宝:https://www.cnblogs.com/cloudbird/p/10524242.html
方法 1:利用 Chrome DevTools 协议
Chrome DevTools Protocol (协议详细内容):https://chromedevtools.github.io/devtools-protocol/
以前淘宝对 selenium 还比较友好,后来开始检测 window.navigator.webdriver 等浏览器特征,一旦识别出 selenium 就会弹出滑动验证码,selenium 变得很难用。网上大部分教程都是用 pyppeteer 修改被检测的 js 参数后再去采集,但是 chromium 占用内存太高,并且 pyppeteer 的参数和方法文档太少,用起来不方便。
本文介绍了另一种方法:使用 selenium 接管 chrome 浏览器
利用 Chrome DevTools 协议。它允许客户 检查 和 调试 Chrome 浏览器。
添加 chrome 的环境变量
在 系统环境变量 PATH 里将 chrome的路径 添加进去。
命令行下执行命令
打开cmd,在命令行中输入命令:
chrome.exe --remote-debugging-port=9999 --user-data-dir="C:\selenum\AutomationProfile"
对于 --remote-debugging-port 的值,可以指定任何未被占用的端口。
对于 --user-data-dir 标记,指定创建新 Chrome 配置文件的目录。它是为了确保在单独的配置文件中启动 chrome,不会污染你的默认配置文件。
执行完命令后,会打开一个浏览器页面,我们输入淘宝网址(https://login.taobao.com/member/login.jhtml),输入用户名和密码,登录淘宝后用户信息就保存在 --user-data-dir="C:\selenum\AutomationProfile" 所指定的文件夹中。
执行 js window.open() 打不开窗口时,是因为 chrome 默认不允许弹出窗口,改下 chrome 设置就可以了
在 chrome 浏览器地址栏输入:chrome://settings/content/popups,把 已阻止(推荐) 改成 允许 即可。
或者 chrome -》设置 -》高级 -》隐私设置和安全性 -》网站设置 -》弹出式窗口和重定向,也可以设置。
不要关闭上面浏览器,然后执行 python 代码
python 代码:
在淘宝搜索 "电脑" 关键字,并打印前 5 页 所有 搜索内容
-
import os
-
import time
-
import random
-
from selenium
import webdriver
-
from selenium.webdriver.chrome.options
import Options
-
from selenium.webdriver.support.ui
import WebDriverWait
-
from selenium.webdriver.common.by
import By
-
from selenium.webdriver.support
import expected_conditions
as EC
-
-
# from selenium.webdriver.common.action_chains import ActionChains
-
-
-
def _print_result_titles(browser, title_xpath):
    """Print the text of every element matching *title_xpath*, spaces removed."""
    for link in browser.find_elements(By.XPATH, title_xpath):
        print(link.text.replace(' ', ''))


def main():
    """Attach to a running Chrome and print Taobao search results for "电脑".

    Chrome must already be running with ``--remote-debugging-port=9999``
    (and a dedicated ``--user-data-dir``); this function attaches to it via
    the ``debuggerAddress`` option instead of launching a new browser.

    The first result page is printed, then the "next page" button is clicked
    for pages 2 to 5. The loop stops early when the button is missing or
    disabled.

    Raises:
        selenium.common.exceptions.TimeoutException: if no result element is
            present within 5 seconds after moving to the next page.
    """
    chrome_debug_port = 9999
    chrome_options = Options()
    # chrome_options.add_argument('--headless')
    chrome_options.add_experimental_option(
        "debuggerAddress", f"127.0.0.1:{chrome_debug_port}")

    # ``options=`` replaces the deprecated ``chrome_options=`` keyword.
    browser = webdriver.Chrome(options=chrome_options)
    wait = WebDriverWait(browser, 5)
    print(browser.title)

    # Remember the current window, open a second one, then switch back.
    current_handle = browser.current_window_handle
    browser.execute_script('window.open("http://www.baidu.com")')
    browser.switch_to.window(current_handle)

    url = 'https://s.taobao.com/search?q=电脑'
    browser.get(url)

    produce_info_xpath = (
        '//div[contains(@class, "J_MouserOnverReq")]'
        '//div[@class="row row-2 title"]/a'
    )
    next_page_xpath = '//li[@class="item next"]'

    _print_result_titles(browser, produce_info_xpath)

    # This is a demo, so only the first 5 pages are crawled.
    for page_num in range(2, 6):
        # find_elements returns an empty list instead of raising, so a missing
        # button ends the crawl cleanly. The XPath matches the class string
        # exactly, so a button carrying an extra class will not match either.
        next_buttons = browser.find_elements(By.XPATH, next_page_xpath)
        if not next_buttons:
            break
        next_page = next_buttons[0]
        if 'disabled' in next_page.get_attribute('class'):
            break

        print('*' * 100)
        print(f'第 {page_num} 页')
        next_page.click()

        wait.until(
            EC.presence_of_all_elements_located((By.XPATH, produce_info_xpath)))
        # Random pause between pages.
        time.sleep(random.randint(3, 5))
        _print_result_titles(browser, produce_info_xpath)


if __name__ == '__main__':
    main()
-
执行结果截图:
代码 2(根据关键字搜索,然后抓取 店铺名、店铺地址、店铺电话):
-
# -*- coding: utf-8 -*-
-
-
-
import time
-
import random
-
import parsel
-
import re
-
from selenium
import webdriver
-
from selenium.webdriver.chrome.options
import Options
-
from selenium.webdriver.support.ui
import WebDriverWait
-
from selenium.webdriver.common.by
import By
-
from selenium.webdriver.support
import expected_conditions
as EC
-
-
-
# from selenium.webdriver.common.action_chains import ActionChains
-
-
class TaoBaoSearch(object):
    """Search Taobao by keyword and collect shop name, location and phone.

    The class attaches to a Chrome instance that is already listening on
    remote-debugging port 9999. It uses two windows: a "master" window that
    shows the search result list and a "slave" window in which each shop
    page is opened to read the contact details.
    """

    # Matches either a land-line ("联系电话") or a mobile ("联系手机") entry.
    # Both the ASCII and the full-width colon are accepted.
    _PHONE_RE = re.compile(r'联系电话[::](\d+-?\d+)|联系手机[::](\d+)')

    def __init__(self):
        """Create the instance and immediately attach to the browser."""
        super(TaoBaoSearch, self).__init__()
        self.browser = None         # selenium Chrome driver
        self.wait = None            # WebDriverWait bound to the driver
        self.master_handler = None  # handle of the search-result window
        self.slaver_handler = None  # handle of the shop-detail window
        self.temp = None            # reset by process_item; not read here
        self.browser_init()

    def browser_init(self):
        """Attach to Chrome and prepare the master and slave windows.

        Every window except the first is closed, the first becomes the
        master window, and a fresh blank window is opened as the slave.
        """
        chrome_debug_port = 9999
        chrome_options = Options()
        chrome_options.add_experimental_option(
            "debuggerAddress", f"127.0.0.1:{chrome_debug_port}")
        # chrome_options.add_argument('--headless')

        # ``options=`` replaces the deprecated ``chrome_options=`` keyword.
        self.browser = webdriver.Chrome(options=chrome_options)
        self.wait = WebDriverWait(self.browser, 5)

        # Close every window but the first one.
        for handle in self.browser.window_handles[1:]:
            self.browser.switch_to.window(handle)
            self.browser.close()

        self.master_handler = self.browser.window_handles[0]
        self.browser.switch_to.window(self.master_handler)

        # The newly opened window is the last handle in the list.
        self.browser.execute_script('window.open()')
        self.slaver_handler = self.browser.window_handles[-1]

    def get_detail_info(self, shop_url=None):
        """Open *shop_url* in the slave window and extract contact details.

        Args:
            shop_url: URL of the shop page. When empty or ``None`` nothing is
                loaded.

        Returns:
            A ``(shop_keeper, phone)`` tuple. ``shop_keeper`` is the first
            text matched by the shop-keeper XPath or ``None``; ``phone`` is
            the list returned by ``re.findall`` (tuples of the two regex
            groups), empty when nothing matched.
        """
        if not shop_url:
            return None, []

        self.browser.switch_to.window(self.slaver_handler)
        try:
            self.browser.get(shop_url)
            page_html = self.browser.page_source
        finally:
            # Always return to the master window, even if loading failed,
            # so the caller keeps working on the result list.
            self.browser.switch_to.window(self.master_handler)

        # Un-escape angle brackets so embedded, entity-encoded markup can be
        # parsed like ordinary HTML.
        page_html = page_html.replace('&lt;', '<').replace('&gt;', '>')

        selector = parsel.Selector(text=page_html)
        shop_keeper_xpath = (
            '//div[@class="extend"]//li[@class="shopkeeper"]//a/text()'
        )
        shop_keeper = selector.xpath(shop_keeper_xpath).extract_first()
        phone = self._PHONE_RE.findall(page_html)
        return shop_keeper, phone

    def process_item(self, item):
        """Print and store the details of one search-result row.

        Args:
            item: WebElement of a result row containing the shop link and
                the location.

        Rows whose shop page yields no phone number are skipped. Matching
        rows are printed and appended to ``./info.txt`` as one
        comma-separated line.
        """
        self.temp = None
        shop_link = item.find_element(By.XPATH, './/div[@class="shop"]//a')
        shop = shop_link.text
        shop_url = shop_link.get_attribute('href')
        local = item.find_element(By.XPATH, './/div[@class="location"]').text

        shop_keeper, phone = self.get_detail_info(shop_url)
        if not phone:
            return

        print(f'shop : {shop}')
        print(f'local : {local}')
        print(f'shop_url : {shop_url}')
        print(f'shop_keeper : {shop_keeper}')
        print(f'phone : {phone}')

        # The XPath lookup may return None; write an empty field instead of
        # failing. Explicit UTF-8 keeps Chinese text independent of the
        # platform's default encoding.
        with open('./info.txt', 'a', encoding='utf-8') as f:
            f.write(
                f"{shop},{local},{shop_url},{shop_keeper or ''},{phone}\n")

    def _process_current_page(self, row_xpath):
        """Run process_item on every result row of the current page."""
        for row in self.browser.find_elements(By.XPATH, row_xpath):
            self.process_item(row)

    def main(self):
        """Ask for a keyword and crawl up to five result pages.

        An empty keyword falls back to "手机". The loop stops early when
        the "next page" button is missing or disabled.
        """
        from urllib.parse import quote

        self.browser.switch_to.window(self.master_handler)

        key_word = input('输入淘宝搜索关键字:').strip()
        if not key_word:
            print('没有输入关键字。默认搜索 “手机”')
            key_word = '手机'

        # Percent-encode the keyword so characters such as '&' or '#'
        # cannot break the query string.
        url = f"https://s.taobao.com/search?q={quote(key_word, safe='')}"
        self.browser.get(url)

        shop_and_local_xpath = (
            '//div[contains(@class, "J_MouserOnverReq")]'
            '//div[@class="row row-3 g-clearfix"]'
        )
        next_page_xpath = '//li[@class="item next"]'

        self._process_current_page(shop_and_local_xpath)

        # This is a demo, so only the first 5 pages are crawled.
        for page_num in range(2, 6):
            # An empty list (no matching button) ends the crawl instead of
            # raising NoSuchElementException.
            next_buttons = self.browser.find_elements(By.XPATH, next_page_xpath)
            if not next_buttons:
                break
            next_page = next_buttons[0]
            if 'disabled' in next_page.get_attribute('class'):
                break

            print('*' * 100)
            print(f'第 {page_num} 页')
            next_page.click()

            self.wait.until(EC.presence_of_all_elements_located(
                (By.XPATH, shop_and_local_xpath)))
            # Random pause between pages.
            time.sleep(random.randint(3, 5))
            self._process_current_page(shop_and_local_xpath)


if __name__ == '__main__':
    tb = TaoBaoSearch()
    tb.main()
抓取信息保存到 info.txt ,文件截图:
改进:
上面的方法需要一直开着浏览器窗口,没法使用 无头模式。改进办法:让 selenium 自己启动 Chrome,并通过 --user-data-dir 参数复用已经登录过的配置文件目录,然后就可以设置无头模式。
如果想改变 Chrome 位置,可以设置 chrome_options.binary_location 为 chrome.exe 路径即可。
-
from selenium
import webdriver
-
from selenium.webdriver.chrome.options
import Options
-
-
if __name__ == '__main__':
    # Local import: this snippet's import section only brings in
    # ``webdriver`` and ``Options``.
    from selenium.webdriver.common.by import By

    chrome_options = Options()

    # Set binary_location when not using the default Chrome installation.
    # Chrome and Chromium need different chromedriver.exe versions; for
    # Chromium, point the three paths below at the Chromium directory.
    chrome_options.binary_location = r'D:\chrome\chrome.exe'

    # chrome_options.add_argument('--headless')
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument('disable-infobars')
    # Reuse a profile directory that already holds a logged-in session.
    chrome_options.add_argument(r'--user-data-dir=D:\chrome\userdatadir')

    # ``options=`` replaces the deprecated ``chrome_options=`` keyword.
    # NOTE: ``executable_path`` is the Selenium 3 way of locating the driver;
    # Selenium 4.10+ expects a ``Service`` object instead.
    browser = webdriver.Chrome(
        options=chrome_options,
        executable_path=r'D:\chrome\chromedriver.exe',
    )
    try:
        browser.get('https://www.taobao.com/')
        user_name_xpath = '//div[@class="site-nav-user"]/a'
        user_name = browser.find_element(By.XPATH, user_name_xpath).text
        print(user_name)
    finally:
        # Quit so that Chrome releases the profile directory; otherwise the
        # leftover process keeps it locked and the next run cannot start.
        browser.quit()
-
结果截图:
可以看到 无头模式下,使用 --user-data-dir 参数,可以登录淘宝。前提需要先手动登录淘宝,拿到登录信息的文件夹。
方法 2:js 注入,修改浏览器特征
执行代码后,手动输入用户名和密码,滑动滑块,可以正常跳转到登录后个人页面。
提示:这个手动滑动滑块有一定的失败几率,有时候失败几率还很高。有时一次就可以滑过,有时好多次都过不去。
示例代码:
-
import asyncio
-
from pyppeteer
import launch
-
-
# Browser window and viewport size in pixels.
width = 1366
height = 768

# JavaScript functions evaluated in the page to change browser properties.
# js1: make navigator.webdriver report false.
js1 = "() =>{Object.defineProperties(navigator,{ webdriver:{ get: () => false}})}"
# js2: debugging aid that alerts the current navigator.webdriver value.
js2 = "() => {alert(window.navigator.webdriver)}"
# js3: define navigator.chrome with an empty runtime object.
js3 = "() => {window.navigator.chrome = {runtime: {}, }; }"
# js4: report a fixed navigator.languages list.
js4 = "() =>{Object.defineProperty(navigator, 'languages', {get: () => ['en-US', 'en']});}"
# js5: report a non-empty navigator.plugins list.
js5 = "() =>{Object.defineProperty(navigator, 'plugins', {get: () => [1, 2, 3, 4, 5,6],});}"
-
-
-
async def page_evaluate(page):
    """Evaluate the four property-overriding scripts in *page*, in order.

    The scripts replace values Taobao collects when inspecting the browser.
    During testing the webdriver property was seen to turn True again after
    a successful login, so this has to be called again after every page
    reload.
    """
    patches = (
        '''() =>{ Object.defineProperties(navigator,{ webdriver:{ get: () => false } }) }''',
        '''() =>{ window.navigator.chrome = { runtime: {}, }; }''',
        '''() =>{ Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); }''',
        '''() =>{ Object.defineProperty(navigator, 'plugins', { get: () => [1, 2, 3, 4, 5,6], }); }''',
    )
    for patch in patches:
        await page.evaluate(patch)
-
-
-
async def main():
    """Open the Taobao login page in a patched Chromium for manual login.

    The property-overriding scripts are registered with
    ``evaluateOnNewDocument`` before navigating. They therefore run in every
    new document before the page's own scripts, including after the
    post-login redirect. Evaluating them only after ``goto`` would let the
    page's detection code see the original values first.

    The browser stays open for 100 seconds so the user can type the
    credentials and drag the slider, then it is closed.
    """
    browser = await launch(
        headless=False,
        # userDataDir='./userdata',
        args=[
            '--disable-infobars',
            f'--window-size={width},{height}',
            '--no-sandbox',
        ],
    )
    try:
        page = await browser.newPage()
        await page.setViewport({"width": width, "height": height})

        # Register the patches first; js2 is only a debugging alert and is
        # deliberately left out.
        for script in (js1, js3, js4, js5):
            await page.evaluateOnNewDocument(script)

        # url = 'https://www.taobao.com'
        url = 'https://login.taobao.com/member/login.jhtml'
        await page.goto(url=url)

        # Time window for the manual login.
        await asyncio.sleep(100)
    finally:
        await browser.close()


asyncio.get_event_loop().run_until_complete(main())
方法 3:将 模拟浏览器 设置为 开发者模式
现在这种方法好像已经不好用了。
示例代码:
-
# Fragment: ``Options``, ``webdriver`` and ``current_dir`` are expected to be
# defined by the surrounding script.
chrome_options = Options()

# To use a specific chrome.exe, set chrome_options.binary_location, e.g.
# f"{current_dir}\\chrome\\chrome.exe".

# Important step: exclude the "enable-automation" switch (the article calls
# this "developer mode") so that sites are less likely to recognise Selenium.
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])

# chrome_options.add_argument("--headless")
chrome_options.add_argument("disable-infobars")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument(f"--user-data-dir={current_dir}\\chrome\\userdatadir")

# ``options=`` replaces the deprecated ``chrome_options=`` keyword.
# NOTE: ``executable_path`` is the Selenium 3 way of locating the driver;
# Selenium 4.10+ expects a ``Service`` object instead.
browser = webdriver.Chrome(
    options=chrome_options,
    executable_path=f'{current_dir}\\chrome\\chromedriver.exe',
)