Selenium 常用配置信息
Chrome Driver 下载
Chromedriver所有版本下载链接:
https://chromedriver.storage.googleapis.com/index.html
注意版本对应关系,这里我用的是
版本 101.0.4951.41(正式版本) (64 位)
下载与浏览器版本对应的 Chrome Driver 即可;版本号前三位对应即可,最后一位不一样也可正常使用;
Selenium Chrome Options 常见配置
# --- Selenium Chrome configuration demo -------------------------------------
# Builds a ChromeOptions / DesiredCapabilities pair and launches Chrome.
# NOTE(review): the ``chrome_options`` / ``executable_path`` /
# ``desired_capabilities`` keyword arguments were removed in Selenium 4;
# this snippet targets the Selenium 3 API.
import copy  # was missing in the original snippet (needed for deepcopy below)

from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium import webdriver

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('disable-infobars')  # hide the "controlled by automated software" infobar
chrome_options.add_argument('--headless')  # headless mode (can be flaky; disable when debugging)
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)  # 2 = block images (faster page loads)
chrome_options.add_argument('User-Agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"')  # fixed UA, recommended to work around a Chrome bug
chrome_options.add_argument('--disable-gpu')  # recommended by the Chrome docs to work around a bug
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])  # hide the automation switch
chrome_options.add_experimental_option("useAutomationExtension", False)
chrome_options.add_argument('log-level=3')  # only log fatal errors
chrome_options.add_argument("--start-maximized")  # maximize the window
chrome_options.add_argument("--disable-cache")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-browser-side-navigation")
chrome_options.add_argument("--disable-software-rasterizer")
chrome_options.binary_location = r"~/yourpath/chrome.exe"  # path to the Chrome binary
desired_capabilities = copy.deepcopy(DesiredCapabilities.CHROME)
desired_capabilities["pageLoadStrategy"] = "none"  # return from get() immediately; wait explicitly instead
# chrome_options.add_argument(r'--user-data-dir=C:\Users\ADMINI~1\AppData\Local\Temp\scoped_dir11488_22853\Default')  # sometimes needed when Chrome misbehaves; point it at your own profile dir
browser = webdriver.Chrome(
    chrome_options=chrome_options,
    executable_path=r"~/yourpath/chromedriver.exe",
    desired_capabilities=desired_capabilities,
)
简单的反爬虫策略
User Agent 配置
# Pool of desktop Chrome user-agent strings; pick one at random and apply it
# to the running browser via the Chrome DevTools Protocol.
# (Requires the ``browser`` webdriver instance created above.)
import random  # was missing in the original snippet

user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]
ua = random.choice(user_agent_list)
if ua:
    browser.execute_cdp_cmd("Emulation.setUserAgentOverride", {"userAgent": ua})
隐藏浏览器特征
其中stealth.min.js 文件可自行下载;
# Hide webdriver fingerprints by injecting stealth.min.js into every new
# document before any page script runs (requires ``browser`` from above).
import os  # was missing in the original snippet

remove_webdriver_feature_js = os.path.join(os.path.dirname(__file__), 'stealth.min.js')
# explicit encoding so the injected JS is decoded the same way on any platform
with open(remove_webdriver_feature_js, encoding='utf-8') as fid:
    js = fid.read()
browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": js
})
代理配置
对于有批量下载需求的,可以使用相应的代理服务进行下载配置
比价网站慢慢买爬虫实践(完整代码,仅供实践学习使用,爬虫请遵守相应法律法规)
import os
import pdb
import copy
import numpy as np
import random
import scrapy
import copy
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from scrapy import Selector
from selenium import webdriver
from bs4 import BeautifulSoup
from urllib import parse
import pdb
import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from lxml import etree
import re,time,requests,json
import numpy as np
from requests.exceptions import RequestException
# --- Global crawler setup: configure Chrome, start the browser, inject ------
# --- stealth.js, and create the shared explicit-wait helper. ----------------
# NOTE(review): the ``chrome_options`` / ``executable_path`` /
# ``desired_capabilities`` keyword arguments were removed in Selenium 4;
# this script targets the Selenium 3 API.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('disable-infobars')  # hide the automation infobar
# chrome_options.add_argument('--headless')  # headless mode; disabled because it is error-prone here
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)  # 2 = block images (faster crawling)
chrome_options.add_argument('User-Agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"')  # fixed UA, recommended to work around a Chrome bug
chrome_options.add_argument('--disable-gpu')  # recommended by the Chrome docs to work around a bug
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_experimental_option("useAutomationExtension", False)
chrome_options.add_argument('log-level=3')  # only log fatal errors
chrome_options.add_argument("--start-maximized")
chrome_options.add_argument("--disable-cache")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-browser-side-navigation")
chrome_options.add_argument("--disable-software-rasterizer")
desired_capabilities = copy.deepcopy(DesiredCapabilities.CHROME)
desired_capabilities["pageLoadStrategy"] = "none"  # return from get() immediately; we wait explicitly
# chrome_options.add_argument(r'--user-data-dir=C:\Users\ADMINI~1\AppData\Local\Temp\scoped_dir11488_22853\Default')  # sometimes needed when Chrome misbehaves; use your own profile dir
browser = webdriver.Chrome(
    chrome_options=chrome_options,
    executable_path=os.path.join(os.path.dirname(__file__), 'chromedriver.exe'),
    desired_capabilities=desired_capabilities,
)
# Hide webdriver fingerprints by injecting stealth.min.js into every new document.
remove_webdriver_feature_js = os.path.join(os.path.dirname(__file__), 'stealth.min.js')
with open(remove_webdriver_feature_js, encoding='utf-8') as fid:
    js = fid.read()
browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": js
})
wait = WebDriverWait(browser, 10)  # shared explicit wait: poll up to 10 seconds
# Pool of desktop Chrome user-agent strings used by parse_detail_urls() to
# periodically rotate the browser identity.
user_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]
def scroll_down():
    """Scroll the page down six times (2000 px each) with short random pauses,
    triggering any lazy-loaded content. Uses the module-level ``browser``."""
    for _ in range(6):
        browser.execute_script('window.scrollBy(0, 2000)')
        time.sleep(0.5 * np.random.rand())
def parse_page(subclass_url='http://www.manmanbuy.com/list_6.aspx', snap_filepath='tmp.json'):
    """Crawl every result page of one product sub-category and append one JSON
    record per item to *snap_filepath* (JSON-lines format).

    Relies on the module-level ``browser`` / ``wait`` Selenium objects.
    """
    browser.get(subclass_url)
    with open(snap_filepath, 'w', encoding='utf-8') as fid:
        failure_cnt = 0
        while True:
            scroll_down()
            try:
                # the pager div signals that the item grid finished rendering
                wait.until(EC.presence_of_all_elements_located((By.ID, "dispage")))
            except Exception:
                # Page did not load: drop cookies and retry, giving up after 6
                # consecutive failures. BUG FIX: the original placed this check
                # after ``continue`` on the success path, so persistent
                # failures looped forever.
                browser.delete_all_cookies()
                failure_cnt += 1
                if failure_cnt > 5:
                    break
                continue
            failure_cnt = 0
            # parse every product card on the current page
            content = BeautifulSoup(browser.page_source, 'lxml')
            cmpsku_items = content.find_all(attrs={'class': 'item'})
            num_pattern = re.compile(r'\d+')
            for item in cmpsku_items:
                # first number in the compare-button text (platform count)
                platform_num = num_pattern.search(item.find('div', attrs={'class': 'btns'}).a.text).group()
                url = "http://www.manmanbuy.com/" + item.find('div', attrs={'class': 'pic'}).a['href']
                pic_url = item.find('div', attrs={'class': 'pic'}).a.img['original']
                title = item.find('div', attrs={'class': 'name'}).a.text
                record = {
                    'platform_num': platform_num,
                    'url': url,
                    'pic_url': pic_url,
                    'title': title,
                }
                fid.write('{}\n'.format(json.dumps(record, ensure_ascii=False)))
            # locate the "next page" link and the "current/total" pager text
            page_current_total_pattern = re.compile(r'(?P<current_page>\d+)/(?P<total_page>\d+)')
            disp_item = content.find(attrs={'id': 'dispage'})
            next_url = None  # BUG FIX: original raised NameError when no next link existed
            for page_item in disp_item.find_all('a'):
                if page_item.text == '下一页':
                    next_url = 'http://www.manmanbuy.com/' + '{}'.format(page_item['href'])
            page_status = page_current_total_pattern.search(disp_item.text)
            if int(page_status['current_page']) % 10 == 0:
                fid.flush()  # persist progress every 10 pages
            if next_url is not None and int(page_status['current_page']) < int(page_status['total_page']):
                print("第",page_status['current_page'],"页数据爬取完毕,共",page_status['total_page'],"页")
                browser.get(next_url)
                time.sleep(3*np.random.rand())  # random delay to look less like a bot
            else:
                break
def parse_detail_urls(json_path=r'洗衣机.json'):
    """For each item record in *json_path* (JSON lines), visit its detail page
    and collect the per-mall offer list, writing enriched records to
    ``<name>_lst.json``.

    Relies on the module-level ``browser`` Selenium object and
    ``user_agent_list``.
    """
    result_json_path = os.path.splitext(json_path)[0] + "_lst.json"
    with open(json_path, 'r', encoding='utf-8') as fid:
        json_lines = fid.readlines()
    with open(result_json_path, 'w', encoding='utf-8') as fid:
        for cnt, line in enumerate(json_lines):
            if cnt % 20 == 0:
                # rotate the user agent and clear cookies every 20 pages
                ua = random.choice(user_agent_list)
                if ua:
                    browser.execute_cdp_cmd("Emulation.setUserAgentOverride", {"userAgent": ua})
                browser.delete_all_cookies()
            json_info = json.loads(line)
            browser.get(json_info['url'])
            time.sleep(3)
            try:
                scroll_down()
                content = BeautifulSoup(browser.page_source, 'lxml')
                pro_mall_nodes = content.find_all('div', attrs={'class': 'pro-mall-list'})
                if not pro_mall_nodes:  # find_all returns a list, never None
                    continue
                result_mall_pro_list = []
                for pro_mall in pro_mall_nodes[0].find_all('li'):
                    if pro_mall.div is None:
                        continue
                    # SECURITY: ``eval`` executes arbitrary code from scraped
                    # page content; kept for compatibility but should be
                    # replaced with a safe parser (json / ast.literal_eval).
                    pro_head_info = eval(pro_mall.div['v'])
                    if pro_head_info is None:
                        continue
                    pro_dict = {
                        'sitename': pro_head_info['sitename'],
                        'skuid': pro_mall.div['skuid'],
                    }
                    redirect_url = pro_mall.find('a')['href']
                    # relative links need the site prefix
                    if not redirect_url.startswith('http'):
                        redirect_url = 'http://www.manmanbuy.com/' + redirect_url
                    pro_dict['redirect_url'] = redirect_url
                    result_mall_pro_list.append(pro_dict)
                json_info['mall_list'] = result_mall_pro_list
                fid.write('{}\n'.format(json.dumps(json_info, ensure_ascii=False)))
                fid.flush()
            except Exception as err:
                # best-effort: log and keep going with the next record
                print('Error happened: {}'.format(err))  # fixed typo "happends"
                fid.flush()
def get_start_urls():
    """Open the price-comparison index page and return a dict mapping each
    sub-category link text to its list-page URL.

    Relies on the module-level ``browser`` Selenium object.
    """
    browser.get('http://home.manmanbuy.com/bijia.aspx')
    time.sleep(6)  # crude wait for the JS-rendered page; could use WebDriverWait
    html = browser.page_source
    response = HtmlResponse(url=browser.current_url, body=html, encoding='utf-8')
    # raw string: avoids invalid-escape warnings for the regex backslashes
    pattern = r"http://www\.manmanbuy\.com/list_.*\.aspx"
    link_extractor = LinkExtractor(allow=pattern)
    links = link_extractor.extract_links(response)
    return {link.text: link.url for link in links}
if __name__ == '__main__':
    # Stage 1: discover every sub-category list page, then crawl each into
    # its own JSON-lines snapshot file named after the category.
    # BUG FIX: removed a leftover ``pdb.set_trace()`` debug breakpoint that
    # halted the script on every run.
    detail_urls = get_start_urls()
    for typeflag, url in detail_urls.items():
        parse_page(subclass_url=url, snap_filepath='{}.json'.format(typeflag))
    # Stage 2 (run separately after stage 1 completes):
    # for _json_path in ['洗衣机.json']:
    #     parse_detail_urls(json_path=_json_path)