Selenium Chrome Driver Web Scraping in Practice

Common Selenium Configuration

Downloading Chrome Driver

All ChromeDriver versions can be downloaded from:
https://chromedriver.storage.googleapis.com/index.html

Pay attention to the version correspondence between Chrome and ChromeDriver. The browser used here is:

Version 101.0.4951.41 (Official Build) (64-bit)

Download the ChromeDriver release whose version number matches your browser. The leading version fields must match; a different final build number is fine and the driver will still work.
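
As a quick sanity check, the sketch below prints the driver's version so you can compare the major number against the browser's; it assumes chromedriver is on your PATH (otherwise substitute the full path):

import subprocess

# Print the ChromeDriver version; its major number (e.g. 101) should match Chrome's.
out = subprocess.run(['chromedriver', '--version'], capture_output=True, text=True)
print(out.stdout.strip())
# Once a webdriver.Chrome instance exists (see the next section), the browser's
# own version is available as browser.capabilities['browserVersion'].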

Common Selenium Chrome Options

import copy

from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('disable-infobars')  # hide the "Chrome is being controlled by automated software" infobar
chrome_options.add_argument('--headless')  # headless mode; more fragile, enable with care
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)  # do not load images
chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36')  # set a fixed User-Agent
chrome_options.add_argument('--disable-gpu')  # recommended in Chrome's docs to work around a bug
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_experimental_option("useAutomationExtension", False)
chrome_options.add_argument('log-level=3')  # only log fatal errors
chrome_options.add_argument("--start-maximized")  # maximize the window
chrome_options.add_argument("--disable-cache")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-browser-side-navigation")
chrome_options.add_argument("--disable-software-rasterizer")


chrome_options.binary_location = r"~/yourpath/chrome.exe"
desired_capabilities = copy.deepcopy(DesiredCapabilities.CHROME)
desired_capabilities["pageLoadStrategy"] = "none"  # return from get() without waiting for the full page load
# chrome_options.add_argument(r'--user-data-dir=C:\Users\ADMINI~1\AppData\Local\Temp\scoped_dir11488_22853\Default')  # Chrome occasionally misbehaves; point this at your own user data directory
browser = webdriver.Chrome(  # Selenium 3 style API (executable_path/desired_capabilities were removed in Selenium 4)
    options=chrome_options,
    executable_path=r"~/yourpath/chromedriver.exe",
    desired_capabilities=desired_capabilities,
)
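
A minimal smoke test of the configuration above (the URL is just an example; since pageLoadStrategy is "none", get() returns immediately, hence the short sleep):

import time

browser.get('https://www.example.com')  # example URL, replace as needed
time.sleep(3)  # with pageLoadStrategy "none", give the page a moment to load
print(browser.title)
print(browser.capabilities['browserVersion'])  # the Chrome version actually in use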

Basic Strategies for Evading Anti-Scraping Measures

User-Agent Configuration

user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"]

import random

# Rotate the User-Agent via the Chrome DevTools Protocol.
ua = random.choice(user_agent_list)
browser.execute_cdp_cmd("Emulation.setUserAgentOverride", {"userAgent": ua})
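
A minimal check that the override took effect:

browser.get('about:blank')
print(browser.execute_script('return navigator.userAgent'))  # should print the chosen UA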

Hiding Browser Automation Fingerprints

The stealth.min.js file can be downloaded separately (it is commonly generated from the puppeteer-extra-plugin-stealth project).

import os

# Inject stealth.min.js into every new document before any page script runs.
remove_webdriver_feature_js = os.path.join(os.path.dirname(__file__), 'stealth.min.js')
with open(remove_webdriver_feature_js, encoding='utf-8') as fid:
    js = fid.read()
browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": js
})
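
A minimal check that the script is being injected (exact behavior depends on the stealth.min.js build you downloaded):

browser.get('about:blank')
# With a working stealth script this typically returns None (undefined in JS),
# rather than True as in a vanilla automated Chrome.
print(browser.execute_script('return navigator.webdriver'))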

Proxy Configuration

If you need to download in bulk, you can route traffic through a proxy service; a configuration sketch follows.
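
A minimal sketch, assuming an unauthenticated HTTP proxy; the address below is a placeholder for whatever proxy service you use:

PROXY = 'http://127.0.0.1:8888'  # placeholder proxy address
chrome_options.add_argument('--proxy-server={}'.format(PROXY))
# Note: --proxy-server does not carry credentials; for authenticated proxies a
# local forwarding proxy or a Chrome extension is the usual workaround.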

Scraping the Price-Comparison Site manmanbuy (complete code; for learning purposes only, and please comply with the applicable laws and regulations when scraping)

import copy
import json
import os
import pdb
import random
import re
import time

import numpy as np
import requests
from bs4 import BeautifulSoup
from lxml import etree
from requests.exceptions import RequestException
from scrapy import Selector
from scrapy.http import HtmlResponse
from scrapy.linkextractors import LinkExtractor
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from urllib import parse
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('disable-infobars')  # hide the "Chrome is being controlled by automated software" infobar
# chrome_options.add_argument('--headless')  # headless mode; more fragile, enable with care
prefs = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", prefs)  # do not load images
chrome_options.add_argument('--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36')  # set a fixed User-Agent
chrome_options.add_argument('--disable-gpu')  # recommended in Chrome's docs to work around a bug
chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
chrome_options.add_experimental_option("useAutomationExtension", False)
chrome_options.add_argument('log-level=3')  # only log fatal errors
chrome_options.add_argument("--start-maximized")  # maximize the window
chrome_options.add_argument("--disable-cache")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-infobars")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-browser-side-navigation")
chrome_options.add_argument("--disable-software-rasterizer")

desired_capabilities = copy.deepcopy(DesiredCapabilities.CHROME)
desired_capabilities["pageLoadStrategy"] = "none"  # return from get() without waiting for the full page load
# chrome_options.add_argument(r'--user-data-dir=C:\Users\ADMINI~1\AppData\Local\Temp\scoped_dir11488_22853\Default')  # Chrome occasionally misbehaves; point this at your own user data directory
browser = webdriver.Chrome(  # Selenium 3 style API
    options=chrome_options,
    executable_path=os.path.join(os.path.dirname(__file__), 'chromedriver.exe'),
    desired_capabilities=desired_capabilities,
)
remove_webdriver_feature_js = os.path.join(os.path.dirname(__file__), 'stealth.min.js')
with open(remove_webdriver_feature_js, encoding='utf-8') as fid:
    js = fid.read()
browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
    "source": js
})
wait = WebDriverWait(browser, 10)  # wait at most 10 seconds for the page to respond
user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 "
        "(KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 "
        "(KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 "
        "(KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 "
        "(KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 "
        "(KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 "
        "(KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"]

def scroll_down():  # scroll down in steps to trigger lazily loaded content
    for _ in range(6):
        browser.execute_script('window.scrollBy(0, 2000)')
        time.sleep(0.5 * np.random.rand())

def parse_page(subclass_url='http://www.manmanbuy.com/list_6.aspx', snap_filepath='tmp.json'):
    browser.get(subclass_url)
    with open(snap_filepath, 'w', encoding='utf-8') as fid:
        failure_cnt = 0
        while True:
            scroll_down()
            try:
                wait.until(EC.presence_of_all_elements_located((By.ID, "dispage")))
            except Exception:
                browser.delete_all_cookies()
                failure_cnt += 1
                if failure_cnt > 5:  # give up after repeated timeouts
                    break
                continue
            failure_cnt = 0
            # Collect all items on the current listing page.
            content = BeautifulSoup(browser.page_source, 'lxml')
            cmpsku_items = content.find_all(attrs={'class': 'item'})
            num_pattern = re.compile(r'\d+')
            for item in cmpsku_items:
                platform_num = num_pattern.search(item.find('div', attrs={'class': 'btns'}).a.text).group()
                url = "http://www.manmanbuy.com/" + item.find('div', attrs={'class': 'pic'}).a['href']
                pic_url = item.find('div', attrs={'class': 'pic'}).a.img['original']
                title = item.find('div', attrs={'class': 'name'}).a.text

                tmp_dict = {
                    'platform_num': platform_num,
                    'url': url,
                    'pic_url': pic_url,
                    'title': title,
                }
                fid.write('{}\n'.format(json.dumps(tmp_dict, ensure_ascii=False)))
            # Locate the pager and the next-page link ('下一页' is the site's own
            # "next page" link text).
            page_current_total_pattern = re.compile(r'(?P<current_page>\d+)/(?P<total_page>\d+)')
            disp_item = content.find(attrs={'id': 'dispage'})
            next_url = None
            for page_item in disp_item.find_all('a'):
                if page_item.text == '下一页':
                    next_url = 'http://www.manmanbuy.com/' + '{}'.format(page_item['href'])
            page_status = page_current_total_pattern.search(disp_item.text)
            if int(page_status['current_page']) % 10 == 0:
                fid.flush()
            if next_url and int(page_status['current_page']) < int(page_status['total_page']):
                print("Page", page_status['current_page'], "of", page_status['total_page'], "scraped")
                # browser.find_element_by_xpath('//div[@id="dispage"]/a').click()
                browser.get(next_url)
                time.sleep(3 * np.random.rand())
            else:
                break
def parse_detail_urls(json_path=r'洗衣机.json'):
    result_json_path = os.path.splitext(json_path)[0] + "_lst.json"
    with open(json_path, 'r', encoding='utf-8') as fid:
        json_lines = fid.readlines()
    with open(result_json_path, 'w', encoding='utf-8') as fid:
        for cnt, line in enumerate(json_lines):
            if cnt % 20 == 0:
                # Rotate the User-Agent and clear cookies every 20 requests.
                ua = random.choice(user_agent_list)
                browser.execute_cdp_cmd("Emulation.setUserAgentOverride", {"userAgent": ua})
                browser.delete_all_cookies()
            json_info = json.loads(line)
            browser.get(json_info['url'])
            time.sleep(3)
            try:
                scroll_down()
                content = BeautifulSoup(browser.page_source, 'lxml')
                pro_mall_nodes = content.find_all('div', attrs={'class': 'pro-mall-list'})
                if not pro_mall_nodes:
                    continue
                pro_mall_list = pro_mall_nodes[0].find_all('li')

                result_mall_pro_list = []
                for pro_mall in pro_mall_list:
                    if pro_mall.div is None:
                        continue
                    # The 'v' attribute holds a dict-like literal; eval() on page
                    # content is risky -- consider ast.literal_eval instead.
                    pro_head_info = eval(pro_mall.div['v'])
                    if pro_head_info is None:
                        continue
                    pro_dict = {
                        'sitename': pro_head_info['sitename'],
                        'skuid': pro_mall.div['skuid'],
                    }
                    redirect_url = pro_mall.find('a')['href']
                    if not redirect_url.startswith('http'):
                        redirect_url = 'http://www.manmanbuy.com/' + redirect_url
                    pro_dict['redirect_url'] = redirect_url
                    result_mall_pro_list.append(pro_dict)
                json_info['mall_list'] = result_mall_pro_list
                fid.write('{}\n'.format(json.dumps(json_info, ensure_ascii=False)))
            except Exception as err:
                print('Error happened: {}'.format(err))
            fid.flush()
def get_start_urls():
    browser.get('http://home.manmanbuy.com/bijia.aspx')
    time.sleep(6)
    html = browser.page_source
    response = HtmlResponse(url=browser.current_url, body=html, encoding='utf-8')
    pattern = r"http://www\.manmanbuy\.com/list_.*\.aspx"
    link_extractor = LinkExtractor(allow=pattern)
    links = link_extractor.extract_links(response)
    detail_urls = {i.text: i.url for i in links}
    return detail_urls
if __name__ == '__main__':
    detail_urls = get_start_urls()
    # pdb.set_trace()  # debugging breakpoint, left disabled
    for typeflag, url in detail_urls.items():
        parse_page(subclass_url=url, snap_filepath='{}.json'.format(typeflag))
    # for _json_path in ['洗衣机.json']:
    #     parse_detail_urls(json_path=_json_path)
