Selenium自动化爬取（可视皆可爬）

最新推荐文章于 2024-05-10 16:25:53 发布

小書

最新推荐文章于 2024-05-10 16:25:53 发布

阅读量657

点赞数 1

分类专栏： python-网络爬虫

本文链接：https://blog.csdn.net/weixin_46417042/article/details/117826484

版权

python-网络爬虫专栏收录该内容

6 篇文章 0 订阅

订阅专栏

Selenium自动化爬取

1 - 基础自动化爬取数据

from selenium import webdriver
import re

# Launch a Chrome session driven by Selenium.
browser = webdriver.Chrome()
# Load the Douban short-comments page that will be scraped.
browser.get("https://movie.douban.com/subject/34973399/comments?status=P")
# Take the HTML of the page as the browser currently holds it.
html_source = browser.page_source
# Extract the text of every <span class="short"> node; re.S lets "." match
# newlines so multi-line comments are captured as well.
comments = re.compile('<span class="short">(.*?)</span>', re.S).findall(html_source)

2 - 获取所有页的全部评论（自动翻页）

from selenium import webdriver
import re
from lxml import etree
import time

# Local imports keep this snippet self-contained: By supplies the locator
# strategies and WebDriverException is the base class of Selenium's errors.
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By

# Launch a Chrome session driven by Selenium.
browser = webdriver.Chrome()
# Load the first page of Douban short comments.
browser.get("https://movie.douban.com/subject/34973399/comments?status=P")
# One entry per visited page; each entry is the list of comment strings
# matched on that page.
containers = []
# Compiled once outside the loop; re.S lets "." match newlines.
comment_pattern = re.compile('<span class="short">(.*?)</span>', re.S)
# Visit at most 100 pages, clicking "next" after scraping each one.
for i in range(100):
    try:
        # Take the HTML of the page currently shown.
        html_source = browser.page_source
        # Store this page's comments.
        containers.append(comment_pattern.findall(html_source))
        # Pause 3s between pages to avoid "too frequent access" responses.
        time.sleep(3)
        # Simulate a click on the "next page" control.
        # find_element(By.CLASS_NAME, ...) replaces find_element_by_class_name,
        # which is no longer available in current Selenium 4 releases.
        browser.find_element(By.CLASS_NAME, "next").click()
    except WebDriverException:
        # Best effort: stop paging on any Selenium failure (e.g. no "next"
        # element on the last page) without also swallowing KeyboardInterrupt
        # or programming errors the way a bare ``except:`` would.
        break

3 - 高级自动化爬取数据（突破驱动浏览器限制）

from selenium import webdriver
import re
from lxml import etree
import time

# Build the Chrome launch options.
options = webdriver.ChromeOptions()
# Exclude Chrome's "enable-automation" switch from the launch command line.
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# Disable the Blink runtime feature named "AutomationControlled".
options.add_argument("--disable-blink-features=AutomationControlled")

# Launch Chrome with the options above.
browser = webdriver.Chrome(options=options)
# Page to scrape.
browser.get("https://www.toutiao.com/c/user/token/MS4wLjABAAAALw-mNjh665zUqd8HgyWgHezdVR17TKyS_bPsZj6uFj-Ru5OY7g-CayNSKY-FJDPH/?wid=1622807436880")
# Fixed 30s pause before reading the page.
# NOTE(review): presumably gives dynamically loaded content time to render — confirm.
time.sleep(30)

# Take the HTML of the page as the browser currently holds it.
html_source = browser.page_source
# Parse the HTML and select every <div class="ugc-content"> node.
tree = etree.HTML(html_source)
info = tree.xpath('//div[@class="ugc-content"]')
# Collect the string value (XPath "string(.)") of each selected node.
# Built directly as a list instead of abusing a comprehension for its
# append() side effect, which also created a throwaway list of None.
container = [node.xpath("string(.)") for node in info]

4 - 扩展-模拟操作（输入值和点击）

from selenium import webdriver
#负责按键
from selenium.webdriver.common.keys import Keys
#负责根据id或者class等定位目标节点
from selenium.webdriver.common.by import By
#负责获取预期的节点
from selenium.webdriver.support import expected_conditions as ec
#负责等待元素出现
from selenium.webdriver.support.wait import WebDriverWait

# Launch a Chrome session driven by Selenium.
browser = webdriver.Chrome()
# Page to operate on.
browser.get("https://www.baidu.com/")
# Explicit wait with a 10s timeout (alternatives: implicit wait via
# implicitly_wait, or a fixed time.sleep).
wait = WebDriverWait(browser, 10)
# Wait until the element with id "kw" is present, then type the query into it.
wait.until(ec.presence_of_element_located((By.ID, "kw"))).send_keys("python爬虫")
# Locate the <a> link by its link text and click it.
# find_element(By.LINK_TEXT, ...) replaces find_element_by_link_text, which
# is no longer available in current Selenium 4 releases.
browser.find_element(By.LINK_TEXT, "贴吧").click()

5 - 扩展-元素定位（定位节点及获取文本或属性）

from selenium import webdriver

# Local import keeps this snippet self-contained: By supplies the locator
# strategies used by find_element / find_elements.
from selenium.webdriver.common.by import By

# Launch a Chrome session driven by Selenium.
browser = webdriver.Chrome()
# Page to scrape.
browser.get("http://news.baidu.com/guonei")
# find_element returns only the first matching node.
# (The find_element_by_* / find_elements_by_* helpers are no longer available
# in current Selenium 4 releases; the By-based calls work on both 3 and 4.)
data_one = browser.find_element(By.XPATH, "//*[@class = 'ulist mix-ulist']//li")
# find_elements returns every matching node.
data_ones = browser.find_elements(By.XPATH, "//*[@class = 'ulist mix-ulist']//li")
for item in data_ones:
    # Text content of the node.
    print(item.text)
    # Value of the "mon" attribute on the node's direct <a> child.
    print(item.find_element(By.XPATH, "./a").get_attribute("mon"))

6 - 扩展-三种等待模式（强制,隐式,显式等待）

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec
from selenium.webdriver.common.by import By
import time


# Forced wait
def force_wait():
    """Print the instant-news items after a fixed ``time.sleep`` pause.

    Opens the page in a new Chrome session, sleeps 3 seconds, then prints
    the text of every ``<li>`` under ``div#instant-news`` together with the
    ``class`` attribute of its direct ``<a>`` child. The browser is always
    closed on exit.
    """
    url = "http://news.baidu.com/guoji"
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        # Unconditional pause, regardless of whether the page is ready.
        time.sleep(3)
        # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which
        # is no longer available in current Selenium 4 releases.
        all_tag = browser.find_elements(By.XPATH, "//div[@id = 'instant-news']//li")
        for item in all_tag:
            # Text content of the node.
            print(item.text)
            # Value of the "class" attribute of the node's <a> child.
            print(item.find_element(By.XPATH, "./a").get_attribute("class"))
        print("强制等待")
    finally:
        # Release the browser and driver process even if a lookup fails.
        browser.quit()

# Implicit wait
def hidden_wait():
    """Print the instant-news items using Selenium's implicit wait.

    Opens the page in a new Chrome session, sets a 3 second implicit wait,
    then prints the text of every ``<li>`` under ``div#instant-news``
    together with the ``class`` attribute of its direct ``<a>`` child. The
    browser is always closed on exit.
    """
    url = "http://news.baidu.com/guoji"
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        # Applies to the element lookups issued after this call.
        browser.implicitly_wait(3)
        # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which
        # is no longer available in current Selenium 4 releases.
        all_tag = browser.find_elements(By.XPATH, "//div[@id = 'instant-news']//li")
        for item in all_tag:
            # Text content of the node.
            print(item.text)
            # Value of the "class" attribute of the node's <a> child.
            print(item.find_element(By.XPATH, "./a").get_attribute("class"))
        print("隐式等待")
    finally:
        # Release the browser and driver process even if a lookup fails.
        browser.quit()

# Explicit wait for specific elements
def display_wait():
    """Print the instant-news items using an explicit ``WebDriverWait``.

    Opens the page in a new Chrome session and waits, with a 3 second
    timeout, until the ``<li>`` nodes under ``div#instant-news`` are
    present. It then prints each node's text and the ``class`` attribute
    of its direct ``<a>`` child. The browser is always closed on exit,
    including when the wait times out.
    """
    url = "http://news.baidu.com/guoji"
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        wait = WebDriverWait(browser, 3)
        all_tag = wait.until(ec.presence_of_all_elements_located((By.XPATH, "//div[@id = 'instant-news']//li")))
        for item in all_tag:
            # Text content of the node.
            print(item.text)
            # Value of the "class" attribute of the node's <a> child.
            # find_element(By.XPATH, ...) replaces find_element_by_xpath, which
            # is no longer available in current Selenium 4 releases.
            print(item.find_element(By.XPATH, "./a").get_attribute("class"))
        print("显式等待")
    finally:
        # Release the browser and driver process even if the wait times out.
        browser.quit()
        
        
if __name__ == "__main__":
    # Run the three demos in order: fixed sleep, implicit wait, explicit wait.
    for demo in (force_wait, hidden_wait, display_wait):
        demo()

7 - 扩展-动作链的应用（移动到目标节点）

# Using an action chain.
# This snippet has no import section of its own, so bring in the names it
# uses; without them webdriver and ActionChains raise NameError.
from selenium import webdriver
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By

url = "https://www.runoob.com/try/try.php?filename=tryhtml5_draganddrop"
browser = webdriver.Chrome()
browser.get(url)
act = ActionChains(browser)
# Move the pointer onto a node.
# Locate the target node first.
# find_element(By.XPATH, ...) replaces find_element_by_xpath, which is no
# longer available in current Selenium 4 releases.
tag_to = browser.find_element(By.XPATH, '/html/body/div[1]/div/div[2]/div/div[1]/form/button')
# Queue the action.
act.move_to_element(tag_to)
# Execute the queued actions.
act.perform()

8 - 扩展-frame/iframe内嵌页面（操控frame里面的节点）

# Operating on nodes inside an embedded frame/iframe.
# The page contains an iframe node, so the driver has to switch into it
# before nodes inside it can be operated on.
# switch_to.frame() is given the frame's id or name attribute here.
frame_switcher = browser.switch_to
frame_switcher.frame('iframeResult')
# Jump back from the frame to the outer page.
frame_switcher.default_content()

9 - 扩展-参数对象（代理方式访问）

代码

from selenium import webdriver

# Chrome command-line argument that selects the proxy server (proxy-IP access).
proxy_argument = '--proxy-server= http://222.37.211.162:46603'
# Page opened to check whether the proxy IP is actually in use.
ip_check_url = "https://www.baidu.com/s?wd=ip"

# Create the options object and register the proxy argument on it.
opts = webdriver.ChromeOptions()
opts.add_argument(proxy_argument)
# Start the driver with those options loaded, then open the check page.
browser = webdriver.Chrome(options=opts)
browser.get(ip_check_url)

常用参数

# Commonly used Chrome arguments, registered in the order listed.
for chrome_argument in (
    '--user-agent= xxxx...',  # set the User-Agent request header
    '--proxy-server= http://222.37.211.162:46603',  # access through a proxy IP
    '--blink-settings=imagesEnabled=false',  # do not load images
    '--headless',  # run without a visible browser window
    # set the browser resolution (window size)
    # NOTE(review): Chrome's switch is usually written "width,height" — confirm "x" is accepted.
    '--window-size=1920x1280',
):
    opts.add_argument(chrome_argument)

小書

关注

1
点赞
踩
6

收藏

觉得还不错? 一键收藏
0
评论
Selenium自动化爬取（可视皆可爬）

Selenium自动化爬取1 - 基础自动化爬取数据from selenium import webdriverimport re# 初始化browser = webdriver.Chrome()# 爬取的网址browser.get("https://movie.douban.com/subject/34973399/comments?status=P")# 获取网页源代码html_source = browser.page_source# 清洗数据comments = re.fi
复制链接

扫一扫