python+selenium浏览器模拟信息爬取

Phil_xian

已于 2022-05-18 22:45:19 修改

阅读量570

点赞数 1

分类专栏：工作需求日志 文章标签：python selenium 爬虫

于 2022-05-18 22:41:46 首次发布

本文链接：https://blog.csdn.net/phillip_xian/article/details/124852876

版权

工作需求日志专栏收录该内容

25 篇文章 0 订阅

订阅专栏

python+selenium浏览器模拟信息爬取

"""
-*- coding: utf-8 -*-
-- @author: phil
-- @Time:2022/5/18 18:45
"""
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

class Demo_douban:
    """Drive Firefox through the Douban movie home page and print one title.

    ``start_url`` is the page opened first; ``driver`` is the Firefox
    WebDriver session shared by every method.  The session is not closed by
    any method of this class.
    """

    def __init__(self):
        self.start_url = 'https://movie.douban.com/'
        # Launches a Firefox browser session.
        self.driver = webdriver.Firefox()

    def SPT_one(self):
        """Open ``start_url`` and click the first element with class ``btn-next``."""
        self.driver.get(self.start_url)
        # find_element(By.XPATH, ...) is used instead of find_element_by_xpath,
        # which no longer exists in Selenium 4.3 and later.
        self.driver.find_element(By.XPATH, "//*[@class ='btn-next']").click()
        time.sleep(1)
        print('完成类似登录！')

    def wait(self, locator, timeout=5):
        """Block until the element described by *locator* is present.

        Args:
            locator: ``(By.<strategy>, value)`` tuple passed to
                ``EC.presence_of_element_located``.
            timeout: Maximum number of seconds to wait.

        Returns:
            Whatever ``WebDriverWait.until`` returns for the condition
            (the located element).  Earlier callers that ignored the
            ``None`` return value are unaffected.
        """
        return WebDriverWait(self.driver, timeout).until(
            EC.presence_of_element_located(locator)
        )

    def Super_get(self):
        """Click the slide list's ``btn-next`` link via JavaScript, then print the URL.

        The link is clicked through ``execute_script`` rather than a native
        click; the original note says this works around links declared as
        ``href="javascript:void(0);"``.
        """
        path_a = "//div[@class='gaia gaia-lite gaia-movie slide-mode']/div[@class='list-wp']/div/div[2]/a[@class='btn-next']"
        self.wait((By.XPATH, path_a))
        # Look the element up again right before using it, to avoid acting on
        # a stale reference obtained earlier.
        elements = self.driver.find_elements(By.XPATH, path_a)
        if elements:
            # Simulate the user's click from JavaScript.
            self.driver.execute_script('arguments[0].click();', elements[0])
            print('已经点击！')
        else:
            # The plural lookup yields an empty list instead of raising
            # NoSuchElementException, so the "not found" case is detected here.
            print('NoSuchElementException')
        time.sleep(3)
        print(self.driver.current_url)

    def bs4_execute(self, url_a='https://movie.douban.com/subject/35250237/?tag=热门&from=gaia'):
        """Parse the anchor whose ``href`` equals *url_a* and print the movie title.

        The anchor's inner HTML is parsed with BeautifulSoup (``html5lib``
        parser); the first whitespace-separated token of its first ``<p>``
        element is printed as the title.

        Args:
            url_a: Exact ``href`` value of the anchor to read.  Defaults to
                the subject link the script was written for.
        """
        ele = self.driver.find_element(By.XPATH, "//a[@href = '{}' ]".format(url_a))
        html = ele.get_attribute('innerHTML')
        print('html是：' + html)
        time.sleep(1)
        soup = BeautifulSoup(html, 'html5lib')
        # Search for the child tag inside the extracted fragment only.
        paragraph = soup.find('p')
        if paragraph is None:
            target = []
        else:
            target = paragraph.get_text().replace('\n', '').split()
        if not target:
            # No <p> element, or it holds no text: report it instead of
            # failing with AttributeError / IndexError.
            print('未找到电影名称')
            return
        print('电影名称：' + '\n' + '{}'.format(target[0]))

# Script entry point: run the three scraping steps in order.
if __name__ == '__main__':
    # The class defined in this file is Demo_douban; the previous name
    # (Demo_CSDNweb) is not defined here and raised NameError.
    SPT = Demo_douban()
    SPT.SPT_one()
    SPT.Super_get()
    SPT.bs4_execute()