Selenium Scraping in Practice

Original blog: 修能的博客

Preparation

  • Chrome is installed and ChromeDriver is configured
  • Python 3 is installed
  • Selenium and its related packages are installed and can drive Chrome correctly

Scraping Target

Example site for the crawler: https://spa2.scrape.center/

This site is rendered with Ajax. Looking closely at how the URLs are constructed, you will notice that the detail page URLs have turned into Base64-style strings, and the Ajax requests behind the pages carry an encrypted token parameter that changes every time. This means we cannot scrape the pages simply by constructing their URLs.

So instead we use Selenium to drive a real browser. This bypasses URL and parameter construction entirely: we simply grab the page source after JavaScript has finished rendering.
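As a quick illustration, here is a small sketch (not part of the crawler itself) that Base64-decodes one of the detail URL ids which show up later in the log output; the result is an obfuscated string rather than a plain numeric id, which is why constructing detail URLs by hand is impractical:

import base64

# One of the encrypted detail ids logged later in this article
encoded_id = 'ZWYzNCN0ZXVxMGJ0dWEjKC01N3cxcTVvNS0takA5OHh5Z2ltbHlmeHMqLSFpLTAtbWIx'

# Decoding reveals an obfuscated token-like string, not a simple counter,
# so the detail URLs cannot be guessed or constructed directly.
print(base64.b64decode(encoded_id).decode('utf-8'))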

Writing the Code

Scraping the List Pages

Page Analysis

The list page URLs are not encrypted; they follow the pattern https://spa2.scrape.center/page/{page}
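Just to make the pattern concrete, substituting the page number yields each list page URL (a trivial sketch):

INDEX_URL = 'https://spa2.scrape.center/page/{page}'

for page in (1, 10):
    # prints https://spa2.scrape.center/page/1 and https://spa2.scrape.center/page/10
    print(INDEX_URL.format(page=page))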

Setup
from urllib.parse import urljoin

from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

import logging

# Configure the logging format
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s : %(message)s')

INDEX_URL = 'https://spa2.scrape.center/page/{page}'    # URL pattern of the list pages
TIME_OUT = 10                                           # timeout in seconds
TOTAL_PAGE = 10                                         # total number of list pages

browser = webdriver.Chrome()                            # initialize a Chrome browser
wait = WebDriverWait(browser, TIME_OUT)                 # explicit wait
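
If you do not need to watch the browser window, Chrome can also be started in headless mode, which is what the complete code at the end of this article does:

options = webdriver.ChromeOptions()
options.add_argument('--headless')                      # run Chrome without a visible window

browser = webdriver.Chrome(options=options)
wait = WebDriverWait(browser, TIME_OUT)                 # explicit wait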


Page Structure

Each entry on the list page is an item card containing a link to its detail page, so we use a CSS selector to locate these nodes.

Implementation
def scrape_page(url, condition, locator):
    """Generic scraping helper.

    Args:
        url (str): URL of the page to scrape
        condition: expected condition used to decide that the page has loaded
        locator (tuple): locator tuple (By strategy, value) used to find the nodes
    """

    logging.info('scraping %s', url)
    try:
        browser.get(url)
        wait.until(condition(locator))
    except TimeoutException:
        logging.error('error occurred while scraping %s', url, exc_info=True)

def scrape_Index(page):
    """Scrape one list page.

    Args:
        page (int): page number
    """

    url = INDEX_URL.format(page=page)  # build the list page URL
    scrape_page(url=url, condition=EC.visibility_of_all_elements_located,
                locator=(By.CSS_SELECTOR, '#index .item'))
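
As a quick sanity check (assuming the setup code above has already run), you can scrape the first list page and inspect the rendered result:

# Scrape list page 1 and confirm the rendered source is available
scrape_Index(1)
print(browser.title)              # title of the rendered page
print(len(browser.page_source))   # length of the fully rendered HTML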

Parsing the List Pages

Implementation
def parse_index():
    """Extract the detail page URLs from the current list page.

    Yields:
        str: absolute URL of each detail page
    """
    elements = browser.find_elements(by=By.CSS_SELECTOR, value='#index .item .name')
    for element in elements:
        href = element.get_attribute('href')
        yield urljoin(INDEX_URL, href)

def main():
    """
    Main entry point of the crawl.
    """
    try:
        for page in range(1, TOTAL_PAGE + 1):
            scrape_Index(page)
            detail_urls = parse_index()
            logging.info('detail urls %s', list(detail_urls))
    finally:
        browser.close()

This simply iterates over all the page numbers and collects the detail page URLs from every list page in turn.

2023-07-23 12:12:08,395 - INFO : scraping https://spa2.scrape.center/page/1
2023-07-23 12:12:10,579 - INFO : detail urls ['https://spa2.scrape.center/detail/ZWYzNCN0ZXVxMGJ0dWEjKC01N3cxcTVvNS0takA5OHh5Z2ltbHlmeHMqLSFpLTAtbWIx', 'https://spa2.scrape.center/detail/ZWYzNCN0ZXVxMGJ0dWEjKC01N3cxcTVvNS0takA5OHh5Z2ltbHlmeHMqLSFpLTAtbWIy',
...
2023-07-23 12:12:20,003 - INFO : scraping https://spa2.scrape.center/page/10

As you can see, the encrypted detail URLs have been scraped.

Scraping the Detail Pages

Following the same logic, once the detail page has finished loading, we just call a function to scrape it.

def scrape_detail(url):
    """Scrape a detail page by delegating to scrape_page().

    Args:
        url (str): URL of the detail page
    """
    # the h2 node holds the movie name, so its visibility means the page has rendered
    scrape_page(url=url, condition=EC.visibility_of_all_elements_located,
                locator=(By.CSS_SELECTOR, 'h2'))


def parse_detail():
    """Parse the current detail page and return its fields.
    """
    url = browser.current_url
    name = browser.find_element(by=By.TAG_NAME, value='h2').text
    categories = [element.text for element in browser.find_elements(
        by=By.CSS_SELECTOR, value='.categories button span')]
    cover = browser.find_element(
        by=By.CSS_SELECTOR, value='.cover').get_attribute('src')
    score = browser.find_element(by=By.CLASS_NAME, value='score').text
    drama = browser.find_element(by=By.CSS_SELECTOR, value='.drama p').text

    return {
        'url': url,
        'name': name,
        'categories': categories,
        'cover': cover,
        'score': score,
        'drama': drama
    }
    
Rewriting main
def main():
    """
    Main entry point of the crawl.
    """
    try:
        for page in range(1, TOTAL_PAGE + 1):
            scrape_Index(page)
            detail_urls = parse_index()
            # materialize the generator before navigating away, otherwise the
            # list page elements it reads from would become stale
            for detail_url in list(detail_urls):
                logging.info('get detail url %s', detail_url)
                scrape_detail(detail_url)
                detail_data = parse_detail()
                logging.info('detail data %s', detail_data)
    finally:
        browser.close()

Saving to MongoDB

import pymongo

MONGO_CONNECTION_STRING = 'mongodb://localhost:27017'
MONGO_DB_NAME = 'Selenium'
MONGO_COLLECTION_NAME = 'movies'

client = pymongo.MongoClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DB_NAME]
collection = db[MONGO_COLLECTION_NAME]

def save_data(data):
    """Save one scraped record to MongoDB (upsert keyed on the movie name).

    Args:
        data (dict): the scraped data
    """
    collection.update_one({'name': data.get('name')},
                          {'$set': data}, upsert=True)
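
To verify that the records actually landed in MongoDB (assuming a local mongod is listening on the default port), a quick lookup is enough:

# Count the stored movies and fetch one document as a spot check
print(collection.count_documents({}))
print(collection.find_one({}, {'_id': 0, 'name': 1, 'score': 1}))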

Rewriting main()

def main():
    """
    Main entry point of the crawl.
    """
    try:
        for page in range(1, TOTAL_PAGE + 1):
            scrape_Index(page)
            detail_urls = parse_index()
            for detail_url in list(detail_urls):
                logging.info('get detail url %s', detail_url)
                scrape_detail(detail_url)
                detail_data = parse_detail()
                save_data(detail_data)
                logging.info('detail data saved successfully')
    finally:
        browser.close()

Complete Code

from urllib.parse import urljoin
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

import pymongo
import logging
# Configure the logging format
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s : %(message)s')

INDEX_URL = 'https://spa2.scrape.center/page/{page}'    # URL pattern of the list pages
TIME_OUT = 10                                           # timeout in seconds
TOTAL_PAGE = 10                                         # total number of list pages

MONGO_CONNECTION_STRING = 'mongodb://localhost:27017'
MONGO_DB_NAME = 'Selenium'
MONGO_COLLECTION_NAME = 'movies'

client = pymongo.MongoClient(MONGO_CONNECTION_STRING)
db = client[MONGO_DB_NAME]
collection = db[MONGO_COLLECTION_NAME]

options = webdriver.ChromeOptions()
options.add_argument('--headless')                      # run Chrome without a visible window

# initialize a headless Chrome browser
browser = webdriver.Chrome(options=options)
wait = WebDriverWait(browser, TIME_OUT)                 # explicit wait


def scrape_page(url, condition, locator):
    """Generic scraping helper.

    Args:
        url (str): URL of the page to scrape
        condition: expected condition used to decide that the page has loaded,
                   usually EC.visibility_of_all_elements_located, which waits
                   until all matched elements are visible; alternatively
                   EC.visibility_of_any_elements_located waits until at least
                   one matched element is visible.
        locator (tuple): locator tuple (By strategy, value) used to find the nodes
    """

    logging.info('scraping %s', url)
    try:
        browser.get(url)  # load the page
        wait.until(condition(locator))  # explicit wait until the condition holds
    except TimeoutException:
        logging.error('error occurred while scraping %s', url, exc_info=True)


def scrape_Index(page):
    """Scrape one list page.

    Args:
        page (int): page number
    """
    url = INDEX_URL.format(page=page)  # build the list page URL
    scrape_page(url=url, condition=EC.visibility_of_all_elements_located,
                locator=(By.CSS_SELECTOR, '#index .item'))


def parse_index():
    """Extract the detail page URLs from the current list page.

    Yields:
        str: absolute URL of each detail page
    """
    elements = browser.find_elements(
        by=By.CSS_SELECTOR, value='#index .item .name')
    for element in elements:
        href = element.get_attribute('href')
        yield urljoin(INDEX_URL, href)


def scrape_detail(url):
    """Scrape a detail page by delegating to scrape_page().

    Args:
        url (str): URL of the detail page
    """
    # the h2 node holds the movie name, so its visibility means the page has rendered
    scrape_page(url=url, condition=EC.visibility_of_all_elements_located,
                locator=(By.CSS_SELECTOR, 'h2'))


def parse_detail():
    """Parse the current detail page and return its fields.
    """
    url = browser.current_url
    name = browser.find_element(by=By.TAG_NAME, value='h2').text
    categories = [element.text for element in browser.find_elements(
        by=By.CSS_SELECTOR, value='.categories button span')]
    cover = browser.find_element(
        by=By.CSS_SELECTOR, value='.cover').get_attribute('src')
    score = browser.find_element(by=By.CLASS_NAME, value='score').text
    drama = browser.find_element(by=By.CSS_SELECTOR, value='.drama p').text

    return {
        'url': url,
        'name': name,
        'categories': categories,
        'cover': cover,
        'score': score,
        'drama': drama
    }


def save_data(data):
    """Save one scraped record to MongoDB (upsert keyed on the movie name).

    Args:
        data (dict): the scraped data
    """
    collection.update_one({'name': data.get('name')},
                          {'$set': data}, upsert=True)


def main():
    """
    Main entry point of the crawl.
    """
    try:
        for page in range(1, TOTAL_PAGE + 1):
            scrape_Index(page)
            detail_urls = parse_index()
            # materialize the generator before navigating away, otherwise the
            # list page elements it reads from would become stale
            for detail_url in list(detail_urls):
                logging.info('get detail url %s', detail_url)
                scrape_detail(detail_url)
                detail_data = parse_detail()
                save_data(detail_data)
                logging.info('detail data saved successfully')
    finally:
        browser.close()


if __name__ == '__main__':
    main()
    print('successful!')
