Notion文章简单下载【爬虫单个/批量】——Python+selenium

小白要努力变黑

于 2024-03-20 17:58:31 发布

阅读量459

点赞数 4

文章标签： python notion selenium

本文链接：https://blog.csdn.net/m0_51315555/article/details/136883390

版权

Notion文章样式——列表，文章

Notion文章下载——保存为txt,只要文字部分

Notion文章下载——保存html、音频

Notion文章样式——列表，文章

Notion文章下载——保存为txt,只要文字部分

技术注意：

        selenium版本需要4以上

功能：

文章下载：txt

单个下载

批量下载（使用线程池）



失败链接重启：

下载失败的文章，链接会保存在errors.txt文件中，

读取后，重新下载，直到文件中没有失败链接。



校验：

查看文章是否下载完整，本地文章名称对比网页列表名称。



使用注意：

URL、SRC需要修改为自己的链接和目录。

不一样的文章元素截取位置可能不同，BeautifulSoup中的find/findALL截取内容需要看网页情况进行调整。

文章的名称和时间格式可能不同。

                文章标题中不能用于文件名的特殊符号，由 sanitize_filename() 方法过滤。

                可处理的时间格式有 “April 1, 2023” 和 “2023/4/1” 两种，其他格式需要自行修改。

"""
保存notion文章
    只保存文本内容——.txt
    提高效率——线程池批量保存notion文章
    保存失败——保存失败文章的链接重新下载
    最后校验——本地已下载文章名称对比网页文章的名称,是否已下载完整

pip install selenium 版本是4以上，不然无法直接使用Options
"""
import os

import requests
from selenium import webdriver
# 在这里导入浏览器设置相关的类
from selenium.webdriver.edge.options import Options

from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool

from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

# 需要批量下载的notion文章链接
from selenium.webdriver.support.wait import WebDriverWait

# URL of the Notion page that lists the articles to download
URL = "https://rabbit-hole.notion.site/35357ab38a0042bcb5e0bfd762d4d495?v=ba4854c9878f4ae68e84d12634939daa"
# Local directory the files are saved to. The download functions build paths
# by plain string concatenation (SRC + name), so keep the trailing separator.
SRC = "C:\\Users\\23938\\Desktop\\txt\\"


# 获取页面所有文章链接
def get_allUrl(url):
    """Collect the link of every article shown on a Notion list page.

    The page is opened in a headless Edge browser. Rendering is considered
    finished as soon as an ``<img>`` element is visible (10 s at most; raise
    the timeout on a slow network).

    Args:
        url: Address of the Notion page that lists the articles.

    Returns:
        A list with the ``href`` of every ``<a>`` tag whose ``rel`` attribute
        is ``noopener noreferrer``.

    Raises:
        selenium.common.exceptions.TimeoutException: If no image becomes
            visible within the timeout.
    """
    edge_options = Options()
    edge_options.add_argument('--headless')  # no browser window
    # Content-setting preference that tells the browser not to load images.
    prefs = {"profile.managed_default_content_settings.images": 2}
    edge_options.add_experimental_option("prefs", prefs)
    # Optional anti-detection switches, disabled by default:
    # edge_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # edge_options.add_argument('--disable-blink-features=AutomationControlled')
    driver = webdriver.Edge(options=edge_options)

    try:
        driver.get(url)
        WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.TAG_NAME, 'img')))
        html_content = driver.page_source  # HTML after rendering
    finally:
        # Always close the browser, also when the wait times out; otherwise
        # the headless Edge process is left running.
        driver.quit()

    soup = BeautifulSoup(html_content, 'html.parser')

    # Requiring an href as well avoids a KeyError on anchors without a target.
    all_a = [a['href'] for a in
             soup.findAll("a", attrs={"rel": "noopener noreferrer", "href": True})]

    print("链接获取成功")

    return all_a


# 下载文章方法
def get_html_data(url):
    """Download one Notion article and save its text as a ``.txt`` file.

    The article is rendered in a headless Edge browser. Its title, its date
    and every ``div`` with ``placeholder=" "`` are extracted, and the text is
    written to ``SRC + <date>_<title>.txt``. Dates of the form
    ``April 1, 2023`` or ``2023/4/1`` are rewritten as ``2023_4_1``.

    Any failure is reported on stdout and the link is appended to
    ``../errors.txt`` so that it can be retried later; nothing is raised.

    Args:
        url: Link of the article page.
    """
    edge_options = Options()
    edge_options.add_argument('--headless')  # no browser window
    # Content-setting preference that tells the browser not to load images.
    prefs = {"profile.managed_default_content_settings.images": 2}
    edge_options.add_experimental_option("prefs", prefs)
    # Optional anti-detection switches, disabled by default:
    # edge_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # edge_options.add_argument('--disable-blink-features=AutomationControlled')

    # Defined before the try block so the error handler can always print it,
    # even when the failure happens before the title has been read.
    title = ""

    try:
        driver = webdriver.Edge(options=edge_options)
        try:
            driver.get(url)
            # Rendering counts as finished once an <img> is visible (10 s max).
            WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.TAG_NAME, 'img')))
            html_content = driver.page_source
        finally:
            # Close the browser on success and on failure alike.
            driver.quit()

        soup = BeautifulSoup(html_content, 'html.parser')

        raw_title = soup.title.string if soup.title is not None else None
        if raw_title is None:
            raise ValueError("page has no <title> text")
        title = str(raw_title)
        # File-name-safe variant of the title.
        title_new = sanitize_filename(title)

        # The date is the first div carrying exactly this inline style.
        timeDiv = soup.find("div", attrs={
            "style": "line-height: 1.5; word-break: break-word; white-space: pre-wrap; display: inline;"})
        if timeDiv is None:
            raise ValueError("date element not found")
        timeText = timeDiv.get_text()
        if "," in timeText:
            # April 1, 2023 ==> 2023_4_1
            words = timeText.replace(",", "").split()
            month_num = month_to_number(words[0])
            timeText = words[2] + '_' + str(month_num) + '_' + words[1]
        elif "/" in timeText:
            # 2023/4/1 ==> 2023_4_1
            timeText = timeText.replace("/", '_')

        text_Div = soup.findAll("div", attrs={"placeholder": " "})
        if not text_Div:
            print("没有获取到指定内容，文章加载失败：" + url)
            with open('../errors.txt', 'a') as f:
                f.write(f'{url}\n')
            # Do not fall through to the success message below.
            return

        with open(SRC + timeText + '_' + title_new + ".txt", 'w', encoding='utf-8') as file:
            file.write('《' + title + '》\n')
            for text in text_Div:
                file.write("\t" + text.get_text() + '\n')
            file.write('\n\n')

        print("文章" + title + "下载完成")

    except Exception as e:
        # str(e): concatenating the exception object itself raises TypeError.
        print("下载异常的文章:" + title + url + "\n" + str(e))
        with open('../errors.txt', 'a') as f:
            f.write(f'{url}\n')


# 合并保存文章方法
def get_html_data_one(url):
    """Download one Notion article and append its text to ``SRC + "all.txt"``.

    Works like the per-article download, but every article is appended to a
    single shared file, headed by ``《<date>_<title>》``. Dates of the form
    ``April 1, 2023`` or ``2023/4/1`` are rewritten as ``2023_4_1``.

    Any failure is reported on stdout and the link is appended to
    ``../errors.txt`` so that it can be retried later; nothing is raised.

    Args:
        url: Link of the article page.
    """
    edge_options = Options()
    edge_options.add_argument('--headless')  # no browser window
    # Content-setting preference that tells the browser not to load images.
    prefs = {"profile.managed_default_content_settings.images": 2}
    edge_options.add_experimental_option("prefs", prefs)
    # Optional anti-detection switches, disabled by default:
    # edge_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # edge_options.add_argument('--disable-blink-features=AutomationControlled')

    # Defined before the try block so the error handler can always print it,
    # even when the failure happens before the title has been read.
    title = ""

    try:
        driver = webdriver.Edge(options=edge_options)
        try:
            driver.get(url)
            # Rendering counts as finished once an <img> is visible (10 s max).
            WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.TAG_NAME, 'img')))
            html_content = driver.page_source
        finally:
            # Close the browser on success and on failure alike.
            driver.quit()

        soup = BeautifulSoup(html_content, 'html.parser')

        raw_title = soup.title.string if soup.title is not None else None
        if raw_title is None:
            raise ValueError("page has no <title> text")
        title = str(raw_title)
        # File-name-safe variant of the title.
        title_new = sanitize_filename(title)
        print(title_new)

        # The date is the first div carrying exactly this inline style.
        timeDiv = soup.find("div", attrs={
            "style": "line-height: 1.5; word-break: break-word; white-space: pre-wrap; display: inline;"})
        if timeDiv is None:
            raise ValueError("date element not found")
        timeText = timeDiv.get_text()
        print(timeText)
        if "," in timeText:
            # April 1, 2023 ==> 2023_4_1
            words = timeText.replace(",", "").split()
            month_num = month_to_number(words[0])
            timeText = words[2] + '_' + str(month_num) + '_' + words[1]
        elif "/" in timeText:
            # 2023/4/1 ==> 2023_4_1
            timeText = timeText.replace("/", '_')

        text_Div = soup.findAll("div", attrs={"placeholder": " "})
        if not text_Div:
            print("没有获取到指定内容，文章加载失败：" + url)
            with open('../errors.txt', 'a') as f:
                f.write(f'{url}\n')
            # Do not fall through to the success message below.
            return

        with open(SRC + "all.txt", 'a', encoding='utf-8') as file:
            file.write('《' + timeText + '_' + title_new + '》\n')
            for text in text_Div:
                file.write("\t" + text.get_text() + '\n')
            file.write('\n\n')

        print("文章" + title + "加入成功")

    except Exception as e:
        print("加入异常的文章:" + title + url)
        print(e)
        with open('../errors.txt', 'a') as f:
            f.write(f'{url}\n')


# 过滤掉名称符号
def sanitize_filename(filename):
    """Remove characters that cannot be used in a Windows file name.

    Args:
        filename: Candidate file name, for example an article title.

    Returns:
        ``filename`` without line breaks (CR and LF) and without any of the
        characters ``< > : " / | ? *`` or the backslash.
    """
    # The backslash is escaped explicitly: the former '\|' spelling is an
    # invalid escape sequence that recent Python versions warn about.
    illegal_chars = '<>:"/\\|?*\r\n'
    # One pass over the string instead of one replace() call per character.
    return filename.translate({ord(char): None for char in illegal_chars})


# 月份匹配
def month_to_number(month_name):
    """Translate a full English month name into its number (1-12).

    The match is exact and case-sensitive.

    Args:
        month_name: Month name such as ``'April'``.

    Returns:
        The month number, or ``None`` when the name is not recognised.
    """
    ordered_names = (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    )
    number_by_name = {name: position for position, name in enumerate(ordered_names, start=1)}
    return number_by_name.get(month_name)


# 线程池(线程池最大数，需要使用线程的方法，传给方法的参数)
def use_pool(max, async_function, numbers):
    """Run ``async_function`` once per item of ``numbers`` on a thread pool.

    Blocks until every item has been processed. An exception raised by
    ``async_function`` is re-raised here by ``pool.map``.

    Args:
        max: Number of worker threads.
        async_function: Callable taking one item.
        numbers: Iterable of items to hand to ``async_function``.
    """
    pool = Pool(max)
    try:
        pool.map(async_function, numbers)
    finally:
        # Shut the workers down even when map() raised, so that no threads
        # are left behind.
        pool.close()  # accept no further work
        pool.join()  # wait for the workers to exit


# 重新读取获取失败链接
def get_error_a():
    """Read the failed links from ``../errors.txt`` and empty the file.

    Returns:
        The set of non-blank lines of the file, stripped of surrounding
        whitespace. An empty set when the file does not exist, which is the
        case when no download has failed yet.
    """
    try:
        with open('../errors.txt', 'r') as file:
            # Blank lines are skipped so that no empty "link" is retried.
            url_set_error = {line.strip() for line in file if line.strip()}
    except FileNotFoundError:
        return set()

    # Opening in 'w' mode already truncates the file.
    with open('../errors.txt', 'w'):
        pass

    return url_set_error


# 校验是否下载完整 (线上链接名称 对比 下载完的名称一一对应)
# 1、获取本地信息名称 html文件的名称
def get_html_filenames(directory):
    """List the article titles of the ``.txt`` files saved in ``directory``.

    File names are expected to look like ``2024_2_3_<title>.txt``; the three
    date parts and the extension are dropped.

    Args:
        directory: Folder that holds the downloaded files.

    Returns:
        A list of sanitised titles. Files that do not follow the naming
        pattern (for example ``all.txt``) are ignored.
    """
    suffix = '.txt'
    html_filenames = []
    for filename in os.listdir(directory):
        if not filename.endswith(suffix):
            continue
        # Cut only the trailing extension and split at most three times, so a
        # title that itself contains '_' or '.txt' is kept whole.
        parts = filename[:-len(suffix)].split("_", 3)
        if len(parts) < 4:
            continue
        html_filenames.append(sanitize_filename(parts[3]))
    return html_filenames


# 2、获取页面的所有名称和链接
# 文章类：名称和链接
class UrlAndName:
    """An article from the list page: its title and its link."""

    def __init__(self, title, url):
        self.title, self.url = title, url

    def get_title(self):
        """Return the article title."""
        return self.title

    def get_url(self):
        """Return the article link."""
        return self.url

    def __str__(self):
        return '名称：{}链接：{}'.format(self.title, self.url)


# 获取html中的 文章名称与链接 (要解析的html,保存在UrlAndName类中)
def get_html_aAndName(html_content):
    """Extract the title and link of every article on the list page.

    Args:
        html_content: HTML of the Notion list page.

    Returns:
        A list of ``UrlAndName`` objects, one per collection item that has
        both a link and a title. Titles are passed through
        ``sanitize_filename``.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    all_Div = soup.findAll("div", attrs={"class": "notion-selectable notion-page-block notion-collection-item"})
    html_a_name = []
    for Div in all_Div:
        # Search inside the item directly; parsing str(Div) again is not needed.
        link = Div.find("a", attrs={"rel": "noopener noreferrer"})
        html_name = Div.find("div", attrs={"placeholder": "Untitled"})
        if link is None or html_name is None or not link.get('href'):
            # An incomplete item must not abort the whole listing.
            print("列表项缺少链接或标题，已跳过")
            continue
        html_a_name.append(UrlAndName(sanitize_filename(html_name.get_text()), link['href']))
    return html_a_name
    # for UrlAndName in html_a_name:
    #     print(f'名称：{UrlAndName.get_title()}链接：{UrlAndName.get_url()}\n')


# 获取所有文章名称
def get_html_names(html_content):
    """Return the sanitised title of every article found in ``html_content``.

    A title is the text of a ``div`` whose ``placeholder`` attribute is
    ``Untitled``.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')
    title_divs = parsed.findAll("div", attrs={"placeholder": "Untitled"})
    return [sanitize_filename(title_div.get_text()) for title_div in title_divs]


# 2、进行对比
def get_end_ok(html_a_name, filter_names):
    """Report which articles of the list page are missing locally.

    Prints either a success message or the number of distinct missing titles
    followed by every missing article, in the order of the list page.

    Args:
        html_a_name: Articles of the list page; each item offers
            ``get_title()`` and a printable form.
        filter_names: Titles of the files that were downloaded.
    """
    downloaded = set(filter_names)
    # A single pass in page order gives stable output and avoids rescanning
    # the whole list for every missing title.
    missing = [item for item in html_a_name if item.get_title() not in downloaded]

    if not missing:
        print("所有文章成功下载本地")
        return

    distinct_titles = {item.get_title() for item in missing}
    print("本地没有下载的文章:------" + str(len(distinct_titles)) + "------")
    for item in missing:
        print(item)
    print("------ 本地没有下载的文章 ------")


# 获取文章列表html
def get_html_list():
    """Return the rendered HTML of the article list page ``URL``.

    The page is opened in a headless Edge browser. Rendering is considered
    finished as soon as an ``<img>`` element is visible (10 s at most; raise
    the timeout on a slow network).

    Raises:
        selenium.common.exceptions.TimeoutException: If no image becomes
            visible within the timeout.
    """
    edge_options = Options()
    edge_options.add_argument('--headless')  # no browser window
    # Content-setting preference that tells the browser not to load images.
    prefs = {"profile.managed_default_content_settings.images": 2}
    edge_options.add_experimental_option("prefs", prefs)

    driver = webdriver.Edge(options=edge_options)
    try:
        driver.get(URL)
        WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.TAG_NAME, 'img')))
        html_content = driver.page_source  # HTML after rendering
    finally:
        # Always close the browser, also when the wait times out.
        driver.quit()

    return html_content


if __name__ == '__main__':
    # Create the output directory up front. Without it every article fails
    # when its file is opened and only ends up in errors.txt.
    os.makedirs(SRC, exist_ok=True)

    # -------------------- Download --------------------
    # Single article:
    # get_html_data(<article url>)

    # Batch download.
    # Link source, option 1: request the list page. Not recommended when the
    # list needs a "load more" click to show every article, because that click
    # would have to be scripted with selenium first.
    all_a = get_allUrl(URL)

    # Link source, option 2 (recommended): save the fully loaded list page as a
    # local html file and read the links from it. This is faster and keeps the
    # link list complete even when the page loads more articles on demand.
    # with open('../index.html', 'r', encoding='utf-8') as file:
    #     html_content = file.read()
    # soup = BeautifulSoup(html_content, 'html.parser')
    # all_a = [a['href'] for a in soup.findAll("a", attrs={"rel": "noopener noreferrer"})]

    # Download with the thread pool, one text file per article.
    use_pool(5, get_html_data, all_a)

    # -------------------- Retry failed downloads --------------------
    # Keep retrying for as long as errors.txt receives failed links (unstable).
    # error_urls = get_error_a()
    # while error_urls:
    #     use_pool(5, get_html_data, error_urls)
    #     error_urls = get_error_a()

    # -------------------- Verification --------------------
    # Compare the titles on the list page with the names of the local files.
    # List page, option 1 (not recommended when articles load on demand,
    # unless the click is added to get_html_list):
    # html_content = get_html_list()
    #
    # List page, option 2 (recommended): the locally saved html file.
    # with open('../index.html', 'r', encoding='utf-8') as file:
    #     html_content = file.read()
    # get_end_ok(get_html_aAndName(html_content), get_html_filenames(SRC))

    # --------------------------------------------------------------------------
    # ---------- Download: all articles merged into one text file ----------
    # (drawback: hard to verify)
    # use_pool(5, get_html_data_one, all_a)

    # ---------- Retry failed downloads for the merged file (unstable) ----------
    # error_urls = get_error_a()
    # while error_urls:
    #     # Pass the function itself, do not call it here.
    #     use_pool(5, get_html_data_one, error_urls)
    #     error_urls = get_error_a()

Notion文章下载——保存html、音频

"""
    pip install selenium 版本是4以上，不然无法使用Options
    pip install requests
    pip install bs4
    multiprocessing 属于 Python 标准库，无需 pip 安装

    保存notion文章
        文章 保存整个html
        音频 如果文章有音频链接单独下载

    保存下载失败的notion文章链接——再次下载

    用文章名称对比校验是否已经下载完整

"""
import concurrent.futures
import os

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
# 在这里导入浏览器设置相关的类
from selenium.webdriver.edge.options import Options

from bs4 import BeautifulSoup
from multiprocessing.dummy import Pool

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC


# 过滤掉名称符号
def sanitize_filename(filename):
    """Remove characters that cannot be used in a Windows file name.

    Args:
        filename: Candidate file name, for example an article title.

    Returns:
        ``filename`` without line breaks (CR and LF) and without any of the
        characters ``< > : " / | ? *`` or the backslash.
    """
    # The backslash is escaped explicitly: the former '\|' spelling is an
    # invalid escape sequence that recent Python versions warn about.
    illegal_chars = '<>:"/\\|?*\r\n'
    # One pass over the string instead of one replace() call per character.
    return filename.translate({ord(char): None for char in illegal_chars})


# 校验是否下载完整 (线上链接名称 对比 下载完的名称一一对应)
# 1、获取本地信息名称 html文件的名称
def get_html_filenames(directory):
    """List the article titles of the ``.html`` files saved in ``directory``.

    File names are expected to look like ``2024_2_3_<title>.html``; the three
    date parts and the extension are dropped.

    Args:
        directory: Folder that holds the downloaded files.

    Returns:
        A list of sanitised titles. Files that do not follow the naming
        pattern are ignored.
    """
    suffix = '.html'
    html_filenames = []
    for filename in os.listdir(directory):
        if not filename.endswith(suffix):
            continue
        # Cut only the trailing extension and split at most three times, so a
        # title that itself contains '_' or '.html' is kept whole.
        parts = filename[:-len(suffix)].split("_", 3)
        if len(parts) < 4:
            continue
        html_filenames.append(sanitize_filename(parts[3]))
    return html_filenames


# 2、获取页面的所有名称和链接
# 文章类：名称和链接
class UrlAndName:
    """An article from the list page: its title and its link."""

    def __init__(self, title, url):
        self.title, self.url = title, url

    def get_title(self):
        """Return the article title."""
        return self.title

    def get_url(self):
        """Return the article link."""
        return self.url

    def __str__(self):
        return '名称：{}链接：{}'.format(self.title, self.url)


# 获取html中的 文章名称与链接 (要解析的html,保存在UrlAndName类中)
def get_html_aAndName(html_content):
    """Extract the title and link of every article on the list page.

    Args:
        html_content: HTML of the Notion list page.

    Returns:
        A list of ``UrlAndName`` objects, one per collection item that has
        both a link and a title. Titles are passed through
        ``sanitize_filename``.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    all_Div = soup.findAll("div", attrs={"class": "notion-selectable notion-page-block notion-collection-item"})
    html_a_name = []
    for Div in all_Div:
        # Search inside the item directly; parsing str(Div) again is not needed.
        link = Div.find("a", attrs={"rel": "noopener noreferrer"})
        html_name = Div.find("div", attrs={"placeholder": "Untitled"})
        if link is None or html_name is None or not link.get('href'):
            # An incomplete item must not abort the whole listing.
            print("列表项缺少链接或标题，已跳过")
            continue
        # get_text() also collects text nested in child tags such as <span>,
        # so a title wrapped in a span needs no special handling.
        html_a_name.append(UrlAndName(sanitize_filename(html_name.get_text()), link['href']))
    return html_a_name
    # for UrlAndName in html_a_name:
    #     print(f'名称：{UrlAndName.get_title()}链接：{UrlAndName.get_url()}\n')


# 获取所有文章名称
def get_html_names(html_content):
    """Return the sanitised title of every article found in ``html_content``.

    A title is the text of a ``div`` whose ``placeholder`` attribute is
    ``Untitled``. Titles are passed through ``sanitize_filename`` so that they
    can be compared with the names of the saved files and agree with the
    titles produced by ``get_html_aAndName``.

    Args:
        html_content: HTML of the Notion list page.

    Returns:
        A list of sanitised titles in page order.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    # get_text() also collects text nested in child tags such as <span>.
    return [sanitize_filename(divName.get_text())
            for divName in soup.findAll("div", attrs={"placeholder": "Untitled"})]


# 2、进行对比
def get_end_ok(html_a_name, filter_names):
    """Report which articles of the list page are missing locally.

    Prints either a success message or the number of distinct missing titles
    followed by every missing article, in the order of the list page.

    Args:
        html_a_name: Articles of the list page; each item offers
            ``get_title()`` and a printable form.
        filter_names: Titles of the files that were downloaded.
    """
    downloaded = set(filter_names)
    # A single pass in page order gives stable output and avoids rescanning
    # the whole list for every missing title.
    missing = [item for item in html_a_name if item.get_title() not in downloaded]

    if not missing:
        print("所有文章成功下载本地")
        return

    distinct_titles = {item.get_title() for item in missing}
    print("本地没有下载的文章:------" + str(len(distinct_titles)) + "------")
    for item in missing:
        print(item)
    print("------ 本地没有下载的文章 ------")


# 重新读取获取失败链接
def get_error_a():
    """Read the failed links from ``../errors.txt`` and empty the file.

    Returns:
        The set of non-blank lines of the file, stripped of surrounding
        whitespace. An empty set when the file does not exist, which is the
        case when no download has failed yet.
    """
    try:
        with open('../errors.txt', 'r') as file:
            # Blank lines are skipped so that no empty "link" is retried.
            url_set_error = {line.strip() for line in file if line.strip()}
    except FileNotFoundError:
        return set()

    # Opening in 'w' mode already truncates the file.
    with open('../errors.txt', 'w'):
        pass

    return url_set_error


# 月份匹配
def month_to_number(month_name):
    """Translate a full English month name into its number (1-12).

    The match is exact and case-sensitive.

    Args:
        month_name: Month name such as ``'April'``.

    Returns:
        The month number, or ``None`` when the name is not recognised.
    """
    ordered_names = (
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    )
    number_by_name = {name: position for position, name in enumerate(ordered_names, start=1)}
    return number_by_name.get(month_name)


# 下载文章方法
def get_html_data(url):
    """Download one Notion article as ``.html`` plus any audio it embeds.

    The article is rendered in a headless Edge browser. The rendered HTML is
    saved as ``<date>_<title>.html`` in the target directory, and the file
    behind every ``<audio src=...>`` tag is saved next to it as ``.mp3``.
    Dates of the form ``April 1, 2023`` or ``2023/4/1`` are rewritten as
    ``2023_4_1``.

    Any failure is reported on stdout and the link is appended to
    ``../errors.txt`` so that it can be retried later; nothing is raised.

    Args:
        url: Link of the article page.
    """
    # Target directory (adjust to your machine). Paths are built by string
    # concatenation, so keep the trailing separator.
    src = "C:\\Users\\23938\\Desktop\\txt\\"

    edge_options = Options()
    edge_options.add_argument('--headless')  # no browser window
    # Content-setting preference that tells the browser not to load images.
    prefs = {"profile.managed_default_content_settings.images": 2}
    edge_options.add_experimental_option("prefs", prefs)
    # Optional anti-detection switches, disabled by default:
    # edge_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # edge_options.add_argument('--disable-blink-features=AutomationControlled')

    # Defined before the try block so the error handler can always print it,
    # even when the failure happens before the title has been read.
    title = ""

    try:
        driver = webdriver.Edge(options=edge_options)
        try:
            driver.get(url)
            # Rendering counts as finished once an <img> is visible (10 s max).
            WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.TAG_NAME, 'img')))
            html_content = driver.page_source
        finally:
            # Close the browser on success and on failure alike.
            driver.quit()

        soup = BeautifulSoup(html_content, 'html.parser')

        raw_title = soup.title.string if soup.title is not None else None
        if raw_title is None:
            raise ValueError("page has no <title> text")
        # File-name-safe title.
        title = sanitize_filename(raw_title)

        # The date is the first div carrying exactly this inline style.
        timeDiv = soup.find("div", attrs={
            "style": "line-height: 1.5; word-break: break-word; white-space: pre-wrap; display: inline;"})
        if timeDiv is None:
            raise ValueError("date element not found")
        timeText = timeDiv.get_text()
        if "," in timeText:
            # April 1, 2023 ==> 2023_4_1
            words = timeText.replace(",", "").split()
            month_num = month_to_number(words[0])
            timeText = words[2] + '_' + str(month_num) + '_' + words[1]
        elif "/" in timeText:
            # 2023/4/1 ==> 2023_4_1
            timeText = timeText.replace("/", '_')

        base_name = src + timeText + '_' + title

        audio_count = 0
        for audio_tag in soup.findAll('audio'):
            audio_url = audio_tag.get('src')
            if not audio_url:
                # Nothing to fetch for an <audio> tag without a src attribute.
                continue
            # A timeout keeps a stalled server from blocking the worker
            # thread forever; an HTTP error must not be saved as audio.
            audio_response = requests.get(audio_url, timeout=30)
            audio_response.raise_for_status()
            audio_count += 1
            # The first file keeps the plain name; further files get a
            # numeric suffix so they do not overwrite each other.
            suffix = '' if audio_count == 1 else '_' + str(audio_count)
            with open(base_name + suffix + '.mp3', 'wb') as file:
                file.write(audio_response.content)

        with open(base_name + '.html', 'w', encoding='utf-8') as file:
            file.write(html_content)

        print("文章" + title + "下载完成")

    except Exception as e:
        # str(e): concatenating the exception object itself raises TypeError.
        print("下载失败的文章:" + title + url + "\n" + str(e))
        with open('../errors.txt', 'a') as f:
            f.write(f'{url}\n')


# 获取页面所有文章链接
def get_allUrl(url):
    """Collect the link of every article shown on a Notion list page.

    The page is opened in a headless Edge browser. Rendering is considered
    finished as soon as an ``<img>`` element is visible (10 s at most; raise
    the timeout on a slow network).

    Args:
        url: Address of the Notion page that lists the articles.

    Returns:
        A list with the ``href`` of every ``<a>`` tag whose ``rel`` attribute
        is ``noopener noreferrer``.

    Raises:
        selenium.common.exceptions.TimeoutException: If no image becomes
            visible within the timeout.
    """
    edge_options = Options()
    edge_options.add_argument('--headless')  # no browser window
    # Content-setting preference that tells the browser not to load images.
    prefs = {"profile.managed_default_content_settings.images": 2}
    edge_options.add_experimental_option("prefs", prefs)
    # Optional anti-detection switches, disabled by default:
    # edge_options.add_experimental_option('excludeSwitches', ['enable-automation'])
    # edge_options.add_argument('--disable-blink-features=AutomationControlled')
    driver = webdriver.Edge(options=edge_options)

    try:
        driver.get(url)
        WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.TAG_NAME, 'img')))
        html_content = driver.page_source  # HTML after rendering
    finally:
        # Always close the browser, also when the wait times out; otherwise
        # the headless Edge process is left running.
        driver.quit()

    soup = BeautifulSoup(html_content, 'html.parser')

    # Requiring an href as well avoids a KeyError on anchors without a target.
    all_a = [a['href'] for a in
             soup.findAll("a", attrs={"rel": "noopener noreferrer", "href": True})]

    return all_a


# 使用线程池异步执行函数 参数：(线程池最大数，需要使用线程的方法，传给方法的参数) 【失败率太高】
def use_thread_pool(max, async_function, numbers):
    """Run ``async_function`` once per item of ``numbers`` on an executor.

    Blocks until every item has been processed.

    Args:
        max: Maximum number of worker threads.
        async_function: Callable taking one item.
        numbers: Iterable of items to hand to ``async_function``.

    Raises:
        Exception: Whatever ``async_function`` raised for the first failing
            item, re-raised after the remaining tasks have finished.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max) as executor:
        # executor.map() is lazy about errors: an exception only surfaces when
        # its result is read. Consuming the results keeps failures from
        # disappearing silently.
        list(executor.map(async_function, numbers))


# 线程池(线程池最大数，需要使用线程的方法，传给方法的参数)
def use_pool(max, async_function, numbers):
    """Run ``async_function`` once per item of ``numbers`` on a thread pool.

    Blocks until every item has been processed. An exception raised by
    ``async_function`` is re-raised here by ``pool.map``.

    Args:
        max: Number of worker threads.
        async_function: Callable taking one item.
        numbers: Iterable of items to hand to ``async_function``.
    """
    pool = Pool(max)
    try:
        pool.map(async_function, numbers)
    finally:
        # Shut the workers down even when map() raised, so that no threads
        # are left behind.
        pool.close()  # accept no further work
        pool.join()  # wait for the workers to exit


if __name__ == '__main__':
    # -------------------- Collect the links --------------------
    # Option 1: request the list page.
    all_a = get_allUrl("https://rabbit-hole.notion.site/d6bdaead3edf4d35b955490c8d079e25")

    # Option 2 (recommended): read a locally saved copy of the list page.
    # Fetching the list through the browser is slow and error-prone, so the
    # page can be saved by hand and parsed from disk instead.
    # with open('../index.html', 'r', encoding='utf-8') as file:
    #     html_content = file.read()
    # soup = BeautifulSoup(html_content, 'html.parser')
    # all_a = [a['href'] for a in soup.findAll("a", attrs={"rel": "noopener noreferrer"})]

    # -------------------- Download --------------------
    # Single download for testing:
    # get_html_data("https://rabbit-hole.notion.site/ea7d3498cb244d9db8cb0d28a5f45ef0?pvs=25")

    # Thread-pool download. use_pool() also closes and joins the pool, which
    # a pool created inline here would not do.
    use_pool(5, get_html_data, all_a)

    # Executor-based alternative:
    # use_thread_pool(5, get_html_data, all_a)

    # -------------------- Retry failed downloads --------------------
    # Keep retrying for as long as errors.txt receives failed links (unstable).
    # error_urls = get_error_a()
    # while error_urls:
    #     use_pool(5, get_html_data, error_urls)
    #     error_urls = get_error_a()

    # One single retry round:
    # error_urls = get_error_a()
    # use_pool(5, get_html_data, error_urls)

    # -------------------- Verification --------------------
    # Compare the titles on the list page with the names of the local files.
    # html_content comes from option 2 above.
    # get_end_ok(get_html_aAndName(html_content), get_html_filenames("C:\\Users\\23938\\Desktop\\txt\\"))

小白要努力变黑

关注

4
点赞
踩
8

收藏

觉得还不错? 一键收藏
打赏
0
评论
Notion文章简单下载【爬虫单个/批量】——Python+selenium

不同文章的元素截取位置可能不同，BeautifulSoup中的find/findAll截取内容需要根据网页情况进行调整。文章的名称和时间格式也可能不同：可处理的时间格式有 “April 1, 2023” 和 “2023/4/1” 两种，其他格式需要自行修改；文章标题中不能用于文件名的特殊符号由 sanitize_filename() 方法过滤。支持批量下载（使用线程池）。下载失败的文章，链接会保存在errors.txt文件中，读取后重新下载，直到文件中没有失败链接。最后校验文章是否下载完整：用本地文章名称对比网页列表名称。
复制链接

扫一扫