import os
import html2text
import requests
from bs4 import BeautifulSoup
from parsel import Selector
from Novel import headers
def get_all_article_id(csdn):
    """Collect every article id from a CSDN blog's paginated article list.

    Walks '<csdn>/article/list/<page>' starting at page 1 until a page
    yields no article links.

    Args:
        csdn: Base URL of the blog, e.g. 'https://blog.csdn.net/mbh12333'.

    Returns:
        list[str]: all 'data-articleid' attribute values found across pages.
    """
    article_ids = []
    page = 1
    while True:
        content = requests.get('%s/article/list/%s' % (csdn, page),
                               headers=headers).content.decode(errors='ignore')
        # Name the parser explicitly: bare BeautifulSoup(content) emits a
        # GuessedAtParserWarning and may pick different parsers per machine.
        soup = BeautifulSoup(content, 'html.parser')
        links = soup.select('.article-list [data-articleid]')
        if not links:
            # An empty list page means we've walked past the last page.
            break
        for link in links:
            article_ids.append(link.attrs['data-articleid'])
        page += 1
    return article_ids
def get_all_article(csdn):
    """Download every article of the blog at *csdn* as Markdown files.

    Args:
        csdn: Base URL of the blog, e.g. 'https://blog.csdn.net/mbh12333'.
    """
    for article_id in get_all_article_id(csdn):
        # Derive the detail URL from the blog's own base URL. The original
        # hard-coded user 'mbh12333', so batch download silently fetched the
        # wrong blog for any other user.
        article_url = '%s/article/details/%s' % (csdn, article_id)
        get_article(article_url)
def get_article(article_url):
    """Download a single article as Markdown.

    Public entry point; delegates all work to the private downloader.

    Args:
        article_url: Full URL of the article detail page.
    """
    __down_article(article_url)
def __down_article(article_url):
    """Fetch one CSDN article page and save its body as a Markdown file.

    The file is written to './<user_id>/<title>.md'. Existing files are
    left untouched so re-runs skip already-downloaded articles.

    Args:
        article_url: Full URL of the article detail page.
    """
    page_html = requests.get(article_url, headers=headers).content.decode(errors='ignore')
    # Explicit parser avoids GuessedAtParserWarning / per-machine differences.
    soup = BeautifulSoup(page_html, 'html.parser')
    sel = Selector(text=page_html)
    # presumably '#uid' carries the blog owner's user id; verify against the page
    user_id = sel.css("#uid::text").get()
    title = str(soup.select('h1.title-article')[0].string)
    # Strip characters illegal in Windows filenames (and '/' on POSIX) so an
    # article title cannot make open() fail.
    title = ''.join(c for c in title if c not in '\\/:*?"<>|').strip()
    body_html = str(soup.select('#content_views')[0].prettify())
    converter = html2text.HTML2Text()
    markdown = converter.handle(body_html)
    path = './%s' % user_id
    # makedirs(exist_ok=True) is race-free, unlike exists() followed by mkdir().
    os.makedirs(path, exist_ok=True)
    file = '%s/%s.md' % (path, title)
    if not os.path.exists(file):
        with open(file, 'w', encoding='utf-8') as f:
            f.write(markdown)
        print("%s 下载成功!" % file)
    else:
        print("%s 已存在!" % file)
if __name__ == '__main__':
    # Download one article; swap in the call below to batch-download a
    # whole blog instead:
    # get_all_article('https://blog.csdn.net/mbh12333')
    get_article('https://blog.csdn.net/u010842515/article/details/65443084')
# Supports downloading a single blog post, as well as batch-downloading all posts of a user.