【python】将博客文章保存为markdown

骑树的猪

已于 2023-04-25 17:20:01 修改

阅读量1.7k

点赞数 3

文章标签： python 爬虫 html

于 2023-04-09 15:42:23 首次发布

本文链接：https://blog.csdn.net/m0_51499090/article/details/130042463

版权

爬取博客文章保存为md
（纯净版：去除多余内容，支持多平台，如 CSDN、简书、知乎等）

在这里插入图片描述

Method one:

思路：爬取目标html后利用html2text模块转化成markdown

依赖

使用pip进行安装

pip install html2text
pip install lxml
pip install requests
pip install beautifulsoup4

过程

爬取html

def get_html(url, timeout=10):
    """Download a page and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with a status other than 200.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36",
    }
    # A browser-like User-Agent is sent to get past basic anti-crawler checks.
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        print("failed!")
        # There is no HTML to hand back, so fail loudly here instead of
        # reaching the return statement with nothing assigned.
        raise requests.HTTPError(
            f"unexpected status {response.status_code} for {url}",
            response=response,
        )
    html = response.content.decode("utf8")
    print("get html success!")
    return html

去除文章多余内容，只留下正文

def remove(html):
    """Strip everything but the article body from a page.

    Parses *html* with BeautifulSoup using the ``lxml`` parser and returns
    the result of selecting every ``article`` element.
    """
    parsed = BeautifulSoup(html, 'lxml')
    return parsed.select('article')

创建并保存markdown文件

def save(block, title):
    """Convert the first item of *block* to Markdown and write it to disk.

    The output goes to ``output/markdown/<title>.md`` relative to the
    current working directory.

    Args:
        block: Sequence whose first item holds the HTML to convert; the item
            is passed through ``str()`` before conversion.
        title: File name (without extension) for the Markdown file.

    Raises:
        IndexError: If *block* is empty.
    """
    # makedirs creates both levels at once and, with exist_ok, also covers
    # the case where "output" already exists but "output/markdown" does not.
    os.makedirs("output/markdown", exist_ok=True)

    # Convert before opening the file so that a failed conversion does not
    # leave an empty .md file behind.
    text_maker = HTML2Text()
    md_text = text_maker.handle(str(block[0]))

    with open(f"output/markdown/{title}.md", 'w', encoding='utf8') as md_file:
        md_file.write(md_text)

解析标题，调用上述函数完成爬取

def crawl(html):
    """Extract the article title from *html* and save the article as Markdown.

    Args:
        html: Full HTML source of the article page.

    Raises:
        IndexError: If no element with id ``articleContentId`` has text.
    """
    tree = etree.HTML(html)
    # The title is the first text node of the element with id "articleContentId".
    raw_title = tree.xpath('//*[@id="articleContentId"]/text()')[0]
    # Raw string: "\/" is not a valid escape in a normal string literal.
    # The class also lists the backslash itself, so every character that is
    # illegal in a Windows file name is replaced with "-".
    title = re.sub(r'[\\/:*?"<>|]', '-', raw_title)
    print("title:", title)
    block = remove(html)
    save(block, title)
    print("finish!")

完整代码

from html2text import HTML2Text
from bs4 import BeautifulSoup
from lxml import etree
import requests
import os
import re


def remove(html):
    """Strip everything but the article body from a page.

    Parses *html* with BeautifulSoup using the ``lxml`` parser and returns
    the result of selecting every ``article`` element.
    """
    parsed = BeautifulSoup(html, 'lxml')
    return parsed.select('article')

def get_html(url, timeout=10):
    """Download a page and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with a status other than 200.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36",
    }
    # A browser-like User-Agent is sent to get past basic anti-crawler checks.
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        print("failed!")
        # There is no HTML to hand back, so fail loudly here instead of
        # reaching the return statement with nothing assigned.
        raise requests.HTTPError(
            f"unexpected status {response.status_code} for {url}",
            response=response,
        )
    html = response.content.decode("utf8")
    print("get html success!")
    return html



def crawl(html):
    """Extract the article title from *html* and save the article as Markdown.

    Args:
        html: Full HTML source of the article page.

    Raises:
        IndexError: If no element with id ``articleContentId`` has text.
    """
    tree = etree.HTML(html)
    # The title is the first text node of the element with id "articleContentId".
    raw_title = tree.xpath('//*[@id="articleContentId"]/text()')[0]
    # Raw string: "\/" is not a valid escape in a normal string literal.
    # The class also lists the backslash itself, so every character that is
    # illegal in a Windows file name is replaced with "-".
    title = re.sub(r'[\\/:*?"<>|]', '-', raw_title)
    print("title:", title)
    block = remove(html)
    save(block, title)
    print("finish!")



def save(block, title):
    """Convert the first item of *block* to Markdown and write it to disk.

    The output goes to ``output/markdown/<title>.md`` relative to the
    current working directory.

    Args:
        block: Sequence whose first item holds the HTML to convert; the item
            is passed through ``str()`` before conversion.
        title: File name (without extension) for the Markdown file.

    Raises:
        IndexError: If *block* is empty.
    """
    # makedirs creates both levels at once and, with exist_ok, also covers
    # the case where "output" already exists but "output/markdown" does not.
    os.makedirs("output/markdown", exist_ok=True)

    # Convert before opening the file so that a failed conversion does not
    # leave an empty .md file behind.
    text_maker = HTML2Text()
    md_text = text_maker.handle(str(block[0]))

    with open(f"output/markdown/{title}.md", 'w', encoding='utf8') as md_file:
        md_file.write(md_text)


if __name__ == '__main__':
    # Single-article mode: ask for the URL of the article to crawl,
    # download the page, then convert it and save it.
    url = input("输入目标url:")
    page = get_html(url)
    crawl(page)

缺点：html2text模块转化成markdown会出现少量失真，需人工调整。

Method two:

思路：调用在线工具 devtool.tech 提供的「HTML/URL To Markdown」接口完成转换

完整代码

from lxml import etree
import requests
import os
import re


def get_html(url, timeout=10):
    """Download a page and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with a status other than 200.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36",
    }
    # A browser-like User-Agent is sent to get past basic anti-crawler checks.
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        print("failed!")
        # There is no HTML to hand back, so fail loudly here instead of
        # reaching the return statement with nothing assigned.
        raise requests.HTTPError(
            f"unexpected status {response.status_code} for {url}",
            response=response,
        )
    html = response.content.decode("utf8")
    print("get html success!")
    return html


def save(html, md):
    """Write Markdown text to a file named after the article title in *html*.

    The output goes to ``output/markdown/<title>.md`` relative to the
    current working directory.

    Args:
        html: Full HTML source of the article page; only used to read the title.
        md: Markdown text to write.

    Raises:
        IndexError: If no element with id ``articleContentId`` has text.
    """
    tree = etree.HTML(html)
    # The title is the first text node of the element with id "articleContentId".
    raw_title = tree.xpath('//*[@id="articleContentId"]/text()')[0]
    # Raw string: "\/" is not a valid escape in a normal string literal.
    # The class also lists the backslash itself, so every character that is
    # illegal in a Windows file name is replaced with "-".
    title = re.sub(r'[\\/:*?"<>|]', '-', raw_title)

    # makedirs creates the parent "output" directory as well; a plain
    # mkdir("output/markdown") fails when "output" does not exist yet.
    os.makedirs("output/markdown", exist_ok=True)

    with open(f"output/markdown/{title}.md", 'w', encoding='utf8') as md_file:
        md_file.write(md)

def crawl(objurl):
    """Convert the page at *objurl* to Markdown via devtool.tech and save it.

    The target URL is posted to the ``html-md`` endpoint, the ``markdown``
    field of the JSON reply is taken as the converted text, and the page is
    then fetched directly so that ``save`` can read its title.

    Args:
        objurl: URL of the article to convert.

    Raises:
        requests.HTTPError: If the conversion endpoint answers with a status
            other than 200.
        KeyError: If the reply has no ``markdown`` field.
    """
    url = "https://devtool.tech/api/html-md"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36",
    }
    # Request body: the endpoint receives the target URL as a form field.
    data = {"url": objurl}
    r = requests.post(url, headers=headers, data=data, timeout=30)

    # The status code is the success signal here; `r.text` is always a
    # string, so testing it against None can never detect a failure.
    if r.status_code != 200:
        print("failed!")
        raise requests.HTTPError(
            f"unexpected status {r.status_code} from {url}",
            response=r,
        )
    print("get markdown success!")

    # SECURITY: the body comes from a remote server, so it is parsed as JSON
    # and never passed to eval(), which would execute whatever it contains
    # and also rejects valid JSON literals such as true/false/null.
    md = r.json()['markdown']

    html = get_html(objurl)
    save(html, md)



if __name__ == '__main__':
    # Ask for the article URL and hand it straight to the converter.
    crawl(input("输入目标url:"))