import os
import html2text
import requests
from bs4 import BeautifulSoup
from parsel import Selector
from Novel import headers
def get_all_article_id(csdn):
    """Collect every article id from a CSDN blog's paginated article list.

    Walks '<csdn>/article/list/<page>' starting at page 1 until a page
    yields no article links.

    Args:
        csdn: Base URL of the blog, e.g. 'https://blog.csdn.net/mbh12333'.

    Returns:
        list[str]: all 'data-articleid' attribute values found across pages.
    """
    article_ids = []
    page = 1
    while True:
        content = requests.get('%s/article/list/%s' % (csdn, page),
                               headers=headers).content.decode(errors='ignore')
        # Name the parser explicitly: bare BeautifulSoup(content) emits a
        # GuessedAtParserWarning and may pick different parsers per machine.
        soup = BeautifulSoup(content, 'html.parser')
        links = soup.select('.article-list [data-articleid]')
        if not links:
            # An empty list page means we've walked past the last page.
            break
        for link in links:
            article_ids.append(link.attrs['data-articleid'])
        page += 1
    return article_ids
def get_all_article(csdn):
    """Download every article of the blog at *csdn* as Markdown files.

    Args:
        csdn: Base URL of the blog, e.g. 'https://blog.csdn.net/mbh12333'.
    """
    for article_id in get_all_article_id(csdn):
        # Derive the detail URL from the blog's own base URL. The original
        # hard-coded user 'mbh12333', so batch download silently fetched the
        # wrong blog for any other user.
        article_url = '%s/article/details/%s' % (csdn, article_id)
        get_article(article_url)
def get_article(article_url):
    """Download a single article as Markdown.

    Public entry point; delegates all work to the private downloader.

    Args:
        article_url: Full URL of the article detail page.
    """
    __down_article(article_url)
def __down_article(article_url):
    """Fetch one CSDN article page and save its body as a Markdown file.

    The file is written to './<user_id>/<title>.md'. Existing files are
    left untouched so re-runs skip already-downloaded articles.

    Args:
        article_url: Full URL of the article detail page.
    """
    page_html = requests.get(article_url, headers=headers).content.decode(errors='ignore')
    # Explicit parser avoids GuessedAtParserWarning / per-machine differences.
    soup = BeautifulSoup(page_html, 'html.parser')
    sel = Selector(text=page_html)
    # presumably '#uid' carries the blog owner's user id; verify against the page
    user_id = sel.css("#uid::text").get()
    title = str(soup.select('h1.title-article')[0].string)
    # Strip characters illegal in Windows filenames (and '/' on POSIX) so an
    # article title cannot make open() fail.
    title = ''.join(c for c in title if c not in '\\/:*?"<>|').strip()
    body_html = str(soup.select('#content_views')[0].prettify())
    converter = html2text.HTML2Text()
    markdown = converter.handle(body_html)
    path = './%s' % user_id
    # makedirs(exist_ok=True) is race-free, unlike exists() followed by mkdir().
    os.makedirs(path, exist_ok=True)
    file = '%s/%s.md' % (path, title)
    if not os.path.exists(file):
        with open(file, 'w', encoding='utf-8') as f:
            f.write(markdown)
        print("%s 下载成功!" % file)
    else:
        print("%s 已存在!" % file)
if __name__ == '__main__':
    # Download one article; swap in the call below to batch-download a
    # whole blog instead:
    # get_all_article('https://blog.csdn.net/mbh12333')
    get_article('https://blog.csdn.net/u010842515/article/details/65443084')
# Supports downloading a single blog post, as well as batch-downloading all posts of a user.