Data Acquisition
- Get article links from the recommendation page
  Analyzing the page's network requests shows that the recommended articles come from this API:
  URL: https://cms-api.csdn.net/v1/web_home/select_content?componentIds=www-blog-recommend
- Extract the article URLs from the JSON response (a runnable sketch follows this snippet)

```python
info = request.json()['data']['www-blog-recommend']['info']
urls = [d['extend']['url'] for d in info]
```
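A minimal end-to-end sketch of this step, assuming the endpoint returns the JSON structure shown above; the User-Agent value is a placeholder that you must replace with one copied from your own browser:

```python
import requests

# Fetch the recommendation feed and pull out the article links and author names.
RECOMMEND_URL = ("https://cms-api.csdn.net/v1/web_home/select_content"
                 "?componentIds=www-blog-recommend")
headers = {"User-Agent": "copy your browser's User-Agent here"}  # placeholder

response = requests.get(RECOMMEND_URL, headers=headers)
info = response.json()['data']['www-blog-recommend']['info']
urls = [d['extend']['url'] for d in info]              # article links
usernames = [d['extend']['user_name'] for d in info]   # authors, used later for per-user crawling
print(urls[:3])
print(usernames[:3])
```

The `user_name` field extracted here is what the full script below uses to crawl each recommended author's complete article list.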
- Get the details of each article
  The article page itself can be fetched directly as HTML and parsed for the information we need. You could keep the raw HTML as-is for display; here I only extract the text content. (A short usage example follows the function.)

```python
def get_blog_info(url, header):
    request = requests.get(url=url, headers=header)
    soup = BeautifulSoup(request.text, 'html.parser')
    """
    CSS selectors:
    article body: #mainBox > main > div.blog-content-box
    title:        #articleContentId
    tags:         #mainBox > main > div.blog-content-box > div.article-header-box > div > div.article-info-box > div.blog-tags-box > div > a:nth-child(n+2)
    content:      #content_views
    """
    blog_title_text = soup.select("#articleContentId")[0].text
    blog_label_list = [label.text for label in soup.select(
        "#mainBox > main > div.blog-content-box > div.article-header-box > div > div.article-info-box > div.blog-tags-box > div > a:nth-child(n+2)")]
    blog_content = soup.select("#content_views")[0]  # first matching element
    blog_content_text = blog_content.get_text(strip=True).replace("\n", " ")
    return blog_title_text, blog_label_list, blog_content_text
```
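A quick usage sketch, assuming `requests`, `BeautifulSoup`, and the `get_blog_info` function above are already imported/defined; the article URL is a made-up placeholder, in practice it would come from the `urls` list extracted earlier:

```python
headers = {"User-Agent": "copy your browser's User-Agent here"}  # placeholder
# Hypothetical article URL, purely for illustration; use one from `urls` in practice.
sample_url = "https://blog.csdn.net/example_user/article/details/123456789"

title, labels, content = get_blog_info(sample_url, headers)
print(title)          # article title
print(labels)         # list of tag strings
print(content[:200])  # first 200 characters of the flattened body text
```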
- Since the home-page recommendations repeat a lot, it is worth pulling down every article of each author that shows up (you can analyze the per-user list API yourself; a standalone sketch of that endpoint follows the full code below).
Full code:

```python
import os

import requests
from bs4 import BeautifulSoup


# Read the article URLs that were already saved (on the first run the file may not exist yet)
def get_exist_blog_url(filename):
    exist_url = []
    if not os.path.exists(filename):
        return exist_url
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            exist_url.append(line.strip())
    return exist_url


# Append one line of text to the given file
def save_data(content, file_name):
    with open(file_name, 'a', encoding='utf-8') as file:
        file.write(content + '\n')


# Fetch one article page and extract its title, tags and body text
def get_blog_info(url, header):
    request = requests.get(url=url, headers=header)
    soup = BeautifulSoup(request.text, 'html.parser')
    """
    CSS selectors:
    article body: #mainBox > main > div.blog-content-box
    title:        #articleContentId
    tags:         #mainBox > main > div.blog-content-box > div.article-header-box > div > div.article-info-box > div.blog-tags-box > div > a:nth-child(n+2)
    content:      #content_views
    """
    blog_title_text = soup.select("#articleContentId")[0].text
    blog_label_list = [label.text for label in soup.select(
        "#mainBox > main > div.blog-content-box > div.article-header-box > div > div.article-info-box > div.blog-tags-box > div > a:nth-child(n+2)")]
    blog_content = soup.select("#content_views")[0]  # first matching element
    blog_content_text = blog_content.get_text(strip=True).replace("\n", " ")
    return blog_title_text, blog_label_list, blog_content_text

# Crawl all articles of one user via the paginated list API, skipping URLs that were saved before
def get_user_blog_list(username, header, url_filename, exist_urls):
    page = 1  # pages start at 1, 20 items per page
    content_filename = "content_filename.txt"
    label_filename = "label_filename.txt"
    while True:
        url = f"https://blog.csdn.net/community/home-api/v1/get-business-list?page={page}&size=20&businessType=blog&orderby=&noMore=false&year=&month=&username={username}"
        request = requests.get(url=url, headers=header)
        # This endpoint has an anti-bot check; when it triggers (HTTP 521), add a Referer and retry
        if request.status_code == 521:
            header["Referer"] = url
            request = requests.get(url=url, headers=header)
        # Extract the article URLs on this page
        data_list = request.json()['data']['list']
        tem_urls = [d['url'] for d in data_list]
        # Two stop conditions: no more data, or most of this user's articles were already saved,
        # which strongly suggests this user has been crawled before
        if len(tem_urls) == 0:
            break
        exist = sum(item in tem_urls for item in exist_urls)
        if exist > 10:
            break
        page += 1
        # Save the data of this page
        for blog_url in tem_urls:
            title, label, content = get_blog_info(blog_url, header)
            # Only save articles that have not been saved yet
            if blog_url not in exist_urls:
                # Save the URL
                save_data(blog_url, url_filename)
                # Save the title and tags
                save_data(title + " " + ",".join(label), label_filename)
                # Save the body text
                save_data(content, content_filename)
                exist_urls.append(blog_url)
                print(f"Article '{title}' saved")
            else:
                print(f"Article '{title}' already exists")

if __name__ == '__main__':
    header = {
        "User-Agent": "copy your browser's User-Agent here"
    }
    session = requests.session()
    url_filename = "url_filename.txt"
    url = "https://cms-api.csdn.net/v1/web_home/select_content?componentIds=www-blog-recommend"
    exist_username = []  # authors that were already processed; skip them when they show up again
    while True:
        # Fetch the recommendation feed
        request = session.get(url=url, headers=header)
        info = request.json()['data']['www-blog-recommend']['info']
        # The recommendation feed repeats a lot, so crawl every article of each recommended author instead
        usernames = [d['extend']['user_name'] for d in info]
        for username in usernames:
            if username not in exist_username:
                get_user_blog_list(username, header, url_filename, exist_urls=get_exist_blog_url(url_filename))
                exist_username.append(username)  # mark the author as processed
```
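For reference, here is a minimal standalone sketch of the per-user list endpoint used by `get_user_blog_list` above. The query parameters and the `data.list[].url` field are taken from the script itself; the User-Agent is a placeholder and the username is a made-up example, and the 521/Referer retry mirrors the workaround in the main code:

```python
import requests

headers = {"User-Agent": "copy your browser's User-Agent here"}  # placeholder
username = "example_csdn_user"   # hypothetical author name, for illustration only
page = 1

url = (f"https://blog.csdn.net/community/home-api/v1/get-business-list"
       f"?page={page}&size=20&businessType=blog&orderby=&noMore=false"
       f"&year=&month=&username={username}")

response = requests.get(url, headers=headers)
if response.status_code == 521:       # anti-bot check: add a Referer and retry, as in the main script
    headers["Referer"] = url
    response = requests.get(url, headers=headers)

blog_list = response.json()['data']['list']
print([d['url'] for d in blog_list])  # one page (up to 20) of the author's article URLs
```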
Remember to replace the User-Agent with your own.
Last updated: 2024-03-24