1. 代码
代码
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# 导包
import numpy as np
import pandas as pd
import requests
import re
import math
# 得到数量和页数
def get_number(blog_name):
    """Return the article total and the first-page article count of a CSDN blog.

    Downloads ``https://blog.csdn.net/{blog_name}``, collapses its whitespace
    and extracts two numbers with regular expressions:

    * the total shown in the ``博客(N)`` header tab, and
    * how many article entries are listed on that page (the caller divides
      the total by this value to work out how many list pages to fetch).

    Args:
        blog_name: The CSDN user name, as it appears in the blog URL.

    Returns:
        A ``(total_articles, articles_on_page)`` tuple of ints.
        ``articles_on_page`` is 0 when no article entry matched.

    Raises:
        IndexError: If the ``博客(N)`` header marker is not found in the page
            (same exception type the previous ``findall(...)[0]`` raised).
    """
    # Spoof a browser User-Agent so the request is not rejected by UA checks.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0"
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(
        url=f"https://blog.csdn.net/{blog_name}", headers=headers, timeout=30
    )

    # Normalise whitespace so the patterns below can rely on single spaces.
    content = re.sub(r"[\r\n\t]", "", response.text)
    content = re.sub(r" +", " ", content)

    # Capture the number inside "博客(N)" directly; taking the first digit run
    # of the whole snippet would return the data-type attribute instead.
    total_match = re.search(
        r'id="container-header-blog" data-type="\d{1,9}"><span>博客\((\d{1,9})\)</span></li>',
        content,
    )
    if total_match is None:
        raise IndexError(
            f"could not find the article count marker for blog {blog_name!r}"
        )

    # One match per article entry listed on this page.
    entry_re = re.compile(
        r'<h4 class=""> <a href="https://blog\.csdn\.net/'
        + re.escape(blog_name)
        + r'/article/details/\d{1,20}" target="_blank"> '
        + r'<span class="article-type type-\d float-none">'
    )
    entries = entry_re.findall(content)

    return int(total_match.group(1)), len(entries)
# 函数
def getCSDN(blog_name, path):
    """Scrape every article link and title of a CSDN blog into a CSV file.

    Asks ``get_number`` for the total article count and the per-page count,
    walks ``https://blog.csdn.net/{blog_name}/article/list/{page}`` for each
    page, extracts (link, title) pairs with a regular expression and writes
    them to ``path`` with the columns ``链接`` and ``文章``.

    Args:
        blog_name: The CSDN user name, as it appears in the blog URL.
        path: Destination of the CSV file. When it is a filesystem path,
            missing parent directories are created.

    Returns:
        None. Each extracted row is printed and the result is written to
        ``path`` encoded as ``utf_8_sig``.
    """
    import os  # local import: only needed to create the output directory

    columns = ["链接", "文章"]

    blog_number, blog_pages = get_number(blog_name)
    # No article matched on the first page -> nothing to paginate; this also
    # avoids dividing by zero.
    page_count = math.ceil(blog_number / blog_pages) if blog_pages else 0

    # Spoof a browser User-Agent so the requests are not rejected by UA checks.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0"
    }

    # Characters accepted inside an article title.
    title_chars = r'[\w\"\'\\\[\]\- ~!@#$%^&*()_+!¥…()—=`·{}:|;:“”;‘’、?,./《》?,。]+'
    # Group 1 is the article URL, group 2 the title. Compiled once, outside
    # the page loop.
    article_re = re.compile(
        "".join(
            [
                r"(https://blog\.csdn\.net/",
                re.escape(blog_name),
                r"/article/details/\d{1,20})",
                r'" target="_blank"> <span class="article-type type-\d float-none">\w{0,3}</span> ',
                "(" + title_chars + ")",
            ]
        )
    )

    rows = []
    for page in range(1, page_count + 1):
        # A timeout keeps the script from hanging forever on a stalled connection.
        response = requests.get(
            url=f"https://blog.csdn.net/{blog_name}/article/list/{page}",
            headers=headers,
            timeout=30,
        )
        # Normalise whitespace so the pattern can rely on single spaces.
        content = re.sub(r"[\r\n\t]", "", response.text)
        content = re.sub(r" +", " ", content)

        for link, raw_title in article_re.findall(content):
            # The title class allows spaces, so trim the trailing ones.
            title = raw_title.rstrip()
            print(f"{link}+++++++{title}")
            rows.append([link, title])

    # Build the frame in one go instead of appending row by row.
    df = pd.DataFrame(rows, columns=columns)

    # Create the target directory if needed so to_csv does not fail on a
    # fresh checkout; skipped for non-path targets such as file objects.
    if isinstance(path, (str, os.PathLike)):
        parent = os.path.dirname(os.fspath(path))
        if parent:
            os.makedirs(parent, exist_ok=True)

    # utf_8_sig adds a BOM so spreadsheet tools display the Chinese text correctly.
    df.to_csv(path, encoding='utf_8_sig', index=False)
if __name__ == "__main__":
    # Script entry point: scrape the "YKenan" blog into a CSV file.
    getCSDN(blog_name="YKenan", path="./data/csdn/Ykenan.csv")
结果
2. 测试
比如: 我关注的博客: yuzhenling
if __name__ == "__main__":
    # Test run: scrape the "yuzhenling" blog into its own CSV file.
    getCSDN(blog_name="yuzhenling", path="./data/csdn/yuzhenling.csv")
结果