需求
Python爬取某个账号CSDN博客所有文章的标题,类型,创建时间,阅读数量,并将结果保存至Excel。
分析
CSDN主页URL为:https://blog.csdn.net/seanyang_/article/list/1
根据url可以得到其他页数的链接在https://blog.csdn.net/seanyang_/article/list/页数
主页F12查看元素,可以看到每一个文章列表所在class为article-list
每一篇文章所在class为article-item-box,如图可以看到href、文章标题、创建时间、文章阅读数
Requests获取内容
Requests发送请求,获取网页内容,这里注意需要加上请求头。
# Browser-like request headers — CSDN rejects requests without a User-Agent.
# NOTE(review): the two string pieces are concatenated with no space before
# "(KHTML", yielding "...537.36(KHTML..."; the full code below adds the space.
self.headers = {"user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36"
                              "(KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
                }
# Fetch one listing page; the page number is appended to the base URL.
response = requests.get(f'{self.url}/{page_num}', headers=self.headers)
# Use the encoding detected from the body instead of the header default,
# so Chinese text decodes correctly.
response.encoding = response.apparent_encoding
# Raise an HTTPError for 4xx/5xx responses instead of parsing an error page.
response.raise_for_status()
content = response.content  # raw bytes handed to BeautifulSoup below
BeautifulSoup解析内容
BeautifulSoup根据标签的属性和属性值获取到想要的内容。
# Parse the fetched HTML with the stdlib html.parser backend.
soup = BeautifulSoup(content, 'html.parser')
article_table = soup.find('div', class_='article-list')  # container of the article list
for article in article_table.find_all('div', class_='article-item-box'):  # one box per article
    article_type = article.a.span.text  # drill down tag by tag to the article type
    article_url = article.a["href"]
    article_span = article.a.find_all('span')
    for i in article_span:
        i.clear()  # empty every <span> so get_text() below returns only the title
    article_name = article.a.get_text().strip()  # article title
    create_date = article.find('div', class_='info-box').find('span', class_='date').text.strip()  # creation date
    # View count; the page shows no comment count, so it is reused as a placeholder.
    read_num = article.find('div', class_='info-box').find('span', class_='read-num').text
    comment_num = read_num
    print(article_type, article_name, article_url, create_date, read_num, comment_num)  # show the extracted fields
结果写入CSV文件
# Write the collected rows to a CSV file.
# NOTE(review): open() should also pass newline='' for the csv module,
# otherwise blank rows appear between records on Windows.
with open('csdn_crawler.csv', 'w', encoding='utf-8') as f:
    write = csv.DictWriter(f, fieldnames=['type', 'href', 'name', 'date', 'view_num'])  # header-row column names
    write.writeheader()
    write.writerows(result_list)  # write every dict in the list as one CSV row
运行结果
完整代码
"""Crawl a CSDN blog's article list (type, URL, title, date, view count)
and save the results to a CSV file."""
import csv

import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent: CSDN rejects requests without one.
HEADERS = {"user-agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 "
                         "(KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
           }
BASE_URL = "https://blog.csdn.net/seanyang_/article/list"


def scrape_articles(base_url=BASE_URL, max_pages=99):
    """Fetch listing pages until an empty one is found.

    Returns a list of dicts with keys 'type', 'href', 'name', 'date',
    'view_num'.
    """
    results = []
    for page_num in range(1, max_pages + 1):
        response = requests.get(f'{base_url}/{page_num}', headers=HEADERS)
        response.raise_for_status()  # fail fast on 4xx/5xx instead of parsing an error page
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.content, 'html.parser')
        article_table = soup.find('div', class_='article-list')
        if article_table is None:  # past the last page: stop paginating
            break
        # Grab each article box first, then pull the details out of it.
        for article in article_table.find_all('div', class_='article-item-box'):
            result = {}
            result['type'] = article.a.span.text
            result['href'] = article.a["href"]
            # Empty every <span> inside the link so get_text() yields only the title.
            for span in article.a.find_all('span'):
                span.clear()
            result['name'] = article.a.get_text().strip()
            info_box = article.find('div', class_='info-box')  # look up once, reuse below
            result['date'] = info_box.find('span', class_='date').text.strip()
            result['view_num'] = info_box.find('span', class_='read-num').text
            results.append(result)
    return results


def save_csv(rows, path='csdn_crawler.csv'):
    """Write the scraped rows to *path* as CSV with a header line.

    newline='' is required by the csv module to avoid blank rows on Windows.
    """
    with open(path, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['type', 'href', 'name', 'date', 'view_num'])
        writer.writeheader()
        writer.writerows(rows)


if __name__ == "__main__":
    articles = scrape_articles()
    print(articles)
    save_csv(articles)