# CSDN crawler
# 1. Required modules
# 2. Fetch the data
# 3. Parse the data
# 4. Write to a CSV file
# Fields: article ID, article URL, summary, username, likes, views, comments
# -*- coding: UTF-8 -*-
import requests
from bs4 import BeautifulSoup
import json
import time
import csv
""""""
"""
文章的ID 文章的链接 文章的摘要 用户名 点赞数 浏览数 评论数
"""
# --- Fetch the data ---
def get_html(url, headers, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Target URL.
        headers: Request headers (user-agent / cookie / referer).
        timeout: Seconds to wait for the server before giving up.
            Without a timeout, requests.get() can block forever.

    Returns:
        The response text on HTTP 200, otherwise None.
    """
    try:
        # Send the request with a bounded wait.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Network failure / timeout: treat the same as a bad response.
        return None
    # Only a 200 response carries a usable body.
    if response.status_code == 200:
        return response.text
    return None
# --- Parse the data ---
def parse_html1(json_data):
    """Parse the CSDN article-feed JSON payload.

    Args:
        json_data: Raw JSON string as returned by the articles API.

    Returns:
        A ``(rows, shown_offset)`` tuple where ``rows`` is a list of
        ``[id, title, url, desc, nickname, user_name, views, digg,
        comments, created_at]`` lists and ``shown_offset`` is the
        pagination cursor to feed into the next request.
    """
    payload = json.loads(json_data)
    # Pagination cursor; default to '' so a malformed payload does not
    # raise KeyError (the original indexed it directly).
    shown_offset = payload.get("shown_offset", "")
    rows = []
    for art in payload.get("articles", []):
        rows.append([
            art.get('id'),              # article ID
            art.get('title'),           # article title
            art.get('url'),             # article URL
            art.get('desc', '无'),      # summary (placeholder if absent)
            art.get('nickname'),        # display nickname
            art.get('user_name'),       # account name
            art.get('views'),           # view count
            art.get('digg', 0),         # like count
            art.get('comments', 0),     # comment count
            art.get('created_at'),      # creation timestamp
        ])
    return rows, shown_offset
# --- Write to the CSV file ---
def save_csv(data):
    """Append *data* rows to 'CSDN数据.csv'.

    Args:
        data: Iterable of row lists as produced by ``parse_html1``.
    """
    # 'a' keeps rows from earlier pages; newline='' prevents the csv
    # module from emitting blank lines on Windows.
    with open('CSDN数据.csv', 'a', newline='', encoding='utf-8') as f:
        # writerows replaces the original per-row loop in one call.
        csv.writer(f).writerows(data)
if __name__ == "__main__":
shown_offset = str(time.time()).replace('.', '')
print(shown_offset)
for i in range(10):
# 获取网页
# 准备url
url = f'https://www.csdn.net/api/articles?type=more&category=python&shown_offset={shown_offset}'
# 请求头header - ua cookie referer
headers = {
'user-agent': "U A",
'cookie': "cookie",
"referer":"referer"
}
json_data = get_html(url, headers)
data, shown_offset = parse_html1(json_data)
save_csv(data)