用 python 爬虫得到 csdn 博客的文章链接和标题

最新推荐文章于 2024-03-22 07:50:50 发布

YKenan

最新推荐文章于 2024-03-22 07:50:50 发布

阅读量308

点赞数 1

分类专栏： python 文章标签： python 爬虫

本文链接：https://blog.csdn.net/YKenan/article/details/99973451

版权

python 专栏收录该内容

27 篇文章 6 订阅

订阅专栏

用 python 爬虫得到 csdn 博客的文章链接和标题

- 1. 代码
- 2. 测试

1. 代码

代码

#!/usr/bin/env python
# -*- coding: UTF-8 -*-

# 导包
import numpy as np
import pandas as pd
import requests
import re
import math


# 得到文章总数和每页的文章数
def get_number(blog_name):
    """Fetch a CSDN blog's home page and read its pagination figures.

    Args:
        blog_name: CSDN user name; used as the path segment of
            ``https://blog.csdn.net/<blog_name>``.

    Returns:
        A ``(total, per_page)`` tuple of ints. ``total`` is the number shown
        in the ``博客(N)`` header tab; ``per_page`` is how many article links
        matched on the home page (0 when none matched).

    Raises:
        ValueError: The ``博客(N)`` header tab was not found in the page.
        requests.HTTPError: The server answered with an error status.
    """
    # Send a desktop-browser User-Agent (the original author's note: UA
    # disguise against the site's UA check).
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0"
    }
    response = requests.get(
        url=f"https://blog.csdn.net/{blog_name}", headers=headers, timeout=10
    )
    # Fail loudly on 4xx/5xx instead of trying to parse an error page.
    response.raise_for_status()

    # Drop \r \n \t and collapse runs of spaces so the single-line patterns
    # below can match markup that was spread over several lines.
    content = re.sub(r"[\r\n\t]", "", response.text)
    content = re.sub(r" +", " ", content)

    # Capture the number inside "博客(N)". Taking the first digit run of the
    # whole match would return the data-type attribute value instead.
    count_match = re.search(
        r'id="container-header-blog" data-type="\d{1,9}"><span>博客\((\d{1,9})\)</span></li>',
        content,
    )
    if count_match is None:
        raise ValueError(
            f"could not find the article count on the home page of blog {blog_name!r}"
        )

    # One match per article listed on the home page; the caller uses this as
    # the page size. blog_name is escaped so it is always matched literally.
    link_re = re.compile(
        r'<h4 class=""> <a href="https://blog\.csdn\.net/'
        + re.escape(blog_name)
        + r'/article/details/\d{1,20}" target="_blank"> '
        r'<span class="article-type type-\d float-none">'
    )
    per_page = len(link_re.findall(content))

    return int(count_match.group(1)), per_page


# 爬取博客所有文章的链接和标题, 并保存为 CSV 文件
def getCSDN(blog_name, path):
    """Crawl a CSDN blog's article-list pages and save links and titles as CSV.

    The number of list pages is derived from ``get_number(blog_name)`` as
    ``ceil(total / per_page)``. Each page
    ``https://blog.csdn.net/<blog_name>/article/list/<page>`` is fetched and
    scanned for article entries.

    Args:
        blog_name: CSDN user name of the blog to crawl.
        path: Destination passed to ``DataFrame.to_csv``. When it is a
            filesystem path, missing parent directories are created.

    Returns:
        None. The result is written to ``path`` with the columns ``链接``
        (link) and ``文章`` (title).

    Raises:
        requests.HTTPError: A list page answered with an error status.
    """
    import os  # local import so this function does not depend on new file-level imports

    columns = ["链接", "文章"]
    blog_number, blog_pages = get_number(blog_name)

    # With no articles, or no article links detected on the home page, there
    # is no page size to divide by; crawl nothing rather than raise
    # ZeroDivisionError. A header-only CSV is still written below.
    if blog_number and blog_pages:
        page_count = math.ceil(blog_number / blog_pages)
    else:
        page_count = 0

    # Send a desktop-browser User-Agent (the original author's note: UA
    # disguise against the site's UA check).
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:82.0) Gecko/20100101 Firefox/82.0"
    }

    # Compiled once for all pages. Group 1 is the article URL, group 2 the
    # title. Capturing them directly avoids a "+++++++" placeholder split,
    # which would break on titles containing that run ('+' is a title
    # character). blog_name is escaped so it is always matched literally.
    entry_re = re.compile(
        r'(https://blog\.csdn\.net/' + re.escape(blog_name) + r'/article/details/\d{1,20})'
        r'" target="_blank"> <span class="article-type type-\d float-none">\w{0,3}</span> '
        r'([\w"\'\\\[\]\- ~!@#$%^&*()_+！￥…（）—=`·{}:|;：“”；‘’、?,./《》？，。]+)'
    )

    rows = []
    for page in range(1, page_count + 1):
        response = requests.get(
            url=f"https://blog.csdn.net/{blog_name}/article/list/{page}",
            headers=headers,
            timeout=10,
        )
        # Fail loudly on 4xx/5xx instead of silently producing a partial CSV.
        response.raise_for_status()

        # Drop \r \n \t and collapse runs of spaces so the single-line
        # pattern can match markup that was spread over several lines.
        content = re.sub(r"[\r\n\t]", "", response.text)
        content = re.sub(r" +", " ", content)

        for match in entry_re.finditer(content):
            link = match.group(1)
            # The title character class includes the space, so trim what
            # trails the title.
            title = match.group(2).rstrip()
            # Progress output, in the same "link+++++++title" form as before.
            print(f"{link}+++++++{title}")
            rows.append([link, title])

    # Build the frame in one go; appending row by row with df.loc is quadratic.
    df = pd.DataFrame(rows, columns=columns)

    # to_csv does not create missing directories.
    if isinstance(path, (str, os.PathLike)):
        directory = os.path.dirname(os.fspath(path))
        if directory:
            os.makedirs(directory, exist_ok=True)

    # utf_8_sig writes a BOM; the original author notes this prevents garbled
    # Chinese text when the file is opened.
    df.to_csv(path, encoding='utf_8_sig', index=False)


if __name__ == "__main__":
    # Script entry point: crawl the author's own blog and store the result as CSV.
    getCSDN(blog_name="YKenan", path="./data/csdn/Ykenan.csv")

结果

在这里插入图片描述

2. 测试

例如, 爬取我关注的博客 yuzhenling:

if __name__ == "__main__":
    # Test run: crawl another blog (yuzhenling) and store the result as CSV.
    getCSDN(blog_name="yuzhenling", path="./data/csdn/yuzhenling.csv")

结果

在这里插入图片描述

YKenan

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

专栏目录

用 python 爬虫 得到 csdn 博客的文章链接和标题

用 python 爬虫 得到 csdn 博客的文章链接和标题

1. 代码

2. 测试

用 python 爬虫得到 csdn 博客的文章链接和标题

用 python 爬虫得到 csdn 博客的文章链接和标题