Following 擦哥 and 擦姐's series of articles, one step at a time. It has been a while since I last read their posts... they really are well written~
This is just a simple scraping job, and it bumps my original-article count by one.
- Visit the blogger's homepage
- Scrape the article list and save it to Excel, with each article title linked directly to its URL
In short, it simply turns the content of the screenshot into an Excel sheet, haha.
OK, here is the picture.
- Get the article-list URLs from 擦姐's blog homepage:
https://dream.blog.csdn.net/article/list/1 (for page one the '/article/list/1' suffix is optional)
https://dream.blog.csdn.net/article/list/2
https://dream.blog.csdn.net/article/list/3
https://dream.blog.csdn.net/article/list/4 - clearly the only thing that changes is the page number at the end (see the quick sketch after this list)
- Then press F12 and work out which nodes to grab from the page.
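A minimal sketch of how those list-page URLs can be generated from the pattern above (blog_url and pages are just the example values used later in the script):

blog_url = 'https://dream.blog.csdn.net'
pages = 4
# one URL per list page: .../article/list/1 through .../article/list/4
page_urls = [f'{blog_url}/article/list/{i}' for i in range(1, pages + 1)]
for u in page_urls:
    print(u)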
Straight to the source code; it is all quite simple, nothing fancy to see.
import sys, os
# make the sibling util/excel modules importable no matter where the script is run from
__dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__)
sys.path.append(os.path.abspath(os.path.join(__dir__, '../common')))
# print(__dir__)
from util import get_html
from excel import excel
# scrape a CSDN blogger's article list and save it
import time
from bs4 import BeautifulSoup
# parse the article list
def get_ats(html, ats):
    """
    :param html: page source
    :param ats: rows that will be exported to Excel
    """
    soup = BeautifulSoup(html, 'html.parser')
    box = soup.select('div .article-item-box')
    if box:
        for i in range(len(box)):
            item = box[i]
            at = []
            # title
            a = item.select('a')[0]
            # drop \t\n and swap the ASCII comma for a full-width one,
            # so the title cannot break the CSV column split
            title = a.text.strip().replace('\n', '').replace(',', ',')
            # the first token is the "原创" badge, the rest is the title itself
            titles = title.split(maxsplit=1)
            at.append(titles[0])
            at.append(titles[1])
            at.append(a['href'])
            # print(titles)
            ps = item.select('p')
            # article summary, date, reads, comments
            for k in range(len(ps)):
                if k == 1:
                    for j in ps[k].select('span'):
                        at.append(j.text)
                else:
                    at.append(ps[k].text.strip().replace(',', ','))
            # print(at)
            ats.append(at)
    return ats
# save as csv
def wrt(ats):
    name = str(round(time.time()))
    os.makedirs('./files', exist_ok=True)  # make sure the output folder exists
    with open(f'./files/{name}.csv', 'a+', encoding='utf-8') as f:
        for i in range(len(ats)):
            f.write(','.join(ats[i]))
            f.write('\n')
            # print(','.join(ats[i]))
if __name__ == '__main__':
    # header row of the export
    ats = [['创作','标题','链接','文章头','时间','阅读','评论']]
    # blogger homepage; 擦姐 has a custom "dream" domain, a regular account is simply https://blog.csdn.net/博主名称
    blog_url = 'https://dream.blog.csdn.net'
    # blog_url = 'https://blog.csdn.net/博主名称'
    # check the blogger's homepage yourself for the number of list pages
    pages = 2
    # fetch each page in turn
    for i in range(1, pages+1):
        url = f'{blog_url}/article/list/{i}'
        print(f'crawling... {url}')
        html = get_html(url)
        ats = get_ats(html, ats)
    # save as csv; if you then open the csv with Excel, open it in Notepad first and
    # re-save it as ANSI, otherwise the Chinese text is garbled
    # wrt(ats)
    # save as Excel
    ex = excel()
    # with_url: index of the column whose content becomes a hyperlink; in every ats row
    # the URL must sit in the column immediately after that one
    ex.write_row(ats, with_url=1)
    # ex.wbac.cell(row=1, column=1).value = '=HYPERLINK("{}", "{}")'.format('https://www.baidu.com', "Link Name")
    ex.save()
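For reference, a sketch of what one data row in ats looks like and what write_row turns the title cell into; the values below are made-up placeholders, not real scraped data:

# placeholder row matching the header ['创作','标题','链接','文章头','时间','阅读','评论']
row = ['原创', '某篇文章标题', 'https://dream.blog.csdn.net/article/details/123456', '文章摘要', '2021-10-01', '阅读 100', '评论 2']
# with with_url=1, the title cell is rewritten to an Excel formula such as
# =HYPERLINK("https://dream.blog.csdn.net/article/details/123456", "某篇文章标题")
# so clicking the title in Excel opens the article, while the raw link stays in the next column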
Excel. Plenty of wrappers out there wrap basically nothing, so here is my own attempt at one.
from openpyxl import Workbook, load_workbook
import time
class excel():
    def __init__(self) -> None:
        self.wb = Workbook()
        self.wbac = self.wb.active
    def get_sheet_names(self):
        # wb.get_sheet_names() was removed in openpyxl 3.x; use the sheetnames property
        return self.wb.sheetnames
    # write a list of rows into a sheet
    def write_row(self, rows, sheet_name='Sheet', with_url=None):
        # wb.get_sheet_by_name() was removed in openpyxl 3.x; index the workbook instead
        sheet = self.wb[sheet_name]
        if sheet:
            for i in range(len(rows)):
                row = rows[i]
                for y in range(len(row)):
                    content = row[y]
                    if with_url == y and i > 0:
                        # turn the cell into a hyperlink formula; the URL is expected
                        # in the very next column of the same row
                        rows[i][y] = '=HYPERLINK("{}", "{}")'.format(row[y+1], content)
                sheet.append(row)
    def new_sheet(self, name):
        self.wb.create_sheet(name)
    def save(self, path=None):
        # build the timestamped default path at call time, not at definition time
        if path is None:
            path = f"./files/Excel{round(time.time()*1000)}.xlsx"
        self.wb.save(path)
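A quick usage sketch of the wrapper; the rows here are made-up placeholders, and the first row is treated as the header (with_url only rewrites the data rows):

ex = excel()
rows = [
    ['标题', '链接'],                                       # header row, left untouched
    ['示例文章', 'https://blog.csdn.net/example/article'],  # placeholder data row
]
ex.write_row(rows, with_url=0)  # column 0 becomes =HYPERLINK(column 1, column 0)
ex.save('demo.xlsx')            # or ex.save() for a timestamped file under ./files/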
The wrapped utility module; drop any imports you don't actually use.
import os, requests, random
__dir__ = os.path.dirname(os.path.abspath(__file__))
# print(__dir__)
def get_headers(localhost=True, refer="https://www.baidu.com", host=None):
    # default desktop UA; when localhost is False, pick a random UA from the pool below
    ua = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36"
    if not localhost:
        uas = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36",
            "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)",
            "Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)",
            "Baiduspider-image+(+http://www.baidu.com/search/spider.htm)",
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "Mozilla/5.0 (compatible; Googlebot-Image/1.0; +http://www.google.com/bot.html)",
            "Sogou web spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Sogou News Spider/4.0(+http://www.sogou.com/docs/help/webmasters.htm#07)",
            "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
            "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
            "Sosospider+(+http://help.soso.com/webspider.htm)",
            "Mozilla/5.0 (compatible; Yahoo! Slurp China; http://misc.yahoo.com.cn/help.html)"
        ]
        ua = random.choice(uas)
    headers = {
        "User-Agent": ua,
        "Referer": refer,
        "Host": host  # requests drops headers whose value is None
    }
    return headers
def get_html(url, ret_type="text", timeout=50, encoding="utf-8"):
    headers = get_headers()
    res = requests.get(url, headers=headers, timeout=timeout)
    res.encoding = encoding
    # print(res.status_code)
    # print(res.text)
    if ret_type == "text":
        return res.text
    elif ret_type == "image":
        return res.content
    elif ret_type == "json":
        return res.json()
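A quick sanity check of get_html, assuming the module is saved as util.py as imported above; the URL is just page 1 of the example blog:

if __name__ == '__main__':
    # fetch one list page and confirm the article-item markup is actually in the response
    html = get_html('https://dream.blog.csdn.net/article/list/1')
    print(len(html), 'article-item-box' in html)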
OK, mm-hmm, that's about it... go try scraping your own article list!! Just change this part:
# blogger homepage link
blog_url = 'https://blog.csdn.net/博主名称'
# check the blogger's homepage yourself for the number of list pages
pages = 2
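If you would rather not count the pages by hand, one possible variation (my own assumption, not part of the original script) is to keep requesting pages until a page yields no new article rows:

page = 1
while True:
    html = get_html(f'{blog_url}/article/list/{page}')
    before = len(ats)
    ats = get_ats(html, ats)
    if len(ats) == before:  # no new rows parsed: assume we went past the last page
        break
    page += 1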
One more thing: when you save as CSV and then open it with Office Excel, the Chinese text comes out garbled.
Don't worry~~ open the file in Notepad first, re-save it with the encoding set to ANSI, and it will display fine.
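The garbling happens because Excel does not assume UTF-8 for a CSV without a byte-order mark. As an alternative to the Notepad round-trip, here is a small sketch of the same write using the utf-8-sig encoding (the file name is just a placeholder), which lets Excel detect the encoding directly:

with open('./files/articles.csv', 'w', encoding='utf-8-sig') as f:  # placeholder file name
    for row in ats:
        f.write(','.join(row) + '\n')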