Scraping a CSDN User's Blog Article List

After scraping my blog list on Jianshu, I tried scraping the article list from a CSDN user's homepage. The technical approach is still requests + xpath.
Jianshu blog list scraper: https://blog.csdn.net/fovever_/article/details/104172715
The scraped information for each article includes: title, type, link, abstract, publish date, read count, and comment count.
Without further ado, here is the code:

import requests
from lxml import etree
import os
def getResponse(url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0',
        'Connection': 'close'}
    try:
        r = requests.get(url, headers=header, timeout = 30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r
    except:
        return 0
def ResponseParse(r, alist):
    if r:
        dom = etree.HTML(r.text)
        articles_xpath = './/div[@class="container clearfix pt0"]/main/div[@class="article-list"]'
        articeles = dom.xpath(articles_xpath)
        title_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/h4/a/text()'
        type_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/h4/a/span/text()'
        href_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/p[@class="content"]/a/@href'
        abstract_xpath  = './/div[@class="article-item-box csdn-tracking-statistics"]/p[@class="content"]/a/text()'
        date_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/div/p[1]/span[@class="date"]/text()'
        read_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/div/p[3]/span[@class="read-num"]/span[@class="num"]/text()'
        comment_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/div/p[5]/span[@class="read-num"]/span[@class="num"]/text()'
        for article in articeles:
            title = article.xpath(title_xpath)
            type = article.xpath(type_xpath)
            href = article.xpath(href_xpath)
            abstract = article.xpath(abstract_xpath)
            date = article.xpath(date_xpath)
            read = article.xpath(read_xpath)
            comment = article.xpath(comment_xpath)
        for i in range(len(type)):
            alist.append([title[2*i + 1].strip().replace("\n", ""), type[i], href[i], abstract[i].strip().replace("\n", ""), date[i].strip().replace("\n", ""), read[i], comment[i]])
            print("文章标题:" + title[2*i + 1].strip().replace("\n", ""))
            print("文章类型:" + type[i])
            print("文章链接:" + href[i])
            print("文章摘要:" + abstract[i].strip().replace("\n", ""))
            print("发布时间:" + date[i].strip().replace("\n", ""))
            print("阅读数:" + read[i])
            print("评论数:" + comment[i])
            print("\n")
        return len(type)
    else:
        print("爬取失败!")

def Get_article_count(url):
    #//*[@id="asideProfile"]/div[2]/dl[1]/dd/a/span
    #/html/body/div[2]/div[1]/div[2]/ul/li[1]/a/label/span[2]
    #/html/body/div[2]/div[1]/div[2]/ul/li[1]/a/label/span[2]
    r = getResponse(url)
    print(r.url)
    # print(r.text)
    dom = etree.HTML(r.text)
    count_xpath1 = './/html/body/div[2]/div[1]/div[2]/ul/li[1]/a/label/span[2]/text()'
    count_xpath = './/div[@class="me_chanel_bar clearfix"]/ul/li/a[@class="tab_item tab_item_click"]/label/span[2]/text()'
    article_count = dom.xpath(count_xpath)
    return int(article_count[0].strip().replace("\n", ""))
def Get_author_name(url):
    #/html/body/div[2]/div[1]/div[1]/div[2]/p/text()
    r = getResponse(url)
    dom = etree.HTML(r.text)
    name_xpath = './/div[@class="me_wrap_lt clearfix"]/div[@class="lt_main clearfix"]/p[@class="lt_title"]/text()'
    name = dom.xpath(name_xpath)[2].strip().replace("\n", "")
    print("作者:", str(name))
    return name


def WriteWord(alist, name):
    save_dir = '文章列表'
    save_dir = os.path.join(os.getcwd(), save_dir)
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    save_name = os.path.join(save_dir, name)
    out = "文章标题:{0:{7}<10}\n文章类型:{1:{7}<10}\n文章链接:{2:{7}<20}\n文章摘要: {3:{7}<10}\n发布时间:{4:{7}<10}\n阅读数:{5:{7}<10}\n评论数:{6:{7}<10}\n"
    with open(save_name, 'w', encoding="utf-8") as f:
        for i in range(len(alist)):
            f.write(out.format(alist[i][0], alist[i][1], alist[i][2], alist[i][3], alist[i][4], alist[i][5], alist[i][6], chr(12288)))
            f.write("\n")
    print("数据成功写入:"+save_name)

def main():
    try:
        article_list = []
        user_name = "fovever_"  # change this to the ID of the user to scrape; for https://blog.csdn.net/fovever_ it is the last path segment of the homepage URL
        url1 = "https://{0}.csdn.net/{1}"
        url = url1 + '/article/list/{2}'
        article_count = Get_article_count(url1.format("me", user_name))
        save_name = Get_author_name(url1.format("me", user_name)) + '.doc'
        # each CSDN article-list page holds 40 articles, so round the page count up
        if article_count % 40 == 0:
            spider_num = article_count / 40
        else:
            spider_num = article_count / 40 + 1
        print(article_count)
        spider_article_count = 0
        for i in range(int(spider_num)):
            r = getResponse(url.format("blog", user_name, str(i + 1)))
            spider_article_count += ResponseParse(r, article_list)
        WriteWord(article_list, save_name)
        print("共爬取了:" + str(spider_article_count) + "篇博客!")
    except:
        print(user_name + "博客爬取失败!")

if __name__ == '__main__':
    main()

Program Walkthrough

The following explains the key functions in the program.
getResponse(url) fetches the response for a given URL. CSDN's anti-scraping measures are fairly mild; disguising the request headers as a browser is enough.
The heart of the program is parsing the fetched response. This is done in ResponseParse(r, alist), where the parameter alist accumulates the article information.
The main work in ResponseParse is locating the tag paths that hold the article-list information.
Taking my own blog homepage as an example, https://blog.csdn.net/fovever_, open the page source with F12 (screenshot omitted).
The page structure shows that all article entries live inside the div whose class is "article-list", so we first locate that div with xpath. The expression I wrote is .//div[@class="container clearfix pt0"]/main/div[@class="article-list"], which finds the div.
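As a minimal illustration of locating a container with a class-based xpath (using a tiny hypothetical HTML snippet rather than the real CSDN page):

from lxml import etree

# hypothetical, simplified fragment of an article-list page
html = '<main><div class="article-list"><div class="article-item-box csdn-tracking-statistics"></div></div></main>'
dom = etree.HTML(html)
containers = dom.xpath('.//div[@class="article-list"]')
print(len(containers))  # 1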
Next we find the paths of each piece of article information relative to this tag. Based on the page structure, the xpaths for all the fields are:

title_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/h4/a/text()'
type_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/h4/a/span/text()'
href_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/p[@class="content"]/a/@href'
abstract_xpath  = './/div[@class="article-item-box csdn-tracking-statistics"]/p[@class="content"]/a/text()'
date_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/div/p[1]/span[@class="date"]/text()'
read_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/div/p[3]/span[@class="read-num"]/span[@class="num"]/text()'
comment_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/div/p[5]/span[@class="read-num"]/span[@class="num"]/text()'

As for how the xpath expressions are written, I think it is largely a matter of taste; everyone can write a correct xpath based on their own reading of the page.
One thing worth noting: with the xpaths written this way, all values of the same field (across every article on the page) land in a single flat list. At first I grouped the fields by the length of the title list, so that each article's fields formed one sub-list. But when printing the titles I found that some were empty: because of the page structure, every other element of the title list is blank, and only the element after it holds the real title. So the program groups fields by the length of the type list instead, and the title for article i is taken at index 2*i + 1. Each article's information is then appended to alist as one list.
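A minimal sketch of an alternative (not the code above): instead of indexing title[2*i + 1], the blank placeholders produced by the page structure can be stripped out first, so every field list ends up with one entry per article. The values here are hypothetical.

raw_titles = ["\n        ", "Post A", "\n        ", "Post B"]  # hypothetical: blanks alternate with real titles
types = ["原创", "转载"]

titles = [t.strip() for t in raw_titles if t.strip()]  # drop the empty placeholders
for title, article_type in zip(titles, types):
    print(article_type, title)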
With this, the program already works as a simple scraper.
However, it only scrapes a single page. As with the Jianshu blog-list scraper, pagination is handled by varying the URL. Taking https://blog.csdn.net/ygdxt/article/list/1 as an example, the general form is https://blog.csdn.net/ygdxt/article/list/{?}; just substitute the page number you want. A quick look also shows that each page of a CSDN article list holds 40 articles.
So the first step is to obtain the author's total article count. I initially hoped to read it from https://blog.csdn.net/ygdxt/article/list/1, but that page only shows the number of original articles, while the article list contains articles of every type. The total article count turns out to be available on the user's personal homepage instead.
Comparing the personal homepage https://me.csdn.net/fovever_ with the blog-list page https://blog.csdn.net/fovever_/article/list/1, the leading part of the URL differs only in that me becomes blog, so the URLs can be written as:

url1 = "https://{0}.csdn.net/{1}"
url = url1 + '/article/list/{2}'

In url1, {0} selects whether we fetch the blog pages or the personal homepage, {1} is the ID of the author to scrape (for example, fovever_), and {2} in url selects the page number.
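For example, formatting these templates with the user ID from this article produces the two kinds of URLs:

url1 = "https://{0}.csdn.net/{1}"
url = url1 + '/article/list/{2}'

print(url1.format("me", "fovever_"))      # https://me.csdn.net/fovever_
print(url.format("blog", "fovever_", 3))  # https://blog.csdn.net/fovever_/article/list/3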
Get_article_count(url) retrieves the total article count. On the personal homepage the total sits in a span tag, so the corresponding xpath is .//div[@class="me_chanel_bar clearfix"]/ul/li/a[@class="tab_item tab_item_click"]/label/span[2]/text().
With the total count known, the program can loop over all of the user's article pages.
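The number of pages follows from the 40-articles-per-page observation. A minimal sketch, equivalent to the if/else in main() (the total here is a hypothetical value):

import math

ARTICLES_PER_PAGE = 40      # each article-list page holds 40 articles
article_count = 85          # hypothetical total returned by Get_article_count

spider_num = math.ceil(article_count / ARTICLES_PER_PAGE)  # 85 articles -> 3 pages

for page in range(1, spider_num + 1):
    print("https://blog.csdn.net/{0}/article/list/{1}".format("fovever_", page))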
To save the scraped information locally we need one more function. To give each author's file a unique name, the file is named after the nickname shown on the author's homepage, which is what Get_author_name(url) extracts. The nickname sits in the p tag with class lt_title on the personal homepage, so its xpath is name_xpath = './/div[@class="me_wrap_lt clearfix"]/div[@class="lt_main clearfix"]/p[@class="lt_title"]/text()'.
Finally, **WriteWord(alist, name)** writes the scraped article information to the .doc file (plain text saved with a .doc extension).
The saved result: [screenshot omitted]
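The labels in WriteWord's out format string are padded with chr(12288), the full-width (ideographic) space, so the padding matches the width of Chinese characters. A minimal demonstration of that format spec (the title value is hypothetical):

pad = chr(12288)  # full-width space used as the fill character
line = "文章标题:{0:{1}<10}".format("Python爬虫", pad)  # padded to width 10 with full-width spaces
print(line)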

Batch-Scraping Article Lists for Multiple Bloggers

To batch-scrape article information for multiple bloggers, we first need a page whose URL lists many bloggers. I did not search very hard, but https://bbs.csdn.net/total_rank lists plenty of them.
Analyzing that page's source, we can find each blogger's personal-space link, e.g. https://me.csdn.net/net_lover. All we need is the blogger's ID, net_lover, so after extracting the a tags' href attributes we split each link with link.split('/')[-1] for link in href (see the sketch below). The xpath for href is:

href_xpath = './/div/div[@class="bbs_forums_wrap"]/div[@class="expert_wrap"]/div[@class="expert_box"]/ul[@class="expert_list"]/li/label[2]/a[@class="user_name"]/@href'

A for loop in main() then scrapes the blog information of every one of these bloggers. Since some bloggers have no blog posts, a **try: except:** block handles the resulting exceptions.
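A minimal sketch of the ID extraction mentioned above, with hypothetical href values:

href = [
    "https://me.csdn.net/net_lover",
    "https://me.csdn.net/fovever_",
]
user_name = [link.split('/')[-1] for link in href]
print(user_name)  # ['net_lover', 'fovever_']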
The batch-scraping result: [screenshot omitted]
The final program:

import requests
from lxml import etree
import os
def getResponse(url):
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:58.0) Gecko/20100101 Firefox/58.0',
        'Connection': 'close'}
    try:
        r = requests.get(url, headers=header, timeout = 30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r
    except:
        return 0
def ResponseParse(r, alist):
    if r:
        dom = etree.HTML(r.text)
        articles_xpath = './/div[@class="container clearfix pt0"]/main/div[@class="article-list"]'
        articeles = dom.xpath(articles_xpath)
        title_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/h4/a/text()'
        type_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/h4/a/span/text()'
        href_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/p[@class="content"]/a/@href'
        abstract_xpath  = './/div[@class="article-item-box csdn-tracking-statistics"]/p[@class="content"]/a/text()'
        date_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/div/p[1]/span[@class="date"]/text()'
        read_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/div/p[3]/span[@class="read-num"]/span[@class="num"]/text()'
        comment_xpath = './/div[@class="article-item-box csdn-tracking-statistics"]/div/p[5]/span[@class="read-num"]/span[@class="num"]/text()'
        for article in articeles:
            title = article.xpath(title_xpath)
            type = article.xpath(type_xpath)
            href = article.xpath(href_xpath)
            abstract = article.xpath(abstract_xpath)
            date = article.xpath(date_xpath)
            read = article.xpath(read_xpath)
            comment = article.xpath(comment_xpath)
            # title = [i for i in title if i != ' ']
            # print(title)
        for i in range(len(type)):
            alist.append([title[2*i + 1].strip().replace("\n", ""), type[i], href[i], abstract[i].strip().replace("\n", ""), date[i].strip().replace("\n", ""), read[i], comment[i]])
            print("文章标题:" + title[2*i + 1].strip().replace("\n", ""))
            print("文章类型:" + type[i])
            print("文章链接:" + href[i])
            print("文章摘要:" + abstract[i].strip().replace("\n", ""))
            print("发布时间:" + date[i].strip().replace("\n", ""))
            print("阅读数:" + read[i])
            print("评论数:" + comment[i])
            print("\n")
        return len(type)
        # print(r.text)
    else:
        print("爬取失败!")

def Get_article_count(url):
    #//*[@id="asideProfile"]/div[2]/dl[1]/dd/a/span
    #/html/body/div[2]/div[1]/div[2]/ul/li[1]/a/label/span[2]
    #/html/body/div[2]/div[1]/div[2]/ul/li[1]/a/label/span[2]
    r = getResponse(url)
    print(r.url)
    # print(r.text)
    dom = etree.HTML(r.text)
    count_xpath1 = './/html/body/div[2]/div[1]/div[2]/ul/li[1]/a/label/span[2]/text()'
    count_xpath = './/div[@class="me_chanel_bar clearfix"]/ul/li/a[@class="tab_item tab_item_click"]/label/span[2]/text()'
    article_count = dom.xpath(count_xpath)
    return int(article_count[0].strip().replace("\n", ""))

def Get_author_name(url):
    #/html/body/div[2]/div[1]/div[1]/div[2]/p/text()
    r = getResponse(url)
    dom = etree.HTML(r.text)
    name_xpath = './/div[@class="me_wrap_lt clearfix"]/div[@class="lt_main clearfix"]/p[@class="lt_title"]/text()'
    name = dom.xpath(name_xpath)[2].strip().replace("\n", "")
    print("作者:", str(name))
    return name


def WriteWord(alist, name):
    save_dir = '文章列表'
    save_dir = os.path.join(os.getcwd(), save_dir)
    if not os.path.exists(save_dir):
        os.mkdir(save_dir)
    save_name = os.path.join(save_dir, name)
    out = "文章标题:{0:{7}<10}\n文章类型:{1:{7}<10}\n文章链接:{2:{7}<20}\n文章摘要: {3:{7}<10}\n发布时间:{4:{7}<10}\n阅读数:{5:{7}<10}\n评论数:{6:{7}<10}\n"
    with open(save_name, 'w', encoding="utf-8") as f:
        for i in range(len(alist)):
            f.write(out.format(alist[i][0], alist[i][1], alist[i][2], alist[i][3], alist[i][4], alist[i][5], alist[i][6], chr(12288)))
            f.write("\n")
    print("数据成功写入:"+save_name)

def main():
    user_url = 'https://bbs.csdn.net/total_rank'
    user_list = ["fovever_"]
    Get_User_list(user_url, user_list)
    for user_name in user_list:
        try:
            article_list = []
            # user_name = "fovever_"  # the ID of the user to scrape; for https://blog.csdn.net/fovever_ it is the last path segment of the homepage URL
            url1 = "https://{0}.csdn.net/{1}"
            url = url1 + '/article/list/{2}'
            article_count = Get_article_count(url1.format("me", user_name))
            save_name = Get_author_name(url1.format("me", user_name)) + '.doc'
            if article_count % 40 == 0:
                spider_num = article_count /40
            else:
                spider_num = article_count / 40 + 1
            print(article_count)
            spider_article_count = 0
            for i in range(int(spider_num)):
                r = getResponse(url.format("blog", user_name, str(i + 1)))
                spider_article_count += ResponseParse(r, article_list)
            WriteWord(article_list, save_name)
            # print(len(article_list))
            print("共爬取了:" + str(spider_article_count) + "篇博客!")
        except:
            print(user_name+"博客爬取失败!")
            continue

def Get_User_list(url, ulist):
    #/html/body/div[3]/div[2]/div/div/ul/li[1]/label[2]/a[2]
    r = getResponse(url)
    dom = etree.HTML(r.text)
    href_xpath = './/div/div[@class="bbs_forums_wrap"]/div[@class="expert_wrap"]/div[@class="expert_box"]/ul[@class="expert_list"]/li/label[2]/a[@class="user_name"]/@href'
    href = dom.xpath(href_xpath)
    user_name = [link.split('/')[-1] for link in href]
    for user in user_name:
        ulist.append(user)
    pass
if __name__ == '__main__':
    main()

This program ran correctly as of February 5, 2020.
There may still be issues in the code; feedback and corrections are welcome!
