# coding=utf-8
import requests
import datetime
import os
import xlwt
import xlrd
from xlutils.copy import copy
from bs4 import BeautifulSoup
# Fetch the raw HTML of a page.
def get_page_source(url):
    try:
        r = requests.get(url, timeout=30)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Return an empty string so callers parse nothing rather than the
        # literal text "failed".
        return ""
def main():
    # url = input("Enter your blog home page URL, e.g. https://blog.csdn.net/qq874455953\n")
    url = "https://blog.csdn.net/qq_42029527"
    allArticleInfo = []
    # The article list may span several pages; assume at most five and parse
    # each paginated URL in turn.
    for page in range(1, 6):
        urlNow = url + "/article/list/" + str(page)
        # Appending "?orderby=UpdateTime" sorts by update time so the article
        # order is the same on every run.
        html = get_page_source(urlNow + '?orderby=UpdateTime')
        # Parse the HTML with BeautifulSoup.
        soup = BeautifulSoup(html, 'html.parser')
        articleList = soup.find_all('div', attrs={'class': 'article-item-box csdn-tracking-statistics'})
        readBar = soup.find_all('div', attrs={'class': 'info-box d-flex align-content-center'})
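        # articleList and readBar are assumed to pair up one-to-one by index;
        # if CSDN renders an entry (an ad, a pinned post) in only one of the
        # two lists, the index-based pairing below misaligns.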
        # Number of articles found on this page.
        articleListLen = len(articleList)
        # Collect the data into allArticleInfo; each entry is an
        # (<article title>, <read count>) tuple.
        for i in range(articleListLen):
            articleName = articleList[i].h4.a.contents[2]
            articleReadCount = readBar[i].contents[3].span.string.split(":")[1]
            allArticleInfo.append((articleName, articleReadCount))
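    # Illustrative shape of the collected list (values made up): the title
    # text keeps whatever whitespace the markup has, and counts stay strings:
    #   [("My first post", "123"), ("An older post", "45"), ...]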
    # Write the collected data to the Excel sheet.
    writeToExcel(allArticleInfo, url)
def writeToExcel(allArticleInfo, url):
    # Create the Excel file on the first run.
    if not os.path.exists("ReadRecord.xls"):
        workbook = xlwt.Workbook()
        worksheet = workbook.add_sheet('My ReadRecord')
        # Seed one cell so the new sheet has exactly one column.
        worksheet.write(0, 0, "1")
        workbook.save("ReadRecord.xls")
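    # Note: xlwt writes only the legacy .xls (BIFF) format; for .xlsx a
    # library such as openpyxl would be needed instead.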
    # Open the existing workbook read-only with xlrd...
    rexcel = xlrd.open_workbook("ReadRecord.xls")
    # ...and copy it into a writable xlwt workbook, since xlrd cannot write.
    excel = copy(rexcel)
    # Get the first worksheet of the writable copy.
    worksheet = excel.get_sheet(0)
    # The current column count is also the index of the next free column.
    read_time = rexcel.sheets()[0].ncols
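    # For example: right after the file is created, ncols is 1 (only the seed
    # cell), so the first run writes into column 1; the next run sees
    # ncols == 2 and writes into column 2, and so on. Column 0 always holds
    # the row labels.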
    # Today's date becomes the header of the new column.
    nowTime = datetime.datetime.now().strftime('%Y-%m-%d')
    # Label the first column and date the new one.
    worksheet.write(0, 0, "Article title")
    worksheet.write(0, read_time, nowTime)
    infoLength = len(allArticleInfo)
    # Write the rows bottom-up. CSDN lists newest articles first, so writing
    # in reverse pins each older article to a fixed row across runs and lets
    # newly published articles take fresh rows at the bottom.
    for i in range(infoLength):
        worksheet.write(infoLength - i, 0, allArticleInfo[i][0])
        worksheet.write(infoLength - i, read_time, allArticleInfo[i][1])
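    # Worked example: with three articles, the oldest lands on row 1 and the
    # newest on row 3; when a fourth is published, the next run writes it to
    # row 4 while rows 1-3 keep the same articles as before.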
    # Parse the blog home page to get the total view count and overall rank.
    html = get_page_source(url)
    soup = BeautifulSoup(html, 'html.parser')
    totalReadBar = soup.find('div', attrs={'class': 'grade-box clearfix'})
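    # The index-based lookups below (contents[3], contents[7]) are tied to
    # CSDN's "grade-box" profile markup at the time of writing; if that
    # layout changes, they will raise or pick up the wrong fields.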
    # Total view count.
    totalReadCount = totalReadBar.contents[3].dd.attrs['title']
    # Overall rank.
    totalReadRank = totalReadBar.contents[7].attrs['title']
    worksheet.write(infoLength + 1, 0, "Total views")
    worksheet.write(infoLength + 1, read_time, totalReadCount)
    worksheet.write(infoLength + 2, 0, "Total rank")
    worksheet.write(infoLength + 2, read_time, totalReadRank)
    # Save the updated workbook.
    excel.save('ReadRecord.xls')
if __name__ == "__main__":
    main()
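# To run (dependencies assumed installed via pip: requests, beautifulsoup4,
# xlwt, xlrd, xlutils), execute the script once per day; each run appends
# one date column to ReadRecord.xls.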