Python3 获取CSDN博客所有文章标题及阅读数

#coding=utf-8
import re
import requests
from bs4 import BeautifulSoup
from prettytable import PrettyTable

def getHtml(url: str, timeout: float = 10) -> str:
    """Download a page and return its body as text.

    The request is sent with a Firefox-on-Windows User-Agent header.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests would wait indefinitely on a stalled connection.

    Returns:
        The decoded response body (``Response.text``). The HTTP status code
        is not checked, so the body of an error page is returned as-is.

    Raises:
        requests.exceptions.RequestException: If the connection fails or the
            timeout expires.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0'}
    page = requests.get(url, headers=headers, timeout=timeout)
    return page.text

if __name__ == '__main__':
    # Walk the blog's article-list pages, collect every article's title, read
    # count and comment count, then print them as a table followed by totals.

    # Compiled once up front instead of being rebuilt for every page.
    number_re = re.compile(r'\d+')

    def first_int(text):
        """Return the first run of digits in text as an int, or 0 if none."""
        found = number_re.search(text)
        # Falling back to 0 (instead of raising IndexError) keeps the three
        # lists aligned when a paragraph unexpectedly carries no number.
        return int(found.group()) if found else 0

    ltitle = []
    lread = []
    lcommand = []
    table = PrettyTable(['NO.','文章标题','阅读数','评论数'])
    for page in range(1, 18):  # list pages 1..17 are hard-coded
        url = "https://blog.csdn.net/qq523176585/article/list/{}".format(page)
        soup = BeautifulSoup(getHtml(url), "html.parser")

        # Every selector below drops its first match with [1:].
        # NOTE(review): presumably the first match is not a real article
        # entry; confirm against the current page markup.

        # The title is the last line of the link text, without padding.
        ltitle.extend(
            link.text.split('\n')[-1].strip()
            for link in soup.select('h4 > a[href]')[1:]
        )
        # Read count: first number in the second <p> of each <div>.
        lread.extend(
            first_int(para.text)
            for para in soup.select('div > p:nth-of-type(2)')[1:]
        )
        # Comment count: first number in the third <p> of each <div>.
        lcommand.extend(
            first_int(para.text)
            for para in soup.select('div > p:nth-of-type(3)')[1:]
        )

    # zip() yields only complete (title, reads, comments) rows, so a length
    # mismatch between the lists can no longer raise IndexError here.
    rows = zip(ltitle, lread, lcommand)
    for number, (title, reads, comments) in enumerate(rows, start=1):
        table.add_row([number, title, reads, comments])
    print (table)
    print ("总阅读数:{}\n总评论数:{}".format(sum(lread),sum(lcommand)))

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值