#coding=utf-8
import re
import requests
from bs4 import BeautifulSoup
from prettytable import PrettyTable
def getHtml(url, timeout=10):
    """Download *url* and return the response body decoded as text.

    The request is sent with a desktop Firefox User-Agent header.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` waits indefinitely on a stalled
            connection.

    Returns:
        The response body as a string (``Response.text``).

    Raises:
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status.
        requests.exceptions.RequestException: Any other transport failure,
            including a timeout.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0'}
    page = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page to
    # the HTML parser as if it were real content.
    page.raise_for_status()
    return page.text
# Number of article-list pages to scan and the URL template for one page.
PAGE_COUNT = 17
LIST_URL = "https://blog.csdn.net/qq523176585/article/list/{}"

# Compiled once at import time instead of being rebuilt for every page.
_DIGITS = re.compile(r'\d+')


def _first_int(text):
    """Return the first run of digits in *text* as an int, or 0 if none."""
    match = _DIGITS.search(text)
    # A paragraph without any digits is counted as 0 rather than aborting
    # the whole run with an IndexError.
    return int(match.group()) if match else 0


def _scrape_page(html):
    """Return ``(title, reads, comments)`` tuples parsed from one list page."""
    soup = BeautifulSoup(html, "html.parser")
    # The first match of every selector is skipped, exactly as the original
    # script did -- presumably it is not an article entry; TODO confirm
    # against the live page markup.
    titles = [
        link.text.split('\n')[-1].strip()
        for link in soup.select('h4 > a[href]')[1:]
    ]
    reads = [
        _first_int(p.text)
        for p in soup.select('div > p:nth-of-type(2)')[1:]
    ]
    comments = [
        _first_int(p.text)
        for p in soup.select('div > p:nth-of-type(3)')[1:]
    ]
    # zip() stops at the shortest column, so a page where the selectors
    # return different counts cannot raise IndexError, and pairing the
    # columns per page keeps one odd page from shifting every later row.
    return list(zip(titles, reads, comments))


if __name__ == '__main__':
    # Collect one (title, reads, comments) row per article over all pages.
    rows = []
    for page in range(1, PAGE_COUNT + 1):
        rows.extend(_scrape_page(getHtml(LIST_URL.format(page))))

    table = PrettyTable(['NO.', '文章标题', '阅读数', '评论数'])
    for number, (title, reads, comments) in enumerate(rows, 1):
        table.add_row([number, title, reads, comments])
    print(table)

    # Totals are taken from the same rows that are displayed in the table.
    total_reads = sum(row[1] for row in rows)
    total_comments = sum(row[2] for row in rows)
    print("总阅读数:{}\n总评论数:{}".format(total_reads, total_comments))