# 刚学了一点爬虫,拿自己的博客练练手ˋ( ° ▽、° )
# 提取标题、日期、内容、阅读数和评论数五个部分
import requests
from lxml import etree
def get_html(url, timeout=10):
    """Download *url* and return the response body as decoded text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``. Without a timeout a stalled
            connection would block the script forever.

    Returns:
        The page source as a ``str``, decoded with the encoding that
        ``requests`` detects from the response body.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    # Identify as a desktop Chrome browser rather than the default
    # python-requests client string.
    headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/76.0.3809.132 Safari/537.36'
        )
    }
    response = requests.get(url, headers=headers, timeout=timeout)
    # The attribute must be spelled ``encoding``; a misspelled name is just
    # stored as an unrelated attribute and the detected encoding is never
    # used when ``.text`` decodes the body.
    response.encoding = response.apparent_encoding
    return response.text
def _nth_text(texts, index, default=''):
    """Return ``texts[index]`` stripped, or *default* if it does not exist."""
    if index < len(texts):
        return texts[index].strip()
    return default


def information_html(html):
    """Parse a blog list page, print every post's details and return them.

    Args:
        html: Page source as a string.

    Returns:
        A list with one dict per post, keyed by '标题' (title), '日期'
        (date), '内容' (summary), '阅读数' (read count) and '评论数'
        (comment count). Entries that lack the title/summary link texts
        are skipped instead of raising ``IndexError``; a missing date or
        counter is reported as an empty string.
    """
    # etree.HTML builds an element tree from the (possibly malformed) markup.
    tree = etree.HTML(html)
    entries = tree.xpath(".//div[@class='article-list']/div")
    print('共有', len(entries), '篇博客')

    posts = []
    for entry in entries:
        # Run each query once per entry and index into the result.
        link_texts = entry.xpath(".//a/text()")
        if len(link_texts) < 3:
            # Not shaped like a regular post entry; nothing to extract.
            continue
        dates = entry.xpath(".//p/span[@class='date']/text()")
        counters = entry.xpath(".//p/span[@class='read-num']/span/text()")

        # Title and summary are the 2nd and 3rd link text nodes; the first
        # one is deliberately ignored (presumably whitespace or a badge
        # label -- TODO confirm against the page markup).
        boke = {
            '标题': link_texts[1].strip(),
            '日期': _nth_text(dates, 0),
            '内容': link_texts[2].strip(),
            '阅读数': _nth_text(counters, 0),
            '评论数': _nth_text(counters, 1),
        }
        print(boke['标题'], '\n', boke['日期'], '\n', boke['内容'], '\n',
              '阅读数:', boke['阅读数'], '评论数:', boke['评论数'], '\n')
        posts.append(boke)
    return posts
if __name__ == '__main__':
    # Script entry point: download the blog's article-list page and hand the
    # page source to the parser, which prints what it finds.
    blog_url = 'https://blog.csdn.net/qq_43878294'
    page_source = get_html(blog_url)
    information_html(page_source)