lxml 是 Python 的一个解析库,支持解析 HTML 和 XML 文档,并支持使用 XPath 表达式查询节点,官方地址:https://lxml.de/
XPath教程:http://www.w3school.com.cn/xpath/index.asp
案例
import urllib
import urllib.request

import lxml.etree
class csdn_blog():
    """Crawl the article-list pages of one CSDN blog and print article metadata.

    For every list page in a user-supplied range, the article links are
    extracted with XPath, each article page is downloaded, and its title and
    the first two ``<span>`` texts of the top bar are printed.
    """

    def __init__(self):
        # Template of a list-page URL; "{}" is replaced by the page number.
        self.url = "https://blog.csdn.net/tmaczt/article/list/{}?orderby=UpdateTime"

    def get_html(self, url):
        """Download ``url`` and return the raw response body.

        Args:
            url: Address to fetch, passed unchanged to ``urlopen``.

        Returns:
            The response body as ``bytes`` on success, or the empty string
            ``""`` when the request fails (the error is printed, not raised).
        """
        html = ""
        try:
            # The context manager closes the connection even if read() fails.
            with urllib.request.urlopen(url) as response:
                html = response.read()
        except Exception as err:
            # Deliberate best-effort boundary: one bad page must not stop
            # the whole crawl, so report the error and return "".
            print(err)
        return html

    def _fetch_tree(self, url):
        """Download ``url`` and parse it; return the root element or ``None``.

        ``None`` is returned when the download failed, the body is empty, or
        lxml could not build a document from it.
        """
        html = self.get_html(url)
        if not html:
            # Covers both the "" failure marker and an empty b"" body.
            return None
        try:
            return lxml.etree.HTML(html)
        except lxml.etree.LxmlError as err:
            print(err)
            return None

    def test_xpath(self):
        """Interactively crawl a range of list pages and print article info.

        Reads the first and last page number (inclusive) from standard input.
        Pages or articles that cannot be downloaded, parsed, or that do not
        contain the expected nodes are skipped instead of aborting the run.

        Raises:
            ValueError: If a page number typed by the user is not an integer.
        """
        begin_page = int(input("请输入起始页:"))
        end_page = int(input("请输入结束页:"))
        for page_no in range(begin_page, end_page + 1):
            list_tree = self._fetch_tree(self.url.format(page_no))
            if list_tree is None:
                continue
            hrefs = list_tree.xpath('//div[@class="article-list"]/div/p/a/@href')
            for href in hrefs:
                article_url = href.strip()
                article_tree = self._fetch_tree(article_url)
                if article_tree is None:
                    continue
                title_list = article_tree.xpath('//div[@class="article-title-box"]/h1')
                span_list = article_tree.xpath('//div[@class="article-bar-top"]/span')
                # Two spans are indexed below; guard against layout changes
                # rather than failing with IndexError.
                if not title_list or len(span_list) < 2:
                    print("页面结构不符合预期,已跳过:", article_url)
                    continue
                title = title_list[0].text
                publish_time = span_list[0].text
                read_count = span_list[1].text
                print("标题:", title)
                print("发布时间:", publish_time)
                print(read_count)
if __name__ == "__main__":
    # Script entry point: build the crawler and start the interactive run.
    csdn_blog().test_xpath()
**解析结果**