安装 lxml(提供 XPath 支持)
pip3 install lxml
什么是XPath?
XPath即为XML路径语言(XML Path Language),它是一种用来在XML文档中查找信息的语言,可用来在 XML 文档中对元素和属性进行搜索,同样也适用于HTML。
什么是XML?
XML 指可扩展标记语言(eXtensible Markup Language)
XML 是一种标记语言,很类似 HTML
XML 的设计宗旨是传输数据,而非显示数据
XML 的标签需要我们自行定义。
XML 被设计为具有自我描述性。
XML 是 W3C 的推荐标准
区别
数据格式 | 描述 | 设计目标 |
---|---|---|
XML | Extensible Markup Language (可扩展标记语言) | 被设计为传输和存储数据,其焦点是数据的内容 |
HTML | HyperText Markup Language (超文本标记语言) | 显示数据以及如何更好显示数据 |
HTML DOM | Document Object Model for HTML (文档对象模型) | 通过 HTML DOM,可以访问所有的 HTML 元素,连同它们所包含的文本和属性。可以对其中的内容进行修改和删除,同时也可以创建新的元素 |
常用的路径表达式
表达式 | 作用 |
---|---|
nodename | 选取此节点的所有子节点 |
/ | 从当前节点选取直接子节点 |
// | 从当前节点选择文档中的节点,而不考虑它们的位置 |
. | 选取当前节点 |
.. | 选取当前节点的父节点 |
@ | 选取属性 |
//title \| //price | 选取文档中所有的 title 和 price 元素(`|` 用于合并多个路径表达式的结果) |
使用
from lxml import etree
html = etree.HTML(html) # Build an XPath-queryable element tree from the HTML text; lxml's HTML parser repairs malformed markup automatically
案例
案例仅为基础xpath语法的用法
from lxml.html import etree
import requests
class CollegateRank(object):
    """Download a college listing page and extract one record per school.

    The listing is expected to contain ``<div class="scores_List">`` with one
    ``<dl>`` element per school; every ``<dl>`` is turned into a dict of
    strings by :meth:`parse_page_data`.
    """

    # Browser-like User-Agent sent when the caller supplies no headers.
    DEFAULT_USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3730.400 QQBrowser/10.5.3805.400'

    def get_page_data(self, url):
        """Fetch ``url``, dump the raw HTML to ``page.html`` and parse it.

        Args:
            url: Address of the listing page.

        Returns:
            The list of school dicts produced by :meth:`parse_page_data`,
            or an empty list when the request failed.
        """
        response = self.send_request(url=url)
        if not response:
            return []
        # The dump is only a debugging aid: errors='replace' stops a
        # character that GBK cannot encode from raising UnicodeEncodeError
        # and aborting the crawl before any parsing happens.
        with open('page.html', 'w', encoding='gbk', errors='replace') as file:
            file.write(response)
        return self.parse_page_data(response)

    def parse_page_data(self, response):
        """Extract the school records from the listing HTML using XPath.

        Args:
            response: HTML text of the listing page.

        Returns:
            A list with one dict per ``<dl>`` found under
            ``div.scores_List``. Keys are ``url``, ``cover``, ``name``,
            ``address``, ``tese``, ``type``, ``belong``, ``level`` and
            ``weburl``; a value is ``None`` when its node is missing
            (``tese`` is an empty string in that case).
        """
        etree_xpath = etree.HTML(response)
        ranks = etree_xpath.xpath('//div[@class="scores_List"]/dl')
        schools = []
        for dl in ranks:
            school_info = {
                'url': self.extract_first(dl.xpath('./dt/a/@href')),
                'cover': self.extract_first(dl.xpath('./dt/a/img/@src')),
                'name': self.extract_first(dl.xpath('./dt/strong/a/text()')),
                'address': self.extract_first(dl.xpath('./dd/ul/li[1]/text()')),
                # li[2] holds several <span> nodes; keep all of them.
                'tese': ",".join(dl.xpath('./dd/ul/li[2]/span/text()')),
                'type': self.extract_first(dl.xpath('./dd/ul/li[3]/text()')),
                'belong': self.extract_first(dl.xpath('./dd/ul/li[4]/text()')),
                'level': self.extract_first(dl.xpath('./dd/ul/li[5]/text()')),
                'weburl': self.extract_first(dl.xpath('./dd/ul/li[6]/text()')),
            }
            d_url = school_info['url']
            print(d_url)
            # Only follow the detail link when the entry actually has one.
            if d_url:
                self.get_page_detail(d_url, school_info)
            schools.append(school_info)
        return schools

    def get_page_detail(self, url, school_info):
        """Hook called once per school with its detail-page URL.

        The default implementation performs no request and returns
        ``school_info`` unchanged; it exists so that
        :meth:`parse_page_data` does not fail with ``AttributeError``.
        Override it in a subclass to crawl the detail page.

        Args:
            url: Detail-page URL taken from the listing entry.
            school_info: The record built for this school so far.

        Returns:
            ``school_info``.
        """
        return school_info

    def extract_first(self, data=None, default=None):
        """Return the first item of ``data``, or ``default`` if there is none.

        Args:
            data: A sequence such as an XPath result list; ``None`` and
                empty sequences are treated the same way.
            default: Value returned when ``data`` has no first item.
        """
        # Truthiness covers both None (the declared default, on which
        # len() would raise TypeError) and an empty result list.
        if data:
            return data[0]
        return default

    def send_request(self, url, headers=None, timeout=10):
        """Send a GET request and return the response body as text.

        Args:
            url: Address to request.
            headers: Optional request headers; a browser-like User-Agent
                is used when omitted or empty.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the crawler forever.

        Returns:
            The decoded body when the status code is 200, otherwise
            ``None`` (also on connection errors and timeouts).
        """
        headers = headers if headers else {'User-Agent': self.DEFAULT_USER_AGENT}
        try:
            response = requests.get(url=url, headers=headers, timeout=timeout)
        except requests.RequestException as exc:
            # Callers already treat a falsy result as "request failed".
            print('request failed: %s (%s)' % (url, exc))
            return None
        if response.status_code == 200:
            return response.text
        return None
if __name__ == '__main__':
    # Script entry point: crawl the college listing page once.
    start_url = 'http://college.gaokao.com/schlist/'
    CollegateRank().get_page_data(start_url)