from lxml import etree
# Sample HTML fragment queried below. The last <li> has no closing tag; the
# recorded output of the first query shows that etree.HTML() still builds a
# full tree (including <html> and <body> wrappers) from it.
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
html = etree.HTML(text)

# NOTE: the element addresses in the sample outputs below differ on every run.

# '//*' selects every element node in the document.
result = html.xpath('//*')
print(result)
# [<Element html at 0x2893a48>, <Element body at 0x28939c8>,
#  <Element div at 0x2893988>, <Element ul at 0x2893a88>,
#  <Element li at 0x2893ac8>, <Element a at 0x2893b48>,
#  <Element li at 0x2893b88>, <Element a at 0x2893bc8>,
#  <Element li at 0x2893c08>, <Element a at 0x2893b08>,
#  <Element li at 0x2893c48>, <Element a at 0x2893c88>,
#  <Element li at 0x2893cc8>, <Element a at 0x2893d08>]

# '//li' selects all <li> elements at any depth; the result is a list.
result = html.xpath('//li')
print(result)
# [<Element li at 0x2894ac8>, <Element li at 0x2894a88>,
#  <Element li at 0x2894b88>, <Element li at 0x2894bc8>,
#  <Element li at 0x2894c08>]
print(result[0])
# <Element li at 0x2894ac8>

# '/' steps to direct children: every <a> that is a child of an <li>.
result = html.xpath('//li/a')
print(result)
# [<Element a at 0x2894a48>, <Element a at 0x2894a08>,
#  <Element a at 0x2894b08>, <Element a at 0x2894b48>,
#  <Element a at 0x2894b88>]

# '..' steps to the parent; '@class' then reads the parent's class attribute.
result = html.xpath('//a[@href="link4.html"]/../@class')
print(result)
# ['item-1']

# '[@class="item-0"]' keeps only <li> elements whose class is exactly "item-0".
result = html.xpath('//li[@class="item-0"]')
print(result)
# [<Element li at 0x28939c8>, <Element li at 0x2893988>]

# 'a/text()' returns only the text directly inside the child <a>.
result = html.xpath('//li[@class="item-0"]/a/text()')
print(result)
# ['first item', 'fifth item']

# '//text()' returns the text of all descendants, so the whitespace that
# follows </a> inside the unclosed last <li> is included as well.
result = html.xpath('//li[@class="item-0"]//text()')
print(result)
# Recorded output: ['first item', 'fifth item', '\n ']
# NOTE(review): the exact trailing whitespace string depends on the
# indentation inside the literal above — confirm against an actual run.

# '@href' returns the attribute values themselves as strings.
result = html.xpath('//li/a/@href')
print(result)
# ['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']
from lxml import etree
# The <li> carries two class tokens, so an exact @class="li" comparison would
# not match; contains() tests for a substring of the attribute value instead.
text = '''
<li class="li li-first"><a href="link.html">first item</a></li>
'''
html = etree.HTML(text)

result = html.xpath('//li[contains(@class,"li")]/a/text()')
print(result)
# ['first item']
5、多属性匹配
.xpath('//li[contains(@class,"li") and @name="item"]/a/text()'):根据多个属性确定一个节点,用and连接
from lxml import etree
# The <li> is identified by two attributes at once: a class value containing
# "li" and name="item".
text = '''
<li class="li li-first" name="item"><a href="link.html">first item</a></li>
'''
html = etree.HTML(text)

# 'and' joins both conditions inside a single predicate.
result = html.xpath('//li[contains(@class,"li") and @name="item"]/a/text()')
print(result)
# ['first item']
from lxml import etree
# Sample HTML fragment for the positional-predicate queries below.
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
html = etree.HTML(text)

# XPath positions are 1-based: [1] is the first <li>.
result = html.xpath('//li[1]/a/text()')
print(result)
# ['first item']

# last() is the position of the final <li>.
result = html.xpath('//li[last()]/a/text()')
print(result)
# ['fifth item']

# position()<3 keeps the <li> elements at positions 1 and 2.
result = html.xpath('//li[position()<3]/a/text()')
print(result)
# ['first item', 'second item']

# last()-2 counts back from the end: the third of the five <li> elements.
result = html.xpath('//li[last()-2]/a/text()')
print(result)
# ['third item']
7、节点轴选择
XPath提供了很多节点轴选择方法,包括获取子元素,兄弟元素,父元素,祖元素等
http://www.w3school.com.cn/xpath/xpath_axes.asp
from lxml import etree
# Sample HTML fragment for the axis queries below; the first <a> wraps its
# text in a <span> so that the descendant axis has something to find.
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html"><span>first item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
'''
html = etree.HTML(text)

# NOTE: the element addresses in the sample outputs below differ on every run.

# ancestor axis with '*': every ancestor of the first <li>.
result = html.xpath('//li[1]/ancestor::*')
print(result)
# [<Element html at 0x2893b88>, <Element body at 0x2893b08>,
#  <Element div at 0x2893ac8>, <Element ul at 0x2893bc8>]

# ancestor axis restricted to <div> ancestors only.
result = html.xpath('//li[1]/ancestor::div')
print(result)
# [<Element div at 0x2893ac8>]

# attribute axis with '*': all attribute values of the first <li>.
result = html.xpath('//li[1]/attribute::*')
print(result)
# ['item-0']

# child axis: direct children only, here the <a> whose href is "link1.html".
result = html.xpath('//li[1]/child::a[@href="link1.html"]')
print(result)
# [<Element a at 0x2893bc8>]

# descendant axis: nodes at any depth below the <li>, here the <span>.
result = html.xpath('//li[1]/descendant::span')
print(result)
# [<Element span at 0x2893ac8>]

# following axis: all nodes after the current node. '*' matches any element,
# but the [2] index keeps only the second following node.
result = html.xpath('//li[1]/following::*[2]')
print(result)
# [<Element a at 0x2893bc8>]

# following-sibling axis: all sibling nodes that come after the current node.
result = html.xpath('//li[1]/following-sibling::*')
print(result)
# [<Element li at 0x2893b08>, <Element li at 0x2893c08>,
#  <Element li at 0x2893c48>, <Element li at 0x2893c88>]