XPath实例
lxml 在解析时可以自动修复残缺的 HTML 源码(例如补全未闭合的标签)
from lxml import etree

# Sample HTML fragment. Note that the last <li> has no closing tag and there
# is no <html>/<body> wrapper.
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html"><span>first item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">first item</a></li>
<li class="item-0"><a href="link5.html">first item</a>
</ul>
</div>
'''
# Parse the fragment with lxml's HTML parser and get the root element.
html = etree.HTML(text)
# Serialise the parsed tree back to markup; the result is bytes, so decode
# it before printing to inspect what the parser produced.
result = etree.tostring(html)
print(result.decode('utf-8'))
选取节点
from lxml import etree

# Sample HTML fragment used as the query target.
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html"><span>first item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">first item</a></li>
<li class="item-0"><a href="link5.html">first item</a>
</ul>
</div>
'''
html = etree.HTML(text)
# '//' on its own is not a complete XPath expression; '//*' selects every
# element node in the document.
result = html.xpath('//*')
for node in result:
    print(node)
使用@过滤选择节点
from lxml import etree

# Sample HTML fragment used as the query target.
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html"><span>first item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">first item</a></li>
<li class="item-0"><a href="link5.html">first item</a>
</ul>
</div>
'''
html = etree.HTML(text)
# The [@class="item-0"] predicate keeps only <li> elements whose class
# attribute is exactly "item-0".
result = html.xpath('//li[@class="item-0"]')
for node in result:
    print(node)
获取文本
from lxml import etree

# Sample HTML fragment used as the query target.
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html"><span>first item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">first item</a></li>
<li class="item-0"><a href="link5.html">first item</a>
</ul>
</div>
'''
html = etree.HTML(text)
# text() returns the text nodes that are direct children of each matched
# <a>. Text nested deeper (e.g. inside the <span> of the first item) is not
# a direct child and is therefore not returned by this expression.
result = html.xpath('//li[@class="item-0"]/a/text()')
for value in result:
    print(value)
属性获取@
from lxml import etree

# Sample HTML fragment used as the query target.
text = '''
<div>
<ul>
<li class="item-0"><a href="link1.html"><span>first item</span></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">first item</a></li>
<li class="item-0"><a href="link5.html">first item</a>
</ul>
</div>
'''
html = etree.HTML(text)
# A trailing @href step (outside any predicate) returns the attribute
# values themselves rather than the elements that carry them.
result = html.xpath('//li/a/@href')
for href in result:
    print(href)
属性多值匹配
from lxml import etree

# Sample HTML fragment; the first <li> carries several space-separated
# class values plus a name attribute.
text = '''
<div>
<ul>
<li class="item-0 li li-first" name="item"><a href="link1.html">first item></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">first item</a></li>
<li class="item-0"><a href="link5.html">first item</a>
</ul>
</div>
'''
html = etree.HTML(text)
# contains() does a substring test on the class attribute, so it matches
# even when class holds multiple values; 'and' additionally requires
# name="item". The final step must be text() (a node test) -- a bare 'text'
# would look for child elements named <text> and return nothing.
result = html.xpath('//li[contains(@class,"li") and @name="item"]/a/text()')
for value in result:
    print(value)
按序选择
from lxml import etree

# Sample HTML fragment used as the query target.
text = '''
<div>
<ul>
<li class="item-0 li li-first" name="item"><a href="link1.html">first item></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">first item</a></li>
<li class="item-0"><a href="link5.html">first item</a>
</ul>
</div>
'''
html = etree.HTML(text)
# The original snippet never assigned `result`. Select by position with a
# numeric predicate: XPath positions are 1-based, so li[1] is the first <li>
# among its siblings. Other positional predicates that can be substituted
# here include li[last()], li[position()<3] and li[last()-2].
result = html.xpath('//li[1]/a/text()')
for value in result:
    print(value)
轴选择
from lxml import etree

# Sample HTML fragment used as the query target.
text = '''
<div>
<ul>
<li class="item-0 li li-first" name="item"><a href="link1.html">first item></a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">first item</a></li>
<li class="item-0"><a href="link5.html">first item</a>
</ul>
</div>
'''
html = etree.HTML(text)
# The path must start with '//' so the first <li> is searched for anywhere
# in the document; a relative 'li[1]' would only look at direct children of
# the root element and match nothing. ancestor::* then walks up from that
# <li> and returns all of its ancestor elements.
result = html.xpath('//li[1]/ancestor::*')
for node in result:
    print(node)