# XPath常见规则
# nodename 选取此节点的所有子节点
# / 从当前节点选取直接子节点
# // 从当前节点选取子孙节点
# . 选取当前节点
# .. 选取当前节点的父节点
# @ 选取属性
# tostring() usage: the serialised result is of type bytes and comes back as
# corrected HTML.
from lxml import etree

# NOTE(review): this sample appears to carry transcription typos ("item-O"
# with a capital letter O, curly quotes around item-1, "<li>" where "</li>"
# is presumably meant, and "<href=...>" instead of "<a href=...>"). It is
# reproduced unchanged because the test.html listing and the outputs recorded
# further down in this file look like they were produced from exactly this
# text -- confirm before cleaning it up.
text = """
<div>
<ul>
<li class="item-O"><a href="link1.html">first item</a><li>
<li class=”item-1”><a href="link2.html">second item</a><li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a><li>
<li class="item-0"><href="link5.html">fifth item</a>
</ul>
</div>
"""

html = etree.HTML(text)
result = etree.tostring(html)
# Decode the bytes so the markup prints as text.
print(str(result, "utf-8"))
# test.html
# <html><body><div>
# <ul>
# <li class="item-O"><a href="link1.html">first item</a></li><li>
# </li><li class="”item-1”"><a href="link2.html">second item</a></li><li>
# </li><li class="item-inactive"><a href="link3.html">third item</a></li>
# <li class="item-1"><a href="link4.html">fourth item</a></li><li>
# </li><li class="item-0"><href>fifth item
# </href></li></ul>
# </div>
# </body></html>
# HTMLParser() usage: parse ./test.html from disk and print the serialised tree.
from lxml import etree

html = etree.parse("./test.html", parser=etree.HTMLParser())
result = etree.tostring(html)
print(str(result, "utf-8"))
# All nodes: "//*" selects every element in the document.
from lxml import etree

html = etree.parse("./test.html", parser=etree.HTMLParser())
result = html.xpath('//*')
print(result)
# Recorded output (the original note breaks off after the third element;
# addresses differ between runs):
# [<Element html at 0x2a083f5bf40>, <Element body at 0x2a09d6881c0>, <Element div at 0x2a09d688f00>, ...]
# Select every <li> element.
from lxml import etree

html = etree.parse('./test.html', parser=etree.HTMLParser())
result = html.xpath("//li")
print(result)     # the whole list of matches
print(result[0])  # only the first match
# Child nodes: <a> elements that are direct children of an <li>.
from lxml import etree

html = etree.parse("./test.html", parser=etree.HTMLParser())
result = html.xpath("//li/a")
print(result)
# Parent node: ".." steps from the matched <a> up to its parent, whose class
# attribute is then read.
from lxml import etree

html = etree.parse("./test.html", parser=etree.HTMLParser())
result = html.xpath('//a[@href="link4.html"]/../@class')
print(result)
# Recorded output:
# ['item-1']
# The parent:: axis is another way to reach the parent node.
from lxml import etree

html = etree.parse("./test.html", parser=etree.HTMLParser())
result = html.xpath('//a[@href="link4.html"]/parent::*/@class')
print(result)
# Recorded output:
# ['item-1']
# Attribute matching: keep only the <li> elements whose class is "item-0".
from lxml import etree

html = etree.parse("./test.html", parser=etree.HTMLParser())
result = html.xpath('//li[@class="item-0"]')
print(result)
# Recorded output (address differs between runs):
# [<Element li at 0x2a8cc315100>]
# Getting text: text() is applied directly to the matched <li>.
# NOTE(review): "/" selects direct children only, so text nested inside a
# child element is not returned by this query -- confirm whether that is the
# intended demonstration.
from lxml import etree

html = etree.parse("./test.html", parser=etree.HTMLParser())
result = html.xpath('//li[@class="item-0"]/text()')
print(result)
# Getting attributes: "@href" yields the href value of every <a> under an <li>.
from lxml import etree

html = etree.parse("./test.html", parser=etree.HTMLParser())
result = html.xpath("//li/a/@href")
print(result)
# Recorded output:
# ['link1.html', 'link2.html', 'link3.html', 'link4.html']
# Attribute with multiple values: @class="li" compares against the whole
# attribute value ("li li-first"), so the exact match below finds nothing.
from lxml import etree
# The sample element is now closed with </li>; it previously ended in a
# second opening <li>.
text = '''
<li class="li li-first"><a href="link.html">first item</a></li>
'''
html = etree.HTML(text)
result = html.xpath('//li[@class="li"]/a/text()')
print(result)
# Recorded output:
# []
# contains() usage: match when the class attribute contains "li", which works
# for an attribute that holds several values.
from lxml import etree
# The sample element is now closed with </li>; it previously ended in a
# second opening <li>.
text = '''
<li class="li li-first"><a href="link.html">first item</a></li>
'''
html = etree.HTML(text)
result = html.xpath('//li[contains(@class,"li")]/a/text()')
print(result)
# Recorded output:
# ['first item']
# Matching on several attributes: both conditions are joined with "and".
from lxml import etree

text = """
<li class="li li-first" name="item"><a href="link.html">first item</a></li>
"""

html = etree.HTML(text)
result = html.xpath('//li[contains(@class,"li") and @name="item"]/a/text()')
print(result)
# Recorded output:
# ['first item']
# Selecting by position. The recorded output of each query is noted beside it.
from lxml import etree

text = """
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
"""

html = etree.HTML(text)

# First <li>.
result = html.xpath("//li[1]/a/text()")
print(result)  # ['first item']

# Last <li>.
result = html.xpath("//li[last()]/a/text()")
print(result)  # ['fifth item']

# <li> elements whose position is below 3.
result = html.xpath("//li[position()<3]/a/text()")
print(result)  # ['first item', 'second item']

# Third <li> counted from the end.
result = html.xpath("//li[last()-2]/a/text()")
print(result)  # ['third item']
# Selecting along node axes. Recorded outputs follow each query; element
# addresses differ between runs.
from lxml import etree

text = """
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
"""

html = etree.HTML(text)

# ancestor::* -- every ancestor of the first <li>.
result = html.xpath('//li[1]/ancestor::*')
print(result)
# [<Element html at 0x14fab77f6c0>, <Element body at 0x14fc4b7e7c0>, <Element div at 0x14fc4b7e780>, <Element ul at 0x14fc4b98400>]

# ancestor::div -- only the <div> ancestor.
result = html.xpath('//li[1]/ancestor::div')
print(result)
# [<Element div at 0x14fc4b7e780>]

# attribute::* -- all attribute values of the first <li>.
result = html.xpath('//li[1]/attribute::*')
print(result)
# ['item-0']

# child:: -- direct <a> children whose href is link1.html.
result = html.xpath('//li[1]/child::a[@href="link1.html"]')
print(result)
# [<Element a at 0x14fc4b7f200>]

# descendant::span -- <span> descendants; the recorded result is empty.
result = html.xpath('//li[1]/descendant::span')
print(result)
# []

# following::*[2] -- the second element on the following axis of the first <li>.
result = html.xpath('//li[1]/following::*[2]')
print(result)
# [<Element a at 0x14fc4b7e7c0>]

# following-sibling::* -- all later siblings of the first <li>.
result = html.xpath('//li[1]/following-sibling::*')
print(result)
# [<Element li at 0x14fc4b7e980>, <Element li at 0x14fc4b98400>, <Element li at 0x14fc4b98600>, <Element li at 0x14fc4b985c0>]
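# I. XPath
# Common rules
# nodename  selects all child nodes of the current node
# /         selects direct children of the current node
# //        selects descendants of the current node
# .         selects the current node
# ..        selects the parent of the current node
# @         selects attributes
# The code is as follows: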
from lxml import etree

text = "***"  # presumably a placeholder for the HTML source
html = etree.HTML(text)  # initialise: build a tree from the string
# tostring() gives back the completed HTML (with the standard html/body nodes).
result = etree.tostring(html)
print(str(result, "utf-8"))
# Equivalent approach: parse the file directly.
from lxml import etree
html = etree.parse('./test.html', etree.HTMLParser())
result = etree.tostring(html)
# Quick reference for the queries shown earlier in this file. The attribute
# and text queries use "//li" like the runnable examples; a single leading
# "/" would start at the document root.
# All nodes:                 result = html.xpath('//*')
# All li nodes:              result = html.xpath('//li')
# Child nodes:               result = html.xpath('//li/a')
# Class of the parent node:  html.xpath('//li/a/../@class')
#   equivalent to:           html.xpath('//li/a/parent::*/@class')
# Attribute matching:        html.xpath('//li[@class="***"]')
# Getting an attribute:      html.xpath('//li/a/@href')
# Getting text:              html.xpath('//li[@class="***"]/text()')
print(result.decode('utf-8'))
# Multi-valued attribute matching with contains().
# NOTE(review): `html` is not rebuilt from `text` in this snippet, so the
# calls below query whatever tree `html` already refers to.
text="""
<li class='li li-first'>***</li>
"""
html.xpath('//li[contains(@class,"li")]/a/text()') # multi-valued match
# The inner attribute value uses double quotes so it does not end the
# single-quoted Python string early.
html.xpath('//li[contains(@class,"li") and @name="item"]/a/text()') # multi-attribute match
# Selecting by position.
# First value.
html.xpath("//li[1]/a/text()")
# Last value.
html.xpath("//li[last()]/a/text()")
# Position below 3, i.e. 1 or 2.
html.xpath("//li[position()<3]/a/text()")
# Third from the end.
html.xpath("//li[last()-2]/a/text()")
# Code excerpted from the book 《Python 3网络爬虫开发实战》.