from lxml import etree
# Sample HTML fragment used for the XPath demonstration. The last <li> has no
# closing tag; the printed result below still shows five <li> elements.
text = ''' <div> <ul>
<li class="item-1"><a>first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul> </div> '''

# Parse the fragment and collect every <li> element in the document.
html = etree.HTML(text)
ret = html.xpath("//li")
# e.g. [<Element li at 0x2d90f08>, <Element li at 0x2d90ee0>, <Element li at 0x2d90eb8>, <Element li at 0x2d90e90>, <Element li at 0x2d90e68>]
print(ret)

# Pitfall being demonstrated: "//@class" is an absolute path, so although it
# is called on each individual <li>, it is evaluated from the document root.
# Every iteration therefore prints the class attributes of ALL <li> elements
# instead of only the current one.
for i in ret:
    ret2 = i.xpath("//@class")
    print(ret2)
结果(每次循环都输出了所有 li 的 class,因为以 `//` 开头的 XPath 是从文档根节点开始匹配,而不是从当前 li 节点开始):
[<Element li at 0x2d90f08>, <Element li at 0x2d90ee0>, <Element li at 0x2d90eb8>, <Element li at 0x2d90e90>, <Element li at 0x2d90e68>]
['item-1', 'item-1', 'item-inactive', 'item-1', 'item-0']
['item-1', 'item-1', 'item-inactive', 'item-1', 'item-0']
['item-1', 'item-1', 'item-inactive', 'item-1', 'item-0']
['item-1', 'item-1', 'item-inactive', 'item-1', 'item-0']
['item-1', 'item-1', 'item-inactive', 'item-1', 'item-0']
理想结果(每次循环只输出当前 li 自己的 class):
[<Element li at 0x3230f30>, <Element li at 0x3230f08>, <Element li at 0x3230ee0>, <Element li at 0x3230eb8>, <Element li at 0x3230e90>]
['item-1']
['item-1']
['item-inactive']
['item-1']
['item-0']
from lxml import etree
# Sample HTML fragment used for the XPath demonstration. The last <li> has no
# closing tag.
text = ''' <div> <ul>
<li class="item-1"><a>first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul> </div> '''

# Parse the fragment and collect every <li> element in the document.
html = etree.HTML(text)
ret = html.xpath("//li")
# e.g. [<Element li at 0x2d90f08>, <Element li at 0x2d90ee0>, <Element li at 0x2d90eb8>, <Element li at 0x2d90e90>, <Element li at 0x2d90e68>]
print(ret)

# Query each <li> relative to itself: the leading "." anchors the path at the
# current element, so only the class attributes found on or under that <li>
# are returned (a bare "//@class" would search the whole document instead).
for i in ret:
    ret2 = i.xpath(".//@class")  # note the "." added in front of "//"
    print(ret2)