"""XPath basics with lxml: query a small bookstore document parsed as HTML."""
from lxml import etree

# Keep the leading newline in this literal: lxml is documented to reject
# str input that begins directly with an XML declaration naming an encoding
# (ValueError) -- TODO confirm against the installed lxml version.
s = """
<?xml version="1.0" encoding="UTF-8"?>
<bookstore>
<book>
<title lang="eng">Harry Potter</title>
<price>29.99</price>
</book>
<book>
<title lang="eng">Learning XML</title>
<price>39.95</price>
</book>
</bookstore>
"""

# Parse the string as HTML; the parser automatically wraps the content in
# <html><body> ... </body></html>, which is why the absolute paths below
# start with /html/body.
html = etree.HTML(s)

# A leading "/" selects starting from the root node.
data1 = html.xpath('/html')
for root in data1:
    print(root)

# "//" selects descendants: the node named after it may sit anywhere in
# the document.
data2 = html.xpath('//title')
for title in data2:
    print(title)

# Spell out the full absolute path to the target nodes.
data3 = html.xpath('/html/body/bookstore/book/price')
for price in data3:
    print(price.text)

# Iterate over the values of an attribute ("@name" selects attributes).
data4 = html.xpath('/html/body/bookstore/book/title/@lang')
for lang in data4:
    print(lang)

# "." is the current node and ".." is the parent node; a relative query can
# be issued from any element returned by an earlier query.
data5 = html.xpath('/html/body/bookstore/book/title')
for title in data5:
    langs = title.xpath('./@lang')
    print(langs)

# Select one specific node by position: [1] is the first match, [last()]
# the final one.
data6 = html.xpath('/html/body/bookstore/book')
for book in data6:
    ti_1 = book.xpath('./title[1]')
    print(ti_1)
    ti_last = book.xpath('./title[last()]')
    print(ti_last)

# Select tags that carry a given attribute value.
data7 = html.xpath('//book/title[@lang="eng"]')
for title in data7:
    print(title.text)

# "*" is a wildcard that matches any element node.
data8 = html.xpath('//book/*')
for child in data8:
    print(child)
    # Every lxml element exposes .text (None when it has no leading text),
    # so no hasattr() guard is needed before reading it.
    print(child.text)

# Second example: text()/attribute queries on list markup, first from an
# in-memory string and then from an HTML file on disk.
from lxml import etree

doc = '''
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a> </li>
</ul>
</div>
'''

# Parse the string into an HTML tree; the missing <html> and <body> wrappers
# are filled in by the parser.
html = etree.HTML(doc)

# text() returns the text content of the matched <a> tags directly, as a
# list of strings.
data2 = html.xpath('//li/a/text()')
print(data2)

# Parse an HTML file from disk with an explicit UTF-8 HTML parser.
# NOTE(review): 123.htm must exist in the working directory; the queries
# below presumably expect it to hold list markup like `doc` -- verify.
parser = etree.HTMLParser(encoding='utf-8')
html_file = etree.parse('123.htm', parser=parser)

# "//*" matches every element node in the document.
data3 = html_file.xpath('//*')
for node in data3:
    # Elements always expose .text (None when there is no leading text),
    # so it can be read without a hasattr() guard.
    print(node.text)

# Text of the <a> tag whose href attribute has a specific value.
data4 = html_file.xpath('//li/a[@href="link2.html"]/text()')
print(data4)

# Values of an attribute across all matches.
data5 = html_file.xpath('//li/a/@href')
print(data5)

# Predicate: restrict the match to the last <li> among its siblings.
data6 = html_file.xpath('//li[last()]/a/text()')
print(data6)