只要 XML 源文件的标签完整、正确,即使内容是下图所示的形式,也可以获取标签中的内容。
代码总结:
from lxml import etree
import xml.etree.ElementTree as ET
class WoFullParser(object):
if __name__ == '__main__':
    # Demo script: two ways to read tag content out of the same XML file.

    # Method 1: lxml + XPath.
    # Read the raw bytes inside a context manager so the file handle is
    # always closed, even if reading fails.
    with open('WO2018172090A1.XML', mode='rb') as xml_file:
        content = xml_file.read()
    # etree.HTML hands the bytes to lxml's HTML parser and returns the
    # root element of the parsed tree.
    selector = etree.HTML(content)
    # Direct text nodes of every <p> element whose id attribute is "p0006".
    p_list = selector.xpath('//p[@id="p0006"]/text()')
    print(p_list)

    # Method 2: the standard-library ElementTree, with three ways to traverse.
    tree = ET.ElementTree(file='WO2018172090A1.XML')
    root = tree.getroot()
    print(root)

    # 2.1 Walk the children of the root's first child element.
    for child_of_root in root[0]:
        print(child_of_root.tag, child_of_root.attrib, child_of_root.text)

    # 2.2 Walk every element of the tree.
    for elem in tree.iter():
        print(elem.tag, elem.attrib, elem.text)
    # Walk only the elements whose tag is "p".
    for elem in tree.iter(tag='p'):
        print(elem.tag, elem.attrib, elem.text)

    # 2.3 Path-based lookup.
    # Every <p> element directly under a <description> element.
    for ele in tree.iterfind('description/p'):
        print(ele.attrib, ele.text)
    # Only the <p> under <description> whose id attribute is "p0276".
    for elem in tree.iterfind('description/p[@id="p0276"]'):
        print(elem.tag, elem.attrib, elem.text)
参考:
https://www.cnblogs.com/deadwood-2016/p/8116863.html