from lxml import etree
# Sample XML document used by the XPath examples below: a <library> root
# holding three <book> records, each with <title> (carrying a `lang`
# attribute), <author>, <pages> and <price> children.
# NOTE(review): the third title uses lang="ch"; the ISO 639-1 code for
# Chinese is "zh" — confirm whether "ch" is intentional.
xml_string = """
<library>
<book>
<title lang="en">Harry Potter and the Philosopher's Stone</title>
<author>J. K. Rowling</author>
<pages>223</pages>
<price>7.99</price>
</book>
<book>
<title lang="en">The Hobbit</title>
<author>J. R. R. Tolkien</author>
<pages>304</pages>
<price>8.99</price>
</book>
<book>
<title lang="ch">土豆最好吃</title>
<author>J. R. R. Tolkien</author>
<pages>304</pages>
<price>8.99</price>
</book>
</library>
"""
# Parse the XML string into an lxml element; `root` is the <library> element.
root = etree.fromstring(xml_string)

# Use an XPath expression to collect every <book> element in the document.
books = root.xpath('//book')
print(books)

for book in books:
    # The paths below are relative to the current <book>. Index [0] takes the
    # first text node of each child; every book in the sample data has
    # exactly one title, author, pages and price element.
    title = book.xpath('title/text()')[0]
    author = book.xpath('author/text()')[0]
    pages = book.xpath('pages/text()')[0]
    price = book.xpath('price/text()')[0]

    # Print the extracted fields, then a separator line between books.
    print(f"Title: {title}")
    print(f"Author: {author}")
    print(f"Pages: {pages}")
    print(f"Price: {price}")
    print("---")
# XPath predicates can also filter on attributes: select the text of every
# <title> element whose `lang` attribute equals "en".
english_titles = root.xpath('//title[@lang="en"]/text()')
for title in english_titles:
    print(f"English Title: {title}")
输出(第一行中 Element 对象的内存地址每次运行都会不同):
[<Element book at 0x21badf5a980>, <Element book at 0x21badf5af40>, <Element book at 0x21badf5b080>]
Title: Harry Potter and the Philosopher's Stone
Author: J. K. Rowling
Pages: 223
Price: 7.99
---
Title: The Hobbit
Author: J. R. R. Tolkien
Pages: 304
Price: 8.99
---
Title: 土豆最好吃
Author: J. R. R. Tolkien
Pages: 304
Price: 8.99
---
English Title: Harry Potter and the Philosopher's Stone
English Title: The Hobbit