0.xml基础
每一个element可以看成:
<tag attributes>text</tag>tail
tag,即标签,用于标识该元素
attributes,即属性,元素具有的属性,可以为空
text,元素包含的文本。可以为字符串、子元素的组合
tail,尾字符串,即元素结束标签之后、下一个标签之前的文本,常用于控制输出文件的格式,一般为\n\t或者\n;希望输出带缩进的格式时,可以对其赋值。
<rank updated="yes">2</rank>
<year>2008</year>
解析为:
tag1:rank
attributes:updated="yes"
text:2
tail:\n\t
tag2:year
attributes:
text:2008
tail:\n\t
1.countries.xml文件
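<?xml version="1.0"?>
<!-- 注:原文的XML标签在提取时丢失,以下内容按Python官方文档的country_data.xml示例重建,国家名称与neighbor节点请对照原始文件确认 -->
<data>
	<country name="Liechtenstein">
		<rank updated="yes">2</rank>
		<year>2008</year>
		<gdppc>141100</gdppc>
		<neighbor name="Austria" direction="E"/>
		<neighbor name="Switzerland" direction="W"/>
	</country>
	<country name="Singapore">
		<rank updated="yes">5</rank>
		<year>2011</year>
		<gdppc>59900</gdppc>
		<neighbor name="Malaysia" direction="N"/>
	</country>
	<country name="Panama">
		<rank updated="yes">69</rank>
		<year>2011</year>
		<gdppc>13600</gdppc>
		<neighbor name="Costa Rica" direction="W"/>
		<neighbor name="Colombia" direction="E"/>
	</country>
</data>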
2.python代码
使用python标准库xml中的etree处理。
2.1 查找和遍历
查找可以使用XPath函数族find*来做。
import xml.etree.ElementTree as ET

# ET.parse returns an ElementTree wrapper around the document, not the root
# element itself; both iter() and findall() are available on it.
root = ET.parse('countries.xml')

# Walk every <country> element in document order.
# ElementTree.getiterator() was removed in Python 3.9; iter() replaces it.
for node in root.iter('country'):
    print("=" * 30)
    if 'name' in node.attrib:
        print(node.tag, '=> name: ', node.attrib['name'])
    else:
        print(node.tag)
    print("-" * 30)
    # Iterating an element yields its direct children
    # (Element.getchildren() was removed in Python 3.9).
    for country_node_child in node:
        print(country_node_child.tag, '=> ', country_node_child.text)

print("~" * 30)
print("Find all countries which have neighbors on the west as ['direction']=='W'")
# The trailing "/.." selects the parent <country> of each matching <neighbor>.
countries = root.findall("./country/neighbor[@direction='W']/..")
for country in countries:
    print("-" * 30)
    if 'name' in country.attrib:
        print(country.tag, '=> name: ', country.attrib['name'])
    else:
        print(country.tag)
    print("-" * 30)
    for country_info in country:
        # Print only the parts (text / attributes) the child actually has.
        if country_info.attrib and country_info.text:
            print(country_info.tag, '=> ', country_info.text, country_info.attrib)
        elif country_info.attrib:
            print(country_info.tag, '=> ', country_info.attrib)
        elif country_info.text:
            print(country_info.tag, '=> ', country_info.text)
        else:
            print(country_info.tag)
2.2 删除
使用要删除节点的父节点来删除。不能用iterator来删除,iterator只是删除迭代子中的引用。
# Banner for the removal step.
print("~" * 30)
print("Remove all countries has neighbor on west as ['direction']=='W'")

# Collect the matching <country> elements first, then detach each of them.
find_countries = tree.findall("./country/neighbor[@direction='W']/..")
for doomed in find_countries:
    print("Remove...")
    show_country(doomed)
    # Removal has to go through the parent element (root_data): dropping an
    # item from an iterator would not detach it from the element tree.
    root_data.remove(doomed)
2.3 添加
将节点加入到父节点来实现节点加入。
# Banner for the insertion step.
print("~" * 30)
print("Add one country...")

one_country = ET.Element("country", {"name": "Panama"})

# Text-carrying children, in document order: (tag, text, attributes).
for child_tag, child_text, child_attrib in (
    ("rank", "69", {"updated": "yes"}),
    ("year", "2011", {}),
    ("gdppc", "13600", {}),
):
    ET.SubElement(one_country, child_tag, child_attrib).text = child_text

# Attribute-only <neighbor> children.
for neighbor_attrib in (
    {"name": "Costa Rica", "direction": "W"},
    {"name": "Colombia", "direction": "E"},
):
    ET.SubElement(one_country, "neighbor", neighbor_attrib)

# Show the new element, then attach it to the document root.
ET.dump(one_country)
root_data.append(one_country)
2.4 输出到文件
输入文件已经Beautify过了,所以读取后每一个元素的tail都包含控制符\n\t或者\n+空格。新加入节点的tail为空,默认输出时不会Beautify处理。
2.4.1 程序控制每一个元素的缩进,将每一个元素的tail设置为合适的空格或者tab数量。代码如下:
print("Add one country...")
one_country = ET.Element("country", {"name": "Panama"})
# tail is the whitespace written after </country>.
one_country.tail = "\n"
# text is what precedes the first child: only the indentation for <rank>.
# (The original also wrote the stray placeholder "asdasd" into the element.)
one_country.text = "\n\t\t"

one_country_rank = ET.SubElement(one_country, "rank", {"updated": "yes"})
one_country_rank.text = "69"
one_country_rank.tail = "\n\t\t"

one_country_year = ET.SubElement(one_country, "year")
one_country_year.text = "2011"
one_country_year.tail = "\n\t\t"

one_country_gdppc = ET.SubElement(one_country, "gdppc")
one_country_gdppc.text = "13600"
one_country_gdppc.tail = "\n\t\t"

west_neighbor = ET.SubElement(one_country, "neighbor", {"name": "Costa Rica", "direction": "W"})
west_neighbor.tail = "\n\t\t"

east_neighbor = ET.SubElement(one_country, "neighbor", {"name": "Colombia", "direction": "E"})
# Last child: one tab less, so the closing </country> lines up with its opening tag.
east_neighbor.tail = "\n\t"

ET.dump(one_country)
root_data.append(one_country)

# Keyword arguments instead of the opaque positional (None, True, None, "xml");
# encoding and default_namespace keep their defaults, as before.
ET.ElementTree(root_data).write("new_countries.xml", xml_declaration=True, method="xml")
2.4.2 首先遍历所有元素,将text和tail中的空白字符去掉(设置为“”),然后在输出时统一控制缩进格式。
代码如下:
def strip_all_tail(root_node):
    """Strip surrounding whitespace from ``text`` and ``tail`` of a whole subtree.

    The element itself and all of its descendants are modified in place.

    Args:
        root_node: The element at the top of the subtree, or ``None``
            (in which case nothing happens).

    Returns:
        None.
    """
    # Compare against None explicitly: the truth value of an Element reflects
    # whether it has children, so ``if root_node:`` skipped childless elements.
    if root_node is None:
        return
    if root_node.text:
        root_node.text = root_node.text.strip()
    if root_node.tail:
        root_node.tail = root_node.tail.strip()
    # Recurse into each direct child; the recursion strips the child itself.
    for child in root_node:
        strip_all_tail(child)
import xml.dom.minidom as md

# Strip the existing indentation BEFORE serializing; otherwise the old
# whitespace is already baked into the string handed to minidom and
# toprettyxml() adds its own indentation on top of it.
strip_all_tail(root_data)
xmlstr = ET.tostring(root_data, encoding="unicode")
newxml = md.parseString(xmlstr)
# toprettyxml() without an encoding returns str and emits a declaration with
# no encoding attribute, so write the file as UTF-8 (the XML default).
with open('new_countries.xml', 'w', encoding='utf-8') as outfile:
    outfile.write(newxml.toprettyxml(indent='\t', newl='\n'))
2.5 完整代码
try:
import xml.etree.cElementTree as ET
except ImportError:
import xml.etree.ElementTree as ET
def show_country(acountry):
    """Print one element followed by a line for each of its direct children.

    The header shows the element's tag and, when present, its ``name``
    attribute. Each child line shows the child's tag plus whichever of its
    text and attributes are non-empty.

    Args:
        acountry: The element to display.
    """
    print("=" * 30)
    if 'name' in acountry.attrib:
        print(acountry.tag, '=> name: ', acountry.attrib['name'])
    else:
        print(acountry.tag)
    print("-" * 30)
    # Iterate the element directly to get its children:
    # Element.getchildren() was removed in Python 3.9.
    for _country_info in acountry:
        if _country_info.attrib and _country_info.text:
            print(_country_info.tag, '=> ', _country_info.text, _country_info.attrib)
        elif _country_info.attrib:
            print(_country_info.tag, '=> ', _country_info.attrib)
        elif _country_info.text:
            print(_country_info.tag, '=> ', _country_info.text)
        else:
            print(_country_info.tag)
def show_countries(country_list):
    """Display each element of *country_list* by passing it to ``show_country``.

    Args:
        country_list: Any iterable of elements; it is consumed once, in order.
    """
    for each_country in iter(country_list):
        show_country(each_country)
tree = ET.parse('countries.xml')
root_data = tree.getroot()  # root element of the parsed document

# Show every <country>. ElementTree.getiterator() was removed in Python 3.9;
# iter() is the replacement.
all_countries = tree.iter('country')
show_countries(all_countries)

print("~" * 30)
print("Find all countries has neighbor on west as ['direction']=='W'")
# The trailing "/.." selects the parent <country> of each matching <neighbor>.
find_countries = tree.findall("./country/neighbor[@direction='W']/..")
show_countries(find_countries)

print("~" * 30)
print("Remove all countries has neighbor on west as ['direction']=='W'")
find_countries = tree.findall("./country/neighbor[@direction='W']/..")
for country in find_countries:
    print("Remove...")
    show_country(country)
    # Removal has to go through the parent element (root_data): dropping an
    # item from the all_countries iterator would not detach it from the tree.
    root_data.remove(country)

print("~" * 30)
print("Remain countries...")
countries = tree.iter('country')
show_countries(countries)

print("~" * 30)
print("Add one country...")
one_country = ET.Element("country", {"name": "Panama"})
one_country_rank = ET.SubElement(one_country, "rank", {"updated": "yes"})
one_country_rank.text = "69"
one_country_year = ET.SubElement(one_country, "year")
one_country_year.text = "2011"
one_country_gdppc = ET.SubElement(one_country, "gdppc")
one_country_gdppc.text = "13600"
# The <neighbor> children carry attributes only, so no reference is kept.
ET.SubElement(one_country, "neighbor", {"name": "Costa Rica", "direction": "W"})
ET.SubElement(one_country, "neighbor", {"name": "Colombia", "direction": "E"})
ET.dump(one_country)
# A new element becomes part of the document once appended to its parent.
root_data.append(one_country)

countries = tree.iter('country')
show_countries(countries)