xml文档
<?xml version="1.0"?>
<data>
<customer name="小明" >
<email>xm@gmail.com</email>
<phone>555-1234</phone>
</customer>
<customer name="小王" >
<email>xw@gmail.com</email>
</customer>
<customer name="小爱" >
<email>xa@gmail.com</email>
<phone>555-4567</phone>
</customer>
<customer name="大卫" >
<phone>555-6472</phone>
<address>
<street>Fifth Avenue</street>
</address>
</customer>
</data>
Read and parse the XML file
# xml.etree.cElementTree was deprecated in Python 3.3 and removed in 3.9.
# xml.etree.ElementTree picks up the C accelerator automatically, so fall
# back to it whenever the old alias is unavailable.
try:
    import xml.etree.cElementTree as et
except ImportError:
    import xml.etree.ElementTree as et

# Parse demo.xml into an ElementTree object; raises if the file is missing
# or is not well-formed XML.
parsedXML = et.parse('demo.xml')
node.attrib.get('key'):获取标签属性(键值对)中指定键对应的值,键不存在时返回 None
node.find('xxx'):查找第一个匹配的子标签,未找到时返回 None;node.find('xxx').text 返回该子标签的文本内容
# Visit each direct child of the root element and pull out its fields.
for node in parsedXML.getroot():
    # The customer's name is stored as an attribute on the element itself.
    name = node.attrib.get('name')
    # find() returns the first matching sub-element, or None when absent;
    # 'address/street' is a path that descends through <address>.
    email, phone, street = (
        node.find(path) for path in ('email', 'phone', 'address/street')
    )
Full script
try:
    import xml.etree.cElementTree as et
except ImportError:  # cElementTree was removed in Python 3.9
    import xml.etree.ElementTree as et

import pandas as pd
def getvalueofnode(node):
    """Return the ``text`` of ``node``, or ``None`` when ``node`` is ``None``."""
    # Compare against None explicitly: an Element without children is
    # falsy, so a plain truthiness test would discard valid leaf nodes.
    if node is None:
        return None
    return node.text
def main():
    """Parse ``demo.xml`` and print its records as a pandas DataFrame.

    Every direct child of the document root becomes one row with the
    columns ``name`` (the element's ``name`` attribute), ``email``,
    ``phone`` and ``street`` (the text of the ``email``, ``phone`` and
    ``address/street`` sub-elements). A missing attribute or sub-element
    is stored as ``None``.
    """
    parsed_xml = et.parse("demo.xml")
    dfcols = ['name', 'email', 'phone', 'street']

    # Gather all rows first and build the frame once. DataFrame.append was
    # removed in pandas 2.0, and it copied the entire frame on every call.
    rows = [
        [
            node.attrib.get('name'),
            getvalueofnode(node.find('email')),
            getvalueofnode(node.find('phone')),
            getvalueofnode(node.find('address/street')),
        ]
        for node in parsed_xml.getroot()
    ]

    # dtype=object keeps missing values as None (as the append-based
    # version did) instead of letting pandas infer a string/NaN dtype.
    df_xml = pd.DataFrame(rows, columns=dfcols, dtype=object)
    print(df_xml)
# Run only when executed as a script, so importing this module does not
# trigger file I/O and printing as a side effect.
if __name__ == "__main__":
    main()
name email phone street
0 小明 xm@gmail.com 555-1234 None
1 小王 xw@gmail.com None None
2 小爱 xa@gmail.com 555-4567 None
3 大卫 None 555-6472 Fifth Avenue