XML 指可扩展标记语言(eXtensible Markup Language),被设计用来传输和存储数据。
注意:
1.有且只有一个根元素
2.标签必须有关闭标签
3.属性值须加引号
4.标签名区分大小写(例如 <Name> 与 <name> 是不同的标签)
5.标签名不能有空格,不能以数字或标点符号开头(下划线"_"除外,可以作为首字符)
6.不能以xml(或XML、或Xml 等)开头,名称中间不能包含冒号(:)
7.空格/回车/制表符在xml中都是文本节点
8.必须正确地嵌套
python中创建一个 .xml文件,
#格式严格,不要随便删除任意字符包括空格
<?xml version="1.0" encoding="UTF-8" ?> <!DOCTYPE students [ <!ELEMENT students (student+)> <!ELEMENT student (name,age,sex)> <!ELEMENT name (#PCDATA)> <!ELEMENT age (#PCDATA)> <!ELEMENT sex (#PCDATA)> <!ATTLIST student id CDATA #REQUIRED> ]> <students> <student id="1"> <name>张三</name> <age>19</age> <sex>男</sex> </student> <student id="2"> <name>韩梅梅</name> <age>18</age> <sex>女</sex> </student> <student id="3"> <name>欧阳俊杰</name> <age>21</age> <sex>男</sex> </student> </students>
三种解析方式:
dom
from xml.dom.minidom import parse


class Student:
    """One <student> record: its id attribute plus name, age and sex text."""

    def __init__(self, id=None, name=None, age=None, sex=None):
        # `id` shadows the builtin, but the keyword name is kept so that
        # existing Student(id=...) calls keep working.
        self.id = id
        self.name = name
        self.age = age
        self.sex = sex

    def __repr__(self):
        """Return the record as one tab-separated line: id, name, age, sex.

        A name whose UTF-8 encoding is at most 8 bytes is followed by an
        extra tab; without it the age column does not line up when names
        have different lengths.  str.ljust() is not used for this because
        CJK characters are wider than the spaces it pads with.
        Fields that are None are rendered as empty strings.
        """
        columns = ["" if value is None else str(value)
                   for value in (self.id, self.name, self.age, self.sex)]
        if len(columns[1].encode("utf-8")) <= 8:
            # An empty column yields the extra tab after a short name.
            columns.insert(2, "")
        return "\t".join(columns)


def _child_text(node, tag):
    """Return the text of the first <tag> element under node, or ""."""
    matches = node.getElementsByTagName(tag)
    if not matches:
        return ""
    # Join every text/CDATA child so an empty element gives "" instead of
    # raising IndexError on childNodes[0].
    return "".join(
        child.data
        for child in matches[0].childNodes
        if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE)
    )


def read_students(root):
    """Build a Student for every <student> element below root.

    Args:
        root: a minidom element, normally the document element.

    Returns:
        A list holding one Student per <student> element, in document order.
    """
    # The list is created once, outside the loop, so every record is kept.
    return [
        Student(
            node.getAttribute("id"),
            _child_text(node, "name"),
            _child_text(node, "age"),
            _child_text(node, "sex"),
        )
        for node in root.getElementsByTagName("student")
    ]


def main():
    """Parse XML.xml from the working directory and print what was read."""
    stus = parse("XML.xml")  # "XML.xml" is the name of the XML file
    print(stus)
    root = stus.documentElement
    print(root)
    print(read_students(root))


if __name__ == "__main__":
    main()
sax
from xml.sax import parse
from xml.sax.handler import ContentHandler


class Student:
    """One <student> record: its id attribute plus name, age and sex text."""

    def __init__(self, id=None, name=None, age=None, sex=None):
        # `id` shadows the builtin, but the keyword name is kept so that
        # existing Student(id=...) calls keep working.
        self.id = id
        self.name = name
        self.age = age
        self.sex = sex

    def __repr__(self):
        """Return the record as one tab-separated line: id, name, age, sex.

        A name whose UTF-8 encoding is at most 8 bytes is followed by an
        extra tab so the age column lines up for names of different
        lengths.  Fields that are None are rendered as empty strings.
        """
        columns = ["" if value is None else str(value)
                   for value in (self.id, self.name, self.age, self.sex)]
        if len(columns[1].encode("utf-8")) <= 8:
            # An empty column yields the extra tab after a short name.
            columns.insert(2, "")
        return "\t".join(columns)


class saxParse(ContentHandler):
    """SAX handler that turns <student> elements into Student objects.

    Finished records are appended to ``self.students``.

    Args:
        name: initial value of ``self.name``, the tag currently being read.
        students: optional list to append records to; a new list is used
            when omitted.
    """

    # Child elements of <student> whose text is copied onto the record.
    _FIELDS = ("name", "age", "sex")

    def __init__(self, name=None, students=None):
        super().__init__()
        self.name = name
        self.stu = None
        self.students = [] if students is None else students
        self._chunks = []

    def startElement(self, name, attrs):
        """Start a new record on <student> and remember the current tag."""
        if name == "student":
            self.stu = Student()
            self.stu.id = attrs.get("id", "")
        self.name = name
        self._chunks = []

    def characters(self, content):
        """Collect text belonging to the current name/age/sex element.

        The parser may report one run of text in several calls, so the
        pieces are buffered here and joined in endElement instead of being
        assigned directly, which would keep only the last piece.
        """
        if self.stu is not None and self.name in self._FIELDS:
            self._chunks.append(content)

    def endElement(self, name):
        """Store the buffered text, or file the record on </student>."""
        if name == "student":
            if self.stu is not None:
                self.students.append(self.stu)
            self.stu = None
        elif name in self._FIELDS and self.stu is not None:
            setattr(self.stu, name, "".join(self._chunks))
        # Text between elements (indentation, newlines) is not field data.
        self.name = None
        self._chunks = []


def main():
    """Parse XML.xml from the working directory and print each student."""
    handler = saxParse()
    parse("XML.xml", handler)
    for stu in handler.students:
        print(stu)


if __name__ == "__main__":
    main()
element tree
# xml.etree.cElementTree was removed in Python 3.9; ElementTree itself uses
# the C accelerator when one is available, so no fallback import is needed.
import xml.etree.ElementTree as ET


class Student:
    """One <student> record: its id attribute plus name, age and sex text."""

    def __init__(self, id=None, name=None, age=None, sex=None):
        # `id` shadows the builtin, but the keyword name is kept so that
        # existing Student(id=...) calls keep working.
        self.id = id
        self.name = name
        self.age = age
        self.sex = sex

    def __repr__(self):
        """Return the record as one tab-separated line: id, name, age, sex.

        A name whose UTF-8 encoding is at most 8 bytes is followed by an
        extra tab so the age column lines up for names of different
        lengths.  Fields that are None are rendered as empty strings.
        """
        columns = ["" if value is None else str(value)
                   for value in (self.id, self.name, self.age, self.sex)]
        if len(columns[1].encode("utf-8")) <= 8:
            # An empty column yields the extra tab after a short name.
            columns.insert(2, "")
        return "\t".join(columns)


def load_students(source="XML.xml"):
    """Parse an XML document and return its students.

    Args:
        source: file name or file object accepted by ET.parse().

    Returns:
        A list with one Student per <student> child of the root element,
        in document order.
    """
    tree = ET.parse(source)
    # Element.getchildren() was removed in Python 3.9; looking the fields up
    # by tag name also avoids depending on the order of the child elements.
    return [
        Student(
            stu.get("id"),
            stu.findtext("name"),
            stu.findtext("age"),
            stu.findtext("sex"),
        )
        for stu in tree.findall("student")
    ]


if __name__ == "__main__":
    for stu in load_students():
        print(stu)
sax:基于事件的流式解析,边读边处理(按顺序读取,不把整个文档加载到内存),不能增删改
dom:把整个文档加载到内存中,翻译成一棵树,就可以进行crud操作
element tree:最为简单