XML介绍
- XML(eXtensible Markup Language) 指可扩展标记语言,被设计用来传输和存储数据
- 标签成对出现
- 区分大小写(大小写敏感)
- 标签要正确嵌套
- 开始部分:XML声明,例如 <?xml version="1.0" encoding="utf-8"?>
XML解析的三种方法:DOM、SAX和ElementTree
DOM解析
- 文档对象模型(Document Object Model,简称DOM)
- 是W3C组织推荐的处理可扩展标记语言的标准编程接口。一个 DOM 的解析器在解析一个XML文档时,一次性读取整个文档,把文档中所有元素保存在内存中的一个树结构里,之后你可以利用DOM 提供的不同的函数来读取或修改文档的内容和结构,也可以把修改过的内容写入XML文件
SAX解析
- SAX是一种基于事件驱动的API,利用SAX解析XML牵涉到两个部分:解析器和事件处理器
- 其中解析器负责读取XML文档,并向事件处理器发送事件,如元素开始和结束事件;而事件处理器则负责对事件作出响应,对传递的XML数据进行处理。Python中使用SAX方式处理XML要先引入xml.sax中的parse函数,还有xml.sax.handler中的ContentHandler。
- 常使用在如下的情况下:一、对大型文件进行处理;二、只需要文件的部分内容,或者只需从文件中得到特定信息;三、想建立自己的对象模型的时候。
ElementTree(元素树解析)
- ElementTree就像一个轻量级的DOM,具有方便友好的API。代码可用性好,速度快,消耗内存少。
注意:
- 因DOM需要将XML数据映射到内存中的树,一是比较慢,二是比较耗内存
- SAX流式读取XML文件,比较快,占用内存少,但需要用户实现回调函数(handler)
- ElementTree速度最快,代码简单,是我们解析XML的首选
代码演示
students.xml
<?xml version="1.0" encoding="utf-8"?>
<student title="student">
<name>孟浩鹏</name>
<age>23</age>
<sex>男</sex>
<score>150</score>
</student>
movies.xml
<?xml version="1.0" encoding="utf-8"?>
<collection shelf="影片推荐">
<movie title="《战狼》">
<type>爱国</type>
<format>MP4</format>
<year>2016</year>
<rating>B</rating>
<stars>8</stars>
<description>超燃枪战大片</description>
</movie>
<movie title="《80后》">
<type>青春</type>
<format>3GP</format>
<year>2003</year>
<rating>A</rating>
<stars>9</stars>
<description>青春爱情故事</description>
</movie>
</collection>
SAX解析XML
SAX解析XML方法一:
#!/usr/bin/python
# -*- coding: UTF-8 -*-
from xml.sax import ContentHandler
from xml.sax import parse
import xml.sax
"""
从行开始,遇到标签之前,存在字符,content的值为这些字符串。
从一个标签,遇到下一个标签之前, 存在字符,content的值为这些字符串。
从一个标签,遇到行结束符之前,存在字符,content的值为这些字符串。
标签可以是开始标签,也可以是结束标签。
startDocument()方法:文档启动的时候调用。
endDocument()方法:解析器到达文档结尾时调用。
startElement(name, attrs)方法:遇到XML开始标签时调用,name是标签的名字,attrs是标签的属性值字典。
endElement(name)方法:遇到XML结束标签时调用。
"""
class MovieHandler(ContentHandler):
    """SAX content handler that prints the movies of a movie collection.

    On every ``<movie>`` start tag the ``title`` attribute is printed. For the
    child elements listed in ``_LABELS`` the element text is stored on the
    instance attribute of the same name and printed when the element closes.
    """

    # Child element name -> label printed in front of its text.
    _LABELS = {
        "type": "Type:",
        "format": "Format:",
        "year": "Year:",
        "rating": "Rating:",
        "stars": "Stars:",
        "description": "Description:",
    }

    def __init__(self):
        super().__init__()
        # Name of the element whose text is currently being collected;
        # "" while between elements.
        self.CurrentData = ""
        self.type = ""
        self.format = ""
        self.year = ""
        self.rating = ""
        self.stars = ""
        self.description = ""

    def startElement(self, tag, attributes):
        """Handle an opening tag.

        Args:
            tag: Element name.
            attributes: Mapping of the element's attributes.

        Raises:
            KeyError: If a ``movie`` element has no ``title`` attribute.
        """
        self.CurrentData = tag
        if tag in self._LABELS:
            # Start from a clean slate so an empty element does not print the
            # text left over from the previous movie.
            setattr(self, tag, "")
        if tag == "movie":
            print("*****Movie*****")
            title = attributes["title"]
            print("Title:", title)

    def endElement(self, tag):
        """Handle a closing tag: print the collected text of a tracked field."""
        label = self._LABELS.get(self.CurrentData)
        if label is not None:
            print(label, getattr(self, self.CurrentData))
        # Cleared so that whitespace between elements is not mistaken for
        # field text by characters().
        self.CurrentData = ""

    def characters(self, content):
        """Collect character data for the currently open tracked field.

        The parser may deliver the text of one element in several chunks, so
        each chunk is appended rather than replacing what was already read.
        """
        if self.CurrentData in self._LABELS:
            collected = getattr(self, self.CurrentData)
            setattr(self, self.CurrentData, collected + content)
if __name__ == "__main__":
    # xml.sax also offers two shortcuts to the explicit steps used below:
    #
    #   xml.sax.parse(xmlfile, contenthandler[, errorhandler])
    #       creates a SAX parser and parses an XML document.
    #       xmlfile: name of the XML file
    #       contenthandler: must be a ContentHandler object
    #       errorhandler: if given, must be a SAX ErrorHandler object
    #
    #   xml.sax.parseString(xmlstring, contenthandler[, errorhandler])
    #       creates a SAX parser and parses an XML string.
    #       xmlstring: the XML string

    # Our ContentHandler subclass receives the parse events.
    handler = MovieHandler()

    # Create a parser object and turn off namespaces.
    parser = xml.sax.make_parser()
    parser.setFeature(xml.sax.handler.feature_namespaces, 0)

    parser.setContentHandler(handler)
    parser.parse("movies.xml")
    # Shortcut form: parse("movies.xml", handler)

    print("over")
SAX解析XML方法二:
from xml.parsers.expat import ParserCreate
class Student:
    """Plain record for one student; every field defaults to None."""

    def __init__(self, name=None, age=None, sex=None, score=None):
        self.name, self.age = name, age
        self.sex, self.score = sex, score

    def __str__(self):
        """Return the four fields rendered into one labelled line."""
        template = "姓名:{0},年龄:{1},性别:{2},成绩:{3}"
        values = (self.name, self.age, self.sex, self.score)
        return template.format(*values)
# Completed Student records; MySaxHandler.end_element appends to this list.
students = []


class MySaxHandler(object):
    """Expat callback holder that builds Student records from parse events.

    The three methods are meant to be assigned to an expat parser's
    StartElementHandler, CharacterDataHandler and EndElementHandler. Every
    event is also echoed to stdout.
    """

    # Element names whose text is copied onto the current student.
    _FIELDS = ("name", "age", "sex", "score")

    def __init__(self):
        self.tag = None        # name of the element currently open, if any
        self.student = None    # record being filled, None outside <student>
        self._text_parts = []  # text chunks seen so far for the open element

    def start_element(self, name, attrs):
        """Record the opened element and start a new Student on ``student``."""
        print('start_element: %s---attrs: %s' % (name, str(attrs)))
        self.tag = name
        self._text_parts = []
        if name == "student":
            self.student = Student()

    def char_data(self, text):
        """Store element text on the matching field of the current student.

        The parser may report the text of one element in several calls, so the
        chunks are joined instead of letting the last one win. Text seen while
        no student is open is echoed but otherwise ignored.
        """
        print('content: %s' % text)
        if self.student is None or self.tag not in self._FIELDS:
            return
        self._text_parts.append(text)
        setattr(self.student, self.tag, "".join(self._text_parts))

    def end_element(self, name):
        """Publish the finished student and forget the open element."""
        print('end_element: %s' % name)
        if name == "student":
            students.append(self.student)
            self.student = None
        # Cleared so whitespace between elements is not stored as field text.
        self.tag = None
# Read the whole document up front; it is handed to expat in a single call.
with open("students.xml", "r", encoding="utf-8") as stu:
    content = stu.read()

handler = MySaxHandler()
parser = ParserCreate()
# Route expat's events to the handler's bound methods.
parser.StartElementHandler = handler.start_element
parser.CharacterDataHandler = handler.char_data
parser.EndElementHandler = handler.end_element
# The second argument marks this as the final chunk of input, so expat
# finishes the document and reports an error if it is incomplete.
parser.Parse(content, True)

for student in students:
    print(student)
ElementTree解析XML
try:
import xml.etree.cElementTree as ET # C实现的(推荐,因为解析更快)
except ImportError:
import xml.etree.ElementTree as ET # Python实现的
# 从Python3.3开始ElementTree模块会自动寻找可用的C库来加快速度
class Movie:
    """Plain record for one movie; every field defaults to None."""

    def __init__(self, title=None, type=None, format=None, year=None, rating=None, description=None, stars=None):
        self.title, self.type, self.format = title, type, format
        self.year, self.rating = year, rating
        self.stars, self.description = stars, description

    def __str__(self):
        """Return all seven fields rendered into one labelled line."""
        template = "片名:{0},类型:{1},格式:{2},年代:{3},评级:{4},星级:{5},描述:{6},"
        values = (
            self.title,
            self.type,
            self.format,
            self.year,
            self.rating,
            self.stars,
            self.description,
        )
        return template.format(*values)
def analysis_xml(xml_name):
    """Parse XML, approach one: build a Movie for every ``<movie>`` element.

    Only ``<movie>`` elements directly below the document root are read.

    Args:
        xml_name: XML source, passed straight to ``ET.parse``.

    Returns:
        A list of Movie objects in document order. A field is None when the
        movie lacks the corresponding ``title`` attribute or child element.
    """

    def _child_text(node, tag):
        # Text of the first `tag` child, or None when there is no such child
        # (a bare node.find(tag).text would raise AttributeError instead).
        child = node.find(tag)
        return None if child is None else child.text

    tree = ET.parse(xml_name)
    movies = []
    for element in tree.findall("movie"):
        movie = Movie()
        movie.title = element.get("title")
        movie.type = _child_text(element, "type")
        movie.format = _child_text(element, "format")
        movie.year = _child_text(element, "year")
        movie.rating = _child_text(element, "rating")
        movie.stars = _child_text(element, "stars")
        movie.description = _child_text(element, "description")
        movies.append(movie)
    return movies
def xml_analysis(xml_name="movies.xml"):
    """Parse XML, approach two: walk the tree from the root and print it.

    Prints the root's tag, its attributes and its ``shelf`` attribute, then,
    for every child of the root, the child's tag and attributes followed by
    the tag and text of each of its own children.

    Args:
        xml_name: XML file to read; defaults to "movies.xml".

    Raises:
        KeyError: If the root element has no ``shelf`` attribute.
    """
    tree = ET.ElementTree(file=xml_name)
    root = tree.getroot()
    print(root.tag)
    print(root.attrib)
    print(root.attrib["shelf"])
    print("*" * 50)
    for child in root:
        print(child.tag)
        print(child.attrib)
        # Iterating an element yields its direct children in document order.
        for grandchild in child:
            print("tag:%s---content:%s" % (grandchild.tag, grandchild.text))
def xml_parse(xml_name="movies.xml"):
    """Parse XML, approach three: select elements with ``iter`` and print them.

    Prints tag and attributes of every ``collection`` element, then of every
    ``movie`` element, then a separator line, then tag and text of every
    element in the tree.

    Args:
        xml_name: XML file to read; defaults to "movies.xml".
    """
    tree = ET.ElementTree(file=xml_name)
    for wanted in ("collection", "movie"):
        for elem in tree.iter(tag=wanted):
            print("tag:%s--attrib:%s" % (elem.tag, elem.attrib))
    print("*" * 50)
    # Without a tag filter, iter() visits every element of the tree.
    for elem in tree.iter():
        print("tag:%s---content:%s" % (elem.tag, elem.text))
if __name__ == "__main__":
    # Demo of approach one: print every movie found in movies.xml.
    # Call xml_analysis() or xml_parse() here instead to try the other two.
    for movie in analysis_xml("movies.xml"):
        print(movie)