# -*- coding: utf-8 -*-
"""
Created on Thu Apr 16 23:18:27 2015
@author: shifeng
"""
'''
功能:解析CDR_sample.xml文件,输出格式为DNorm接收的格式,并将训练集的“label”写入到文档中
xml文件:见CSDN资源共享
参考博客:http://www.cnblogs.com/fnng/p/3581433.html
'''
import codecs
import StringIO
import xml
from lxml import etree
from xml.sax import *
from xml.sax.handler import *
from xml.etree import ElementTree as ET
import xml.dom.minidom
# Parse the sample corpus with minidom. In the visible part of this file,
# `dom` and `root` are referenced only by the disabled "method one" snippet
# kept in the string literal below; the active extraction further down builds
# its own ElementTree from the same file.
dom = xml.dom.minidom.parse("CDR_sample.xml")
root = dom.documentElement
# Disabled debugging prints for inspecting the root node:
#print root.nodeName
#print root.nodeValue
#print root.nodeType
#print root.ELEMENT_NODE
#-----------
'''
方法一(未采纳):
#知道元素名字的子元素,使用getElementsByTagName方法获取
#colloction为根节点,有四个元素,知道其名,通过root.getElementsByTagName(i)便能取出其子元素
colloction_ele = ["source", "date", "key", "document"]
for i in colloction_ele:
print root.getElementsByTagName(i)[0].nodeName #获取标签名字
# print root.getElementsByTagName(i)[0].getAttribute
#documents有三个标签
document_ele = ["id", "passage", "annotation"]
documents = root.getElementsByTagName("document")
#print len(documents)
for i in documents: #对每个文档,
for j in document_ele: #取出每个标签
print i.getElementsByTagName(j)[0].nodeName #获取标签名字
print i.getElementsByTagName(j)[0].firstChild.data #获取标签之间的数据
if j == "annotation":
print i.getElementsByTagName(j)[0].getAttribute("id") #获取标签属性
'''
#-----------
# Extract every <document> of CDR_sample.xml into train_text.txt, one line per
# document: "<id>\t<passage texts concatenated>\n" (the file header describes
# this as the input format expected by DNorm). Progress and annotation details
# are echoed to stdout.
#
# NOTE(review): the original indentation of this script was lost; the nesting
# below is reconstructed from the syntax and the inline comments. In
# particular the 30-star separator is assumed to be printed once per document.
#
# The print calls use a single pre-formatted argument so the output is the
# same on Python 2 (which this file targets) and the code also parses on
# Python 3.


def _print_annotation(annotation):
    """Print the details of one passage-level <annotation> element.

    Echoes the <location> offset/length, the mention <text>, and every
    <infon> whose ``key`` attribute is not ``"type"``.
    """
    for field in annotation:
        tag = field.tag
        if tag == "location":
            print("%s %s" % (field.attrib["offset"], field.attrib["length"]))
        elif tag == "text":
            print(field.text)
        elif tag == "infon" and field.attrib["key"] != "type":
            # Each annotation carries several infons; the "type" one is
            # skipped and the remaining ones are printed as "key value".
            print("%s %s" % (field.attrib["key"], field.text))


def _write_passage(passage, out):
    """Print the children of one <passage> and append its text to *out*.

    The passage text is written with no separator, so the texts of all
    passages of a document end up back to back on the same output line.
    """
    for field in passage:
        tag = field.tag
        if tag == "offset":
            print("offset: %d" % int(field.text))
        elif tag == "text":
            text = field.text
            print("%s :: %s" % (tag, text))
            # An empty <text/> element has .text == None; write nothing
            # instead of raising TypeError.
            out.write(text or "")
        elif tag == "annotation":
            print(10 * "*")
            _print_annotation(field)


def _write_document(document, out):
    """Write one <document> as a single "id<TAB>text" line to *out*."""
    for child in document:
        tag = child.tag
        if tag == "id":
            text_id = child.text
            print("%s : %s" % (tag, text_id))
            # Guard against an empty <id/> element (.text is None).
            out.write((text_id or "") + "\t")
        elif tag == "passage":
            _write_passage(child, out)
        # Document-level <annotation> children are intentionally ignored,
        # as in the original script (it only bound them to an unused name).
    # Terminate the line once the whole document has been walked.
    out.write("\n")
    print(30 * "*")


# Parse first so that a missing or malformed input does not truncate an
# existing train_text.txt.
root_2 = ET.parse("CDR_sample.xml")
documents = root_2.findall("./document")

# codecs.open encodes on write, so non-ASCII ids/passages (which ElementTree
# returns as unicode on Python 2) no longer raise UnicodeEncodeError; pure
# ASCII content produces the same bytes as before. The with-block closes the
# file even if a document fails to process.
with codecs.open("train_text.txt", "w", encoding="utf-8") as write_text:
    for per in documents:
        _write_document(per, write_text)
#“label”对照待续....
'''
doc = etree.parse("CDR_sample.xml")
xml_string = etree.tostring(doc)
root = etree.fromstring(xml_string)
parser = make_parser()
# MarkDecodeHandler
# MarkDecodeHandler
handler = UserDecodeHandler()
parser.setContentHandle(handler)
parser.parse(root)
for item in handler.marks:
for j in item.items():
print i,j
print type(doc)
print type(root)
# print doc.tag
print root.tag
# with codecs.open("CDR_sample.xml") as xml:
# text = xml.readlines()
# s_xml = ""
# for i in text:
# i=i.strip("\n")
# s_xml+=i
# print s_xml
# soup = BeautifulSoup(s_xml)
# print soup.title
# for i in text:
# print i
'''
# Python XML parsing example
# (web-page residue: "latest recommended article published 2022-08-02 17:44:35")