python 专题七 HTML XML解析

HTMLParser 解析 Title 和body

from htmlentitydefs import entitydefs 
import HTMLParser 
class TitleParser(HTMLParser.HTMLParser):
    """Collect and print the text found inside ``<title>`` and ``<body>`` tags.

    While one of the handled tags is open, every piece of character data
    (including decoded entity and character references) is appended to
    ``self.data``. When the matching end tag arrives, ``"<tag>:<text>"`` is
    printed and collection stops until the next handled start tag.
    """

    def __init__(self):
        self.taglevels = []
        self.handledtags = ['title', 'body']
        # Name of the handled tag currently being collected, or None.
        self.processing = None
        # Start with an empty buffer so gettitle() works before any handled
        # tag has been seen.
        self.data = ''
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Begin collecting text when a handled tag opens."""
        if tag in self.handledtags:
            self.data = ''
            self.processing = tag

    def handle_data(self, data):
        """Append character data while a handled tag is open."""
        if self.processing:
            self.data += data

    def handle_endtag(self, tag):
        """Print the collected text when the currently handled tag closes."""
        if tag == self.processing:
            # Use this instance's own buffer rather than a module-level
            # parser object, so the class works wherever it is instantiated.
            print(str(tag) + ':' + str(self.gettitle()))
            self.processing = None

    def handle_entityref(self, name):
        """Translate a named entity (``&name;``) into text.

        Unknown entities are passed through verbatim as ``&name;``.
        """
        if name in entitydefs:
            self.handle_data(entitydefs[name])
        else:
            self.handle_data('&' + name + ';')

    def handle_charref(self, name):
        """Translate a numeric character reference into text.

        Accepts decimal (``65``) and hexadecimal (``x41``) forms. Malformed
        references and code points outside 1..255 are ignored.
        """
        try:
            if name[:1] in ('x', 'X'):
                charnum = int(name[1:], 16)
            else:
                charnum = int(name)
        except ValueError:
            return
        if charnum < 1 or charnum > 255:
            return
        self.handle_data(chr(charnum))

    def gettitle(self):
        """Return the text collected for the most recent handled tag."""
        return self.data
# Feed the sample page to the parser; the context manager guarantees the
# file handle is closed even if parsing raises.
with open('test1.html') as fd:
    tp = TitleParser()
    tp.feed(fd.read())

XML解析

"""
解析XML文件
1.Element XML树的节点
2.Text代表文本,包括Element的换行符
3.scanNode为一递归函数,如果当前的节点有子节点,进行递归调用
4.Node的类型
    ELEMENT_NODE = 1
    ATTRIBUTE_NODE = 2
    TEXT_NODE = 3
    CDATA_SECTION_NODE = 4
    ENTITY_REFERENCE_NODE = 5
    ENTITY_NODE = 6
    PROCESSING_INSTRUCTION_NODE = 7
    COMMENT_NODE = 8
    DOCUMENT_NODE = 9
    DOCUMENT_TYPE_NODE = 10
    DOCUMENT_FRAGMENT_NODE = 11
    NOTATION_NODE = 12
"""
from xml.dom import minidom,Node
# NOTE(review): bare attribute access; the value is discarded and the
# statement has no effect.
Node.TEXT_NODE
def scanNode(node,level = 0):
    """Recursively print the class name of ``node`` and of all its descendants.

    Each node is printed on its own line, indented by four spaces per
    nesting level plus one separating space. Element nodes additionally
    show ``,tag`` followed by their tag name.

    node  -- a DOM node (for example the result of ``minidom.parse``).
    level -- current nesting depth; callers normally leave it at 0.
    """
    msg = node.__class__.__name__
    if node.nodeType == Node.ELEMENT_NODE:
        msg += ",tag" + node.tagName
    # Single-argument print gives the same output under the print statement
    # and the print function: indent, one space, then the message.
    print(" " * (level * 4) + " " + msg)
    # hasChildNodes is a method; it must be called, otherwise the bound
    # method object is always truthy and the check is meaningless.
    if node.hasChildNodes():
        for child in node.childNodes:
            scanNode(child, level + 1)

# Parse the sample file and print its node tree, starting from the
# document node returned by minidom.
doc = minidom.parse("JCSample.xml")
scanNode(doc)

使用DOM解析XML

from xml.dom import minidom, Node
import re, textwrap
########################################################################
class SampleScanner:
    """Walk a parsed ``<book>`` document and print selected fields.

    Constructing an instance immediately scans the document: for every
    top-level ``<book>`` element it prints the book title, the author's
    name and affiliation, and for each chapter its number, title and the
    company names mentioned in its paragraphs.
    """
    #----------------------------------------------------------------------
    def __init__(self, doc):
        """Scan ``doc`` (a ``minidom.Document``) and print what is found."""
        assert(isinstance(doc, minidom.Document))
        for child in doc.childNodes:
            if child.nodeType == Node.ELEMENT_NODE and \
               child.tagName == "book":
                self.handle_book(child)

    def handle_book(self, node):
        """Dispatch the direct element children of a ``<book>`` element."""
        for child in node.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.tagName == "title":
                print("Book titile is: %s" % self.gettext(child.childNodes))
            elif child.tagName == "author":
                self.handle_author(child)
            elif child.tagName == "chapter":
                self.handle_chapter(child)

    def handle_chapter(self, node):
        """Print a chapter's number and title, then process its ``<para>`` children."""
        number = node.getAttribute("number")
        print("number: %s" % number)
        # getElementsByTagName searches all descendants, not only direct
        # children of the chapter.
        title_node = node.getElementsByTagName("title")
        print("title: %s" % self.gettext(title_node))

        for child in node.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.tagName == "para":
                self.handle_chapter_para(child)

    def handle_chapter_para(self, node):
        """Print the text of every ``<company>`` element inside a paragraph."""
        company = self.gettext(node.getElementsByTagName("company"))
        print("chapter:para:company %s" % company)

    def handle_author(self, node):
        """Print the author's name and affiliation."""
        for child in node.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.tagName == "name":
                self.handle_author_name(child)
            elif child.tagName == "affiliation":
                print("affiliation: %s" % self.gettext(child.childNodes))

    def handle_author_name(self, node):
        """Print the ``<first>`` and ``<last>`` parts of an author name.

        A part that is absent is printed as an empty string.
        """
        first = ""
        last = ""
        for child in node.childNodes:
            if child.nodeType != Node.ELEMENT_NODE:
                continue
            if child.tagName == "first":
                first = self.gettext(child.childNodes)
            elif child.tagName == 'last':
                last = self.gettext(child.childNodes)

        print("firstname:%s,lastname:%s" % (first, last))

    def gettext(self, nodelist):
        """Return the concatenated text of ``nodelist`` and its descendants.

        Every run of whitespace is collapsed to a single space. Leading and
        trailing whitespace is collapsed too, but not stripped.
        """
        retlist = []
        for node in nodelist:
            if node.nodeType == Node.TEXT_NODE:
                retlist.append(node.wholeText)
            # hasChildNodes is a method and must be called; the bare
            # attribute is always truthy.
            elif node.hasChildNodes():
                retlist.append(self.gettext(node.childNodes))

        # Raw string so that \s is a regex escape, not a string escape.
        return re.sub(r'\s+', " ", ''.join(retlist))
                    
if __name__=="__main__":
    # Parse the sample document; constructing SampleScanner walks it in the
    # constructor, so the instance itself is not used afterwards.
    doc = minidom.parse("simple.xml")
    sample = SampleScanner(doc)
    

XML如下

<?xml version="1.0" ?>
<!--Simple xml document__chapter 8-->
<book>
	<title>
		sample xml thing
	</title>
	<author>
		<name>
			<first>
				ma
			</first>
			<last>
				xiaoju
			</last>
		</name>
		<affiliation>
			Springs Widgets, Inc.
		</affiliation>
	</author>
	<chapter number="1">
		<title>
			First
		</title>
		<para>
			I think widgets are great. You should buy lots of them from
			<company>
				Spirngy Widgts, Inc
			</company>
		</para>
	</chapter>
</book>




  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值