python下用HTMLParser分析网页方法

http://www.cnzzad.com/outtut/35897.html


HTMLParser是python用来解析html的模块。它可以分析出html里面的标签、数据等等,是一种处理html的简便途径。HTMLParser采用的是一种事件驱动的模式,当HTMLParser找到一个特定的标记时,它会去调用一个用户定义的函数,以此来通知程序处理。它主要的用户回调函数的命名都是以handle_开头的,都是HTMLParser的成员函数。当我们使用时,就从HTMLParser派生出新的类,然后重新定义这几个以handle_开头的函数即可。这几个函数包括:

handle_startendtag  处理自闭合标签(开始标签和结束标签合一),比如<xx/>
handle_starttag     处理开始标签,比如<xx>
handle_endtag       处理结束标签,比如</xx>
handle_charref      处理特殊字符串,就是以&#开头的,一般是内码表示的字符
handle_entityref    处理一些特殊字符,以&开头的,比如 &nbsp;
handle_data         处理数据,就是<xx>data</xx>中间的那些数据
handle_comment      处理注释
handle_decl         处理<!开头的,比如<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
handle_pi           处理形如<?instruction>的东西

    这里我以从网页中获取到url为例,介绍一下。要想获取到url,肯定是要分析<a>标签,然后取到它的href属性的值。下面是代码:

先来大致看看HTMLParser的源代码:

class HTMLParseError(Exception):
    """Error describing a parse failure, optionally with its position.

    Attributes:
        msg: the non-empty error message.
        lineno: first item of ``position`` (``None`` when unknown).
        offset: second item of ``position`` (``None`` when unknown); it is
            rendered one higher, as a column number, by ``__str__``.
    """

    def __init__(self, msg, position=(None, None)):
        # An empty/falsy message is treated as a programming error.
        assert msg
        self.msg = msg
        self.lineno, self.offset = position[0], position[1]

    def __str__(self):
        """Return the message followed by whichever position parts are known."""
        text = self.msg
        if self.lineno is not None:
            text = text + ", at line %d" % self.lineno
        if self.offset is not None:
            column = self.offset + 1
            text = text + ", column %d" % column
        return text
  
class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.

    NOTE(review): this class relies on module-level compiled patterns
    (interesting_normal, interesting_cdata, starttagopen, charref,
    entityref, incomplete, piclose, tagfind, attrfind, locatestarttagend,
    endendtag, endtagfind) and on updatepos()/getpos()/parse_comment()/
    parse_declaration(), none of which are defined in this class --
    presumably they come from the surrounding module and the base class.
    """

    # Tags whose start tag switches the parser into CDATA mode
    # (see parse_starttag).
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self):
        """Initialize and reset this instance."""
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        """Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include newline characters).
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the current position."""
        raise HTMLParseError(message, self.getpos())

    # Source text of the most recently parsed start tag (None if none yet).
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self):
        """Switch the "interesting" pattern to the CDATA variant."""
        self.interesting = interesting_cdata

    def clear_cdata_mode(self):
        """Switch the "interesting" pattern back to the normal variant."""
        self.interesting = interesting_normal

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            match = self.interesting.search(rawdata, i) # < or &
            if match:
                j = match.start()
            else:
                j = n
            # Everything before the next interesting character is plain data.
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i): # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_declaration(i)
                elif (i + 1) < n:
                    # A lone '<' that starts no construct is passed as data.
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # Construct is incomplete: wait for more data unless
                    # we were told this is the end of the input.
                    if end:
                        self.error("EOF in middle of construct")
                    break
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # Do not consume the terminating character unless
                    # it is the ';' that belongs to the reference.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        self.error("EOF in middle of entity or char ref")
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        # Keep whatever could not be processed yet for the next call.
        self.rawdata = rawdata[i:]

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2) # >
        if not match:
            return -1
        j = match.start()
        self.handle_pi(rawdata[i+2: j])
        j = match.end()
        return j

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]
        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            m = attrfind.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without a value.
                attrvalue = None
            elif (attrvalue[:1] == "'" == attrvalue[-1:] or
                  attrvalue[:1] == '"' == attrvalue[-1:]):
                # Strip matching quotes, then resolve character references.
                attrvalue = attrvalue[1:-1]
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()
        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # NOTE: lineno/offset are computed here but not passed on;
            # error() reports the position returned by getpos().
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = (len(self.__starttag_text)
                          - self.__starttag_text.rfind("\n"))
            else:
                offset = offset + len(self.__starttag_text)
            self.error("junk characters in start tag: %r"
                       % (rawdata[k:endpos][:20],))
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode()
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend.match(rawdata, i)
        if m:
            j = m.end()
            next_char = rawdata[j:j+1]
            if next_char == ">":
                return j + 1
            if next_char == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                if rawdata.startswith("/", j):
                    # buffer boundary
                    return -1
                # else bogus input
                self.updatepos(i, j + 1)
                self.error("malformed empty start tag")
            if next_char == "":
                # end of input
                return -1
            if next_char in ("abcdefghijklmnopqrstuvwxyz=/"
                             "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            self.updatepos(i, j)
            self.error("malformed start tag")
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1) # >
        if not match:
            return -1
        j = match.end()
        match = endtagfind.match(rawdata, i) # </ + tag + >
        if not match:
            self.error("bad end tag: %r" % (rawdata[i:j],))
        tag = match.group(1)
        self.handle_endtag(tag.lower())
        self.clear_cdata_mode()
        return j

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    def unknown_decl(self, data):
        """Report an unknown declaration as a parse error."""
        self.error("unknown declaration: %r" % (data,))

    # Internal -- helper to remove special character quoting
    # Lazily-built name -> character table shared by all instances.
    entitydefs = None

    def unescape(self, s):
        """Return *s* with ``&name;`` / ``&#NNN;`` / ``&#xHH;`` replaced.

        Unknown entity names and numeric references that cannot be
        converted to a character are left in the text unchanged.
        """
        if '&' not in s:
            return s

        def replaceEntities(match):
            ref = match.groups()[0]
            if ref[0] == "#":
                ref = ref[1:]
                try:
                    if ref[0] in ['x','X']:
                        c = int(ref[1:], 16)
                    else:
                        c = int(ref)
                    return chr(c)
                except (ValueError, OverflowError):
                    # Not a usable number / code point: keep the text
                    # as written, like an unknown named entity.
                    return match.group()
            else:
                # Cannot use name2codepoint directly, because HTMLParser
                # supports apos, which is not part of HTML 4
                import html.entities
                if HTMLParser.entitydefs is None:
                    entitydefs = HTMLParser.entitydefs = {'apos':"'"}
                    for k, v in html.entities.name2codepoint.items():
                        entitydefs[k] = chr(v)
                try:
                    return self.entitydefs[ref]
                except KeyError:
                    return '&'+ref+';'

        # flags= must be a keyword: the fourth positional argument of
        # re.sub() is the replacement count, not the flags.
        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));",
                      replaceEntities, s, flags=re.ASCII)

  使用示例代码:找链接
#-*- encoding: gb2312 -*-
import HTMLParser

class MyParser(HTMLParser.HTMLParser):
    """Parser that prints the href of every <a> start tag it sees."""

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)

    def handle_starttag(self, tag, attrs):
        """Print each ``href`` attribute value of an ``<a>`` tag.

        ``attrs`` is iterated as (name, value) pairs.
        """
        # Only anchor tags are of interest.
        if tag != 'a':
            return
        for name, value in attrs:
            if name == 'href':
                # Single-argument call form prints the same text whether
                # print is a statement or a function.
                print(value)
        

if __name__ == '__main__':
    # Sample document containing a single hyperlink.
    sample_html = '<html><head><title>test</title><body><a href="http://www.163.com">链接到163</a></body></html>'

    # Feed the HTML text to the parser; matching links are printed
    # by MyParser.handle_starttag as they are encountered.
    link_parser = MyParser()
    link_parser.feed(sample_html)


第二个示例程序:找图片链接

    # -*- coding:utf-8 -*-  
    # file: GetImage.py  
    #  
    import Tkinter  
    import urllib  
    import HTMLParser  
    class MyHTMLParser(HTMLParser.HTMLParser):
        """HTML parser that collects gif/jpg references found on <img> tags."""

        def __init__(self):
            HTMLParser.HTMLParser.__init__(self)
            self.gifs = []  # attribute values containing 'gif'
            self.jpgs = []  # attribute values containing 'jpg' (but not 'gif')

        def handle_starttag(self, tags, attrs):
            """Record attribute values of an ``<img>`` tag by image type.

            ``attrs`` is iterated as (name, value) pairs.  Only the values
            are inspected, and attributes written without a value (whose
            value is None) are skipped instead of raising TypeError.
            """
            if tags != 'img':
                return
            for _name, value in attrs:
                if value is None:
                    continue
                if 'gif' in value:
                    self.gifs.append(value)
                elif 'jpg' in value:
                    self.jpgs.append(value)

        def get_gifs(self):
            """Return the list of collected gif references."""
            return self.gifs

        def get_jpgs(self):
            """Return the list of collected jpg references."""
            return self.jpgs
    class Window:
        """Small Tkinter form: enter a URL, list the gif/jpg images it uses."""

        def __init__(self, root):
            """Create and place the widgets inside *root*."""
            self.root = root
            self.label = Tkinter.Label(root, text = '输入URL:')
            self.label.place(x = 5, y = 15)
            self.entryUrl = Tkinter.Entry(root,width = 30)
            self.entryUrl.place(x = 65, y = 15)
            self.get = Tkinter.Button(root,
                            text = '获取图片', command = self.Get)
            self.get.place(x = 280, y = 15)
            self.edit = Tkinter.Text(root,width = 470,height = 600)
            self.edit.place(y = 50)

        def Get(self):
            """Fetch the entered URL, parse it and show the image lists."""
            url = self.entryUrl.get()
            page = urllib.urlopen(url)
            try:
                data = page.read()
            finally:
                # Release the connection even when reading fails.
                page.close()
            parser = MyHTMLParser()
            parser.feed(data)
            self._show_section('GIF', parser.get_gifs())
            self._show_section('JPG', parser.get_jpgs())

        def _show_section(self, title, links):
            """Append one titled block of links to the text widget."""
            self.edit.insert(Tkinter.END, '====%s====\n' % title)
            for link in links:
                self.edit.insert(Tkinter.END, link + '\n')
            self.edit.insert(Tkinter.END, '===========\n')
    # Start the GUI only when run as a script, so that importing this
    # module (e.g. to reuse MyHTMLParser) does not open a window.
    if __name__ == '__main__':
        root = Tkinter.Tk()
        window = Window(root)
        root.minsize(600,480)
        root.mainloop()


示例代码二:
 
http://hi.baidu.com/muinlive/blog/item/ce584ff43c569adaf2d385b8.html

 
 近段时间想用python写一个从网页上抓取股票年报数据的工具,python 自带的Lib中htmlparser有htmllib.HTMLParser、sgmllib.SGMLParser、HTMLParser.HTMLParser,我挑了最后一个HTMLParser.HTMLParser来试试,但可惜的是网页中的table不能解释,script内容也不能过滤。其它两个也就没再试了。后来找了下,发现一个叫Beautiful Soup的东东听说很好用,但还是没有试。下面一段代码是将网页的内容创建成一个对象,这样可以更方便读取它的内容,可以按TAG的分类进行读取,可惜HTMLParser功能太弱,没做成自己想要的结果。等下再试试那可口的浓汤。 
 
#! /usr/bin/env python
# -*- coding:gb18030 -*-

from HTMLParser import HTMLParser
import re

class HtmlTag:
    """One node of a parsed tag tree.

    Attributes:
        tagname: name given at construction.
        parent: the enclosing node (or None) given at construction.
        attrs: dict of attribute name -> value, filled via setattr().
        childs: list of child nodes, filled via addchild().
        data: text payload, replaced by each setdata() call.
    """

    def __init__(self,parent,tagname):
        self.parent, self.tagname = parent, tagname
        self.attrs = dict()
        self.childs = list()
        self.data = str()

    def setattr(self,name,value):
        """Store *value* under attribute *name*, replacing any old value."""
        self.attrs.update({name: value})

    def addchild(self,child):
        """Append *child* to the end of the child list."""
        self.childs.extend((child,))

    def setdata(self,data):
        """Replace the node's text payload with *data*."""
        self.data = data

class htmlsnif (HTMLParser):
    """Build a tag tree and per-name indexes from the parser callbacks.

    Attributes:
        _TagTree: synthetic 'Root' node that owns all top-level tags.
        _ActiveTag: innermost tag that is currently open (never None).
        _ParentTag: parent of _ActiveTag (None while _ActiveTag is the
            root after an end tag has been processed).
        _TagTile: every tag and comment node in document order.
        _TagCatalog: dict mapping a tag name (or 'comment') to its nodes.
        _TagTitle / _TagBody: last <title> / <body> node seen, or None.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        self._ActiveTag=HtmlTag(None,'Root')
        self._TagTile=[]
        self._TagTree=self._ActiveTag
        self._TagCatalog={}
        self._ParentTag=self._ActiveTag
        self._TagTitle=None
        self._TagBody=None

    def _record(self,newtag,key):
        """Add *newtag* to the flat list and to the catalog under *key*."""
        self._TagTile.append(newtag)
        self._TagCatalog.setdefault(key,[]).append(newtag)

    def _create(self,tag,attrs):
        """Create a node for *tag* below the active tag and register it."""
        newtag=HtmlTag(self._ActiveTag,tag)
        for k,v in attrs:
            newtag.setattr(k,v)
        self._ActiveTag.addchild(newtag)
        self._record(newtag,tag)
        return newtag

    def handle_starttag(self,tag,attrs):
        """Open a new node for *tag* and make it the active tag."""
        newtag=self._create(tag,attrs)
        self._ParentTag=self._ActiveTag
        self._ActiveTag=newtag
        # tag.lower() works for any text type, unlike str.lower(tag).
        lowered=tag.lower()
        if lowered=='title':
            self._TagTitle=newtag
        elif lowered=='body':
            self._TagBody=newtag
        print(tag)

    def handle_endtag(self,tag):
        """Close the nearest open node named *tag*.

        Open nodes nested inside it (e.g. tags that were never closed) are
        closed implicitly.  An end tag that matches no open node is
        ignored, so the active tag can never move above the root.
        """
        node=self._ActiveTag
        while node is not self._TagTree and node.tagname!=tag:
            node=node.parent
        if node is self._TagTree:
            # Stray end tag: nothing to close.
            return
        self._ActiveTag=node.parent
        self._ParentTag=self._ActiveTag.parent

    def handle_data(self,data):
        """Append *data* to the active tag's text.

        Text is accumulated rather than replaced, because the parser may
        deliver the data between tags in several chunks.
        """
        self._ActiveTag.setdata(self._ActiveTag.data+data)

    def handle_startendtag(self,tag,attrs):
        """Record a self-closing tag as a child without making it active."""
        self._create(tag,attrs)

    def handle_comment(self,data):
        """Record a comment node (parented to the root) holding *data*."""
        newtag=HtmlTag(self._TagTree,'comment')
        newtag.setdata(data)
        self._record(newtag,'comment')

另外一个例子:http://crquan.blogbus.com/logs/8269701.html
标签过滤????
#!/usr/bin/env python
 
import sys
import urllib
import HTMLParser
 
class CustomParser(HTMLParser.HTMLParser):
    """Print the text found at a few fixed positions in the tag hierarchy.

    A stack of currently open tags (restricted to ``selected``) is kept;
    text is printed only when the stack, joined with '/', equals one of
    the paths listed in handle_data().
    """

    # Only these tags take part in the path tracking.
    selected = ('table', 'h1', 'font', 'ul', 'li', 'tr', 'td', 'a')

    def reset(self):
        """Reset the base parser and start with an empty tag stack."""
        HTMLParser.HTMLParser.reset(self)
        self._level_stack = []

    def handle_starttag(self, tag, attrs):
        """Push *tag* on the stack when it is one of the selected tags."""
        if tag in CustomParser.selected:
            self._level_stack.append(tag)

    def handle_endtag(self, tag):
        """Pop the stack when *tag* closes the tag currently on top."""
        if self._level_stack \
        and tag in CustomParser.selected \
        and tag == self._level_stack[-1]:
            self._level_stack.pop()

    def handle_data(self, data):
        """Print the stack and *data* when the current path is wanted."""
        if "/".join(self._level_stack) in (
            'table/tr/td',
            'table/tr/td/h1/font',
            'table/tr/td/ul/li'):
            # One pre-formatted argument gives "<stack> <data>" whether
            # print is a statement or a function.
            print("%s %s" % (self._level_stack, data))
        
# POST the IP given on the command line (if any); otherwise send a
# plain request without form data.
params = (urllib.urlencode({'ip': sys.argv[1], 'action': 2})
          if len(sys.argv) > 1 else None)

# Download the page and decode it from GB2312 before parsing.
response = urllib.urlopen('http://www.ip138.com/ips8.asp', params)
content = unicode(response.read(), 'GB2312')

parser = CustomParser()
parser.feed(content)
parser.close()


  • 0
    点赞
  • 11
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值