一个爬虫例子

 
#!/usr/bin/python -u
import sys, urllib, hashlib, htmllib, os, formatter, string
 
class Parser(htmllib.HTMLParser):
    """HTML parser that collects links, grouped by their visible text.

    After a document has been fed in, ``anchors`` maps each non-empty link
    text to the list of href values that were seen with that text.
    """

    def __init__(self, verbose = 0):
        # The parser is only used to harvest links, so it is given a
        # NullFormatter rather than a formatter that renders output.
        htmllib.HTMLParser.__init__(self, formatter.NullFormatter(), verbose)
        self.anchors = {}

    def anchor_bgn(self, href, name, type):
        """Remember the link target and start buffering the link text."""
        self.anchor = href
        self.save_bgn()

    def anchor_end(self):
        """Stop buffering and file the remembered href under the link text."""
        label = self.save_end().strip()
        if not (self.anchor and label):
            # Links without a target or without visible text are ignored.
            return
        self.anchors.setdefault(label, []).append(self.anchor)
 
def getmainurl(url):
    """Return the scheme-and-host prefix of *url* (everything before the path).

    Args:
        url: An absolute URL such as "http://host/path" or
            "https://host/path".

    Returns:
        The part of *url* up to, but not including, the first "/" that
        follows the "scheme://" prefix, or *url* unchanged when there is no
        such "/".
    """
    # Skip past "scheme://" so that its own slashes are not mistaken for the
    # start of the path.  The separator only counts when its first "/" is
    # also the first "/" in the whole string; otherwise (no scheme present)
    # fall back to the fixed len('http://') offset.
    sep = url.find('://')
    if sep > 0 and url.find('/') == sep + 1:
        start = sep + len('://')
    else:
        start = len('http://')

    ind = url.find('/', start)
    if ind >= 0:
        return url[:ind]
    return url
 
def getURL(url, html, queue):
    """Extract the links from a fetched page and queue those not yet saved.

    Args:
        url: Address the page was fetched from.  Links that are not already
            absolute are prefixed with the value getmainurl(url) returns.
        html: HTML text of the page.
        queue: List of pending URLs; new links are appended to it in place.

    Returns:
        None.  If the HTML cannot be parsed, nothing is queued.
    """
    p = Parser()
    try:
        p.feed(html)
        p.close()
    except Exception:
        # Best effort: a page the parser cannot handle is skipped, but
        # KeyboardInterrupt / SystemExit still propagate.
        return

    mainurl = getmainurl(url)
    for links in p.anchors.values():
        for u in links:
            # Anything that is not already an absolute http(s) URL is
            # resolved against the site root.
            if not u.startswith(('http://', 'https://')):
                if mainurl[-1] != '/' and u[0] != '/':
                    u = mainurl + '/' + u
                else:
                    u = mainurl + u
            # A page is stored as <md5 of its URL>.html, so an existing file
            # means this link has already been downloaded.
            filename = "d:\\web\\" + hashlib.md5(u).hexdigest() + ".html"
            if not os.path.isfile(filename):
                queue.append(u)
def BFS():
    """Crawl breadth-first from the seed URL, saving each page to disk.

    Every fetched page is written to d:\\web\\<md5 hex of URL>.html and its
    links are handed to getURL(), which appends the ones not yet saved to
    the work queue.  URLs that cannot be opened are reported on stdout and
    skipped.  Returns None once the queue is empty.
    """
    queue = ["http://www.xxxxxx.com/"]
    while queue:
        print(len(queue))
        url = queue.pop(0)

        # Derive the target file first so that a page which is already on
        # disk (e.g. a URL queued more than once) is not downloaded again.
        filename = "d:\\web\\" + hashlib.md5(url).hexdigest() + ".html"
        if os.path.isfile(filename):
            continue

        try:
            wp = urllib.urlopen(url)
        except Exception:
            # Nothing was opened, so there is no connection to close here.
            print("%s can not open this url" % url)
            continue

        try:
            content = wp.read()
        finally:
            # Release the connection even when reading fails.
            wp.close()

        # Binary mode keeps the bytes exactly as received; text mode would
        # rewrite line endings on Windows.
        with open(filename, "wb") as fp:
            fp.write(content)

        getURL(url, content, queue)
 
def main():
    """Run the crawl forever, starting BFS() again each time it returns."""
    while True:
        BFS()


# Start crawling only when the file is executed as a script.
if __name__ == '__main__':
    main()
       

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值