python 爬虫

爬虫程序的主要作用是:从起始网页出发,把第 n 层网页中的链接所指向的页面也逐层下载下来
主程序
爬虫启动
生成一个队列
循环处理:队列为空时跳出
网址出队列
下载网页,找出下一层链接
添加到队列

 

from sys import argv
from os import makedirs,unlink,sep
from os.path import dirname,exists,isdir,splitext
from string import replace ,find,lower
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse,urljoin
from formatter import DumbWriter,AbstractFormatter
from cStringIO import StringIO
import os,sys

# Path this script was invoked with (sys.argv[0]); nothing in the visible
# code reads it afterwards.
syspath=sys.argv[0]


class retri(object):
    """Download one URL to a local file and extract the links it contains.

    The local file path mirrors the URL as ``<host><url path>`` and is
    relative to the current working directory.
    """

    def __init__(self, url):
        """Remember *url* and compute its local file path.

        Computing the path also creates the directory that will hold the
        file (see filename()).
        """
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        """Map *url* to a local file path and make sure its directory exists.

        url     -- the URL that is going to be saved.
        deffile -- file name appended when the URL does not name a file
                   itself (empty path, or no extension on the path).

        Returns the path, written with '/' separators.  As a side effect the
        containing directory is created; a non-directory that already exists
        under that name is deleted first.
        """
        parsedurl = urlparse(url, 'http:', 0)
        # A URL without a path ("http://host") refers to the site root, so
        # treat it as "/" and let the normal default-file logic below apply.
        urlpath = parsedurl[2] or '/'
        path = parsedurl[1] + urlpath
        if splitext(path)[1] == '':
            # No extension: regard the path as a directory and store the
            # page as <path>/<deffile>.
            if not path.endswith('/'):
                path += '/'
            path += deffile
        ldir = dirname(path)
        if sep != '/':
            # The directory is created with the platform's own separator.
            ldir = ldir.replace('/', sep)
        # ldir is empty when the path has no directory part; there is then
        # nothing to create (and makedirs('') would raise).
        if ldir and not isdir(ldir):
            if exists(ldir):
                # A plain file is in the way of the directory: remove it.
                unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):
        """Fetch self.url into self.file.

        Returns whatever urlretrieve() returns on success.  When urlretrieve
        raises IOError, returns an error string that starts with '*'
        instead of raising; callers tell the two apart by that first
        character.
        """
        try:
            return urlretrieve(self.url, self.file)
        except IOError:
            return '*** error:invalid url "%s"' % self.url

    def parse_and_getlink(self):
        """Parse the downloaded file as HTML and return the parser's anchorlist.

        The parser is kept on self.parser.  Its formatted text output goes
        to an in-memory buffer that is never read; only the collected
        anchors are of interest.
        """
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        # Close the file deterministically instead of leaking the handle.
        with open(self.file) as page:
            self.parser.feed(page.read())
        self.parser.close()
        return self.parser.anchorlist
    
    
    
class crawler(object):
    """Download every page reachable from a start URL on the same host.

    Instance attributes:
      q    -- list of URLs still waiting to be fetched
      seen -- list of URLs that were downloaded successfully
      dom  -- network-location part of the start URL; links are only
              followed when their own network location contains it
    """

    # Number of pages downloaded successfully; shared by all instances.
    count = 0

    def __init__(self, url):
        """Start a crawl whose pending list holds only *url*."""
        self.q = [url]
        self.seen = []
        self.dom = urlparse(url)[1]

    def _in_domain(self, link):
        """Return True when the host part of *link* contains self.dom.

        Only the network location is examined, so a foreign URL that merely
        mentions the domain in its path or query string is rejected.
        """
        return self.dom in urlparse(link)[1]

    def get_page(self, url):
        """Download *url*, report it, and queue the new links it contains.

        A failed download is reported and skipped.  Every link found on the
        page is printed together with the decision taken for it.
        """
        page = retri(url)
        retval = page.download()
        # download() hands back an error string starting with '*' on failure.
        if retval[0] == '*':
            print('%s .. skipping parse' % (retval,))
            return
        crawler.count += 1
        print('\n( %s )' % crawler.count)
        print('url: %s' % url)
        print('file: %s' % retval[0])
        self.seen.append(url)

        for eachlink in page.parse_and_getlink():
            # Links without a scheme are relative: resolve against this page.
            if eachlink[:4] != 'http' and eachlink.find('://') == -1:
                eachlink = urljoin(url, eachlink)
            print('*  %s' % eachlink)

            if eachlink.lower().find('mailto:') != -1:
                print('... discarded,mailto link')
            elif eachlink in self.seen:
                print('... discarded,already processed')
            elif not self._in_domain(eachlink):
                print('...discarded,not in domain')
            elif eachlink in self.q:
                print('...discarded,already in q')
            else:
                self.q.append(eachlink)
                print('...new,added to q')

    def go(self):
        """Fetch pending URLs until none are left.

        list.pop() removes the most recently added URL, so the newest link
        is processed first.
        """
        while self.q:
            url = self.q.pop()
            self.get_page(url)
            
            
def main():
    """Crawl from the URL given on the command line, or prompt for one.

    The first command-line argument is used when present; otherwise the
    user is asked for a URL.  An interrupted or empty answer, or an empty
    argument, ends the program without crawling.
    """
    start = argv[1] if len(argv) > 1 else None
    if start is None:
        try:
            start = raw_input('enter starting url:')
        except (KeyboardInterrupt, EOFError):
            # Ctrl-C / end of input at the prompt counts as "no URL".
            start = ''
    if start:
        crawler(start).go()
    
# Run the crawler only when this file is executed as a script, not when it
# is imported as a module.
if __name__=='__main__':
    main()
        
        
    
    

 

转载于:https://www.cnblogs.com/frog2008/p/6845306.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值