1. Introduction
This crawler program consists of two classes: one that manages the entire crawling process (Crawler), and one that retrieves and parses each downloaded web page (Retriever).
2. The Program
#!/usr/bin/env python

from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO

class Retriever(object):                # download Web pages
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)   # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':                # no file, use default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)            # local directory
        if sep != '/':                  # os-independent path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):             # create archive dir if necessary
            if exists(ldir): unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):                 # download Web page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
        return retval

    def parseAndGetLinks(self):         # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist   # anchors collected by htmllib

class Crawler(object):                  # manage entire crawling process
    count = 0                           # static downloaded page counter

    def __init__(self, url):
        self.q = [url]                  # queue of URLs still to fetch
        self.seen = []                  # URLs already processed
        self.dom = urlparse(url)[1]     # restrict crawl to this domain

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        if retval[0] == '*':            # error situation, do not parse
            print retval, '... skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()    # get and process links
        for eachLink in links:
            if eachLink[:4] != 'http' and \
                    find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)
            print '* ', eachLink,

            if find(lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue

            if eachLink not in self.seen:
                if find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self):                       # process links in queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)

def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url: return
    robot = Crawler(url)
    robot.go()

if __name__ == '__main__':
    main()
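
Note that the listing above is Python 2 code (print statements, raw_input, and the htmllib/urlparse/formatter/cStringIO modules, which were removed or relocated in Python 3). If it is saved as, say, crawl.py (the filename is only an assumption), it can be started under a Python 2 interpreter with a seed URL as the first command-line argument, or it will prompt for one. For readers on Python 3, the following is a minimal sketch of just the download-and-extract-links step using html.parser and urllib; the LinkParser class and fetch_links function are illustrative names, not part of the original program.

#!/usr/bin/env python3
# Minimal Python 3 sketch of the link-extraction step only (not the full crawler).
# LinkParser and fetch_links are illustrative names introduced here.
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib.parse import urljoin

class LinkParser(HTMLParser):
    """Collect the href of every <a> tag, similar to the old anchorlist."""
    def __init__(self):
        super().__init__()
        self.anchors = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.anchors.append(value)

def fetch_links(url):
    """Download a page and return its links as absolute URLs."""
    html = urlopen(url).read().decode('utf-8', errors='replace')
    parser = LinkParser()
    parser.feed(html)
    parser.close()
    return [urljoin(url, link) for link in parser.anchors]

if __name__ == '__main__':
    # Example seed URL; replace with the site you actually want to crawl.
    for link in fetch_links('http://www.example.com'):
        print(link)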