1. Introduction
This crawler program consists of two classes: one that manages the entire crawling process (Crawler), and one that retrieves and parses each downloaded web page (Retriever).
2. The Program
#!/usr/bin/env python

from sys import argv
from os import makedirs, unlink, sep
from os.path import dirname, exists, isdir, splitext
from string import replace, find, lower
from htmllib import HTMLParser
from urllib import urlretrieve
from urlparse import urlparse, urljoin
from formatter import DumbWriter, AbstractFormatter
from cStringIO import StringIO

class Retriever(object):                # download Web pages
    def __init__(self, url):
        self.url = url
        self.file = self.filename(url)

    def filename(self, url, deffile='index.htm'):
        parsedurl = urlparse(url, 'http:', 0)   # parse path
        path = parsedurl[1] + parsedurl[2]
        ext = splitext(path)
        if ext[1] == '':                # no file, use default
            if path[-1] == '/':
                path += deffile
            else:
                path += '/' + deffile
        ldir = dirname(path)            # local directory
        if sep != '/':                  # os-independent path separator
            ldir = replace(ldir, '/', sep)
        if not isdir(ldir):             # create archive dir if necessary
            if exists(ldir): unlink(ldir)
            makedirs(ldir)
        return path

    def download(self):                 # download Web page
        try:
            retval = urlretrieve(self.url, self.file)
        except IOError:
            retval = ('*** ERROR: invalid URL "%s"' % self.url,)
        return retval

    def parseAndGetLinks(self):         # parse HTML, save links
        self.parser = HTMLParser(AbstractFormatter(DumbWriter(StringIO())))
        self.parser.feed(open(self.file).read())
        self.parser.close()
        return self.parser.anchorlist   # anchors collected by htmllib

class Crawler(object):                  # manage entire crawling process
    count = 0                           # static downloaded page counter

    def __init__(self, url):
        self.q = [url]                  # queue of URLs still to fetch
        self.seen = []                  # URLs already processed
        self.dom = urlparse(url)[1]     # restrict crawl to this domain

    def getPage(self, url):
        r = Retriever(url)
        retval = r.download()
        if retval[0] == '*':            # error situation, do not parse
            print retval, '... skipping parse'
            return
        Crawler.count += 1
        print '\n(', Crawler.count, ')'
        print 'URL:', url
        print 'FILE:', retval[0]
        self.seen.append(url)

        links = r.parseAndGetLinks()    # get and process links
        for eachLink in links:
            if eachLink[:4] != 'http' and \
                    find(eachLink, '://') == -1:
                eachLink = urljoin(url, eachLink)
            print '* ', eachLink,

            if find(lower(eachLink), 'mailto:') != -1:
                print '... discarded, mailto link'
                continue

            if eachLink not in self.seen:
                if find(eachLink, self.dom) == -1:
                    print '... discarded, not in domain'
                else:
                    if eachLink not in self.q:
                        self.q.append(eachLink)
                        print '... new, added to Q'
                    else:
                        print '... discarded, already in Q'
            else:
                print '... discarded, already processed'

    def go(self):                       # process links in queue
        while self.q:
            url = self.q.pop()
            self.getPage(url)

def main():
    if len(argv) > 1:
        url = argv[1]
    else:
        try:
            url = raw_input('Enter starting URL: ')
        except (KeyboardInterrupt, EOFError):
            url = ''
    if not url: return
    robot = Crawler(url)
    robot.go()

if __name__ == '__main__':
    main()
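
Note that the listing above is Python 2 code (print statements, raw_input, and the htmllib/urlparse/formatter/cStringIO modules, which were removed or relocated in Python 3). If it is saved as, say, crawl.py (the filename is only an assumption), it can be started under a Python 2 interpreter with a seed URL as the first command-line argument, or it will prompt for one. For readers on Python 3, the following is a minimal sketch of just the download-and-extract-links step using html.parser and urllib; the LinkParser class and fetch_links function are illustrative names, not part of the original program.

#!/usr/bin/env python3
# Minimal Python 3 sketch of the link-extraction step only (not the full crawler).
# LinkParser and fetch_links are illustrative names introduced here.
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib.parse import urljoin

class LinkParser(HTMLParser):
    """Collect the href of every <a> tag, similar to the old anchorlist."""
    def __init__(self):
        super().__init__()
        self.anchors = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.anchors.append(value)

def fetch_links(url):
    """Download a page and return its links as absolute URLs."""
    html = urlopen(url).read().decode('utf-8', errors='replace')
    parser = LinkParser()
    parser.feed(html)
    parser.close()
    return [urljoin(url, link) for link in parser.anchors]

if __name__ == '__main__':
    # Example seed URL; replace with the site you actually want to crawl.
    for link in fetch_links('http://www.example.com'):
        print(link)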