1: splider.py
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#filename:splider.py
#author:wfu(fuweilin@hotmail.com)
from spdUtility import PriorityQueue,Parser
import urllib2
import sys
import os
def updatePriQueue(priQueue, url):
    "Update the priority queue with a url, raising its priority each time it is seen."
    extraPrior = url.endswith('.html') and 2 or 0     # prefer urls ending in .html
    extraMyBlog = 'www.kgblog.net' in url and 5 or 0  # prefer pages from the given site (pay-to-rank crawling??)
    item = priQueue.getitem(url)
    if item:
        # the url is already queued: bump its priority and reinsert it
        newitem = (item[0] + 1 + extraPrior + extraMyBlog, item[1])
        priQueue.remove(item)
        priQueue.push(newitem)
    else:
        priQueue.push((1 + extraPrior + extraMyBlog, url))
def getmainurl(url):
    "Return the site root of a url, used to prefix relative links."
    ix = url.find('/', len('http://'))
    if ix > 0:
        return url[:ix]
    else:
        return url
def analyseHtml(url, html, priQueue, downlist):
    "Parse the hyperlinks in a html page and update the priority queue."
    p = Parser()
    try:
        p.feed(html)
        p.close()
    except Exception:
        return
    mainurl = getmainurl(url)
    for k, v in p.anchors.items():
        for u in v:
            if not u.startswith('http://'):  # turn relative links into absolute ones
                u = mainurl + u
            if u not in downlist:            # skip urls that have already been downloaded
                updatePriQueue(priQueue, u)
def downloadUrl(id, url, priQueue, downlist, downFolder):
    "Download the given url, save it to disk and scan it for further links."
    downFileName = downFolder + '/%d.html' % (id,)
    print 'downloading', url, 'as', downFileName,
    try:
        fp = urllib2.urlopen(url)
    except Exception:
        print '[ failed ]'
        return False
    else:
        print '[ success ]'
        downlist.add(url)  # remember the url so it is not fetched again
        html = fp.read()
        fp.close()
        # recode from gb18030 to utf-8 before saving (the original computed this but dropped the result)
        html = unicode(html, "gb18030", "ignore").encode("utf-8")
        op = open(downFileName, "wb")
        op.write(html)
        op.close()
        analyseHtml(url, html, priQueue, downlist)
        return True
def spider(beginurl, pages, downFolder):
    "Crawler main loop: repeatedly pop the highest-priority url from the queue and download it."
    priQueue = PriorityQueue()
    downlist = set()   # urls already downloaded, to avoid fetching the same page twice
    priQueue.push((1, beginurl))
    i = 0
    while not priQueue.empty() and i < pages:
        k, url = priQueue.pop()
        if downloadUrl(i + 1, url, priQueue, downlist, downFolder):
            i += 1
    print '\nDownloaded', i, 'pages in total.'
def main():
    "Entry point: set the start url, the number of pages to crawl and the download folder."
    beginurl = 'http://www.csdn.net'   # url to start crawling from
    pages = 10                         # number of pages to fetch
    downloadFolder = './down'          # folder where the pages are saved
    if not os.path.isdir(downloadFolder):
        os.mkdir(downloadFolder)
    spider(beginurl, pages, downloadFolder)

if __name__ == '__main__':
    main()
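
splider.py targets Python 2 (print statements, urllib2, and the htmllib-based Parser below). As a rough porting note, here is a minimal sketch of the fetch/decode/save step under Python 3, where urllib2 is replaced by urllib.request; fetch_page is a hypothetical helper name, not part of splider.py:

# Sketch only: Python 3 counterpart of downloadUrl's fetch/decode/save step (assumed environment).
import urllib.request

def fetch_page(url, filename):
    "Fetch url, decode it from gb18030 and save it as UTF-8; return the text, or None on failure."
    try:
        with urllib.request.urlopen(url, timeout=10) as fp:
            raw = fp.read()
    except Exception:
        return None
    text = raw.decode("gb18030", "ignore")        # same recoding splider.py performs
    with open(filename, "w", encoding="utf-8") as op:
        op.write(text)
    return text
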
2: spdUtility.py
#! /usr/bin/env python
# -*- coding: utf-8 -*-
#filename:spdUtility.py
#author:wfu(fuweilin@hotmail.com)
import bisect
import htmllib
import formatter
class PriorityQueue(list):
    "Priority queue of (priority, url) tuples, kept sorted by priority."
    def __init__(self):
        list.__init__(self)
        self.map = {}   # url -> (priority, url), for fast lookup by url
    def push(self, item):
        # insert in sorted order and skip duplicates; bisect.insort_left would insert before equal items instead
        if self.count(item) == 0:
            bisect.insort(self, item)
            self.map[item[1]] = item
    def pop(self):
        # the list is kept in ascending order, so the last element is the highest-priority item
        r = list.pop(self)
        del self.map[r[1]]
        return r
    def getitem(self, url):
        if url in self.map:
            return self.map[url]
        else:
            return None
    def empty(self):
        return len(self) == 0
    def remove(self, item):
        list.remove(self, item)
        del self.map[item[1]]
    def count(self, item):
        if len(self) == 0:
            return 0
        # binary search: the list is sorted, so this is O(log n) rather than list.count's O(n)
        left = 0
        right = len(self) - 1
        mid = -1
        while left <= right:
            mid = (left + right) // 2
            if self[mid] < item:
                left = mid + 1
            elif self[mid] > item:
                right = mid - 1
            else:
                break
        return self[mid] == item and 1 or 0
class Parser(htmllib.HTMLParser):
    "HTML parser that collects anchors as a mapping from link text to a list of hrefs."
    def __init__(self, verbose=0):
        self.anchors = {}
        f = formatter.NullFormatter()
        htmllib.HTMLParser.__init__(self, f, verbose)
    def anchor_bgn(self, href, name, type):
        self.save_bgn()
        self.anchor = href
    def anchor_end(self):
        text = self.save_end().strip()
        if self.anchor and text:
            self.anchors[text] = self.anchors.get(text, []) + [self.anchor]
def main():   # just for test
    pq = PriorityQueue()
    # add items out of order
    pq.push((1, 'http://www.baidu.com'))
    pq.push((2, 'http://www.sina.com'))
    pq.push((3, 'http://www.google.com'))
    pq.push((1, 'http://www.163.com'))
    item = pq.getitem('http://www.sina.com')
    print item
    print pq.count(item)
    pq.remove(item)
    print pq.count(item)
    # print queue contents in priority order
    while not pq.empty():
        print pq.pop()
if __name__ == '__main__':
main()
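
htmllib and formatter were removed in Python 3, so the Parser class above only imports on Python 2. A minimal sketch of the same anchor collection built on html.parser, assuming Python 3; LinkParser is an illustrative name, not part of spdUtility.py:

# Sketch only: collect <a href=...> values keyed by link text, assuming Python 3.
from html.parser import HTMLParser

class LinkParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.anchors = {}     # link text -> list of hrefs, same shape as Parser.anchors
        self._href = None
        self._text = []
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self._href = dict(attrs).get('href')
            self._text = []
    def handle_data(self, data):
        if self._href is not None:
            self._text.append(data)
    def handle_endtag(self, tag):
        if tag == 'a' and self._href:
            text = ''.join(self._text).strip()
            if text:
                self.anchors.setdefault(text, []).append(self._href)
            self._href = None

Usage would mirror the original: p = LinkParser(); p.feed(html); p.close(); then read p.anchors.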