#coding:utf-8
import sys, re, Queue
sys.path.append("..")
from lib.Http_Class import Http_Class
from BeautifulSoup import BeautifulSoup

####################################
#
# Spider crawler module
#
####################################

class Spider_module:

    def __init__(self):
        # Initialize state here as well, so the module can run standalone
        # (see the __main__ block below) without a w3ascan instance.
        self.setW3AScan(None)

    def setW3AScan(self, w3ascan):
        self.w3ascan = w3ascan
        self.result_list = {}          # url -> 0 (not crawled yet) / 1 (crawled)
        self.q_list = Queue.Queue()    # work queue of URLs to fetch
        self.tmp_list = Queue.Queue()  # unused in the current implementation

    def start(self, target):
        # NOTE: the crawl target is currently hard-coded; the argument is unused.
        base_url = "http://lucifr.com/"
        print "[*] Spider target: " + base_url
        self.result_list.update({base_url: 0})
        try:
            while True:
                # Queue every URL that has not been crawled yet.
                for url in self.result_list:
                    if self.result_list[url] == 0:
                        self.q_list.put(url)
                        self.result_list[url] = 1
                # If the work queue is empty the crawl is finished;
                # otherwise process the queued URLs.
                if self.q_list.empty():
                    print "[*] Spider task finished."
                    break
                for _ in range(self.q_list.qsize()):
                    spider_url = self.q_list.get()
                    obj = Http_Class()
                    try:
                        html = obj._do("get", spider_url)
                    except Exception:
                        if self.w3ascan is not None:
                            self.w3ascan.log_create("url: %s failed!" % spider_url, "Spider_module")
                        print "url: %s failed!" % spider_url
                        continue
                    soup = BeautifulSoup(html)
                    for link in soup.findAll('a'):
                        href = link.get('href')
                        # Skip anchors without an href before touching the value.
                        if href is None:
                            continue
                        _url = href.encode('utf-8')
                        # Skip pseudo-links and binary/static resources.
                        if re.match(r'^(javascript|:;|#|mailto)', _url) or \
                           re.search(r'\.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|db)$', _url):
                            continue
                        if re.match(r'^https?://', _url):
                            # Absolute link: only follow it if it stays on the target site.
                            if not _url.startswith(base_url):
                                continue
                            if _url not in self.result_list:
                                print "[*][!] Found new link: " + _url
                                self.result_list.update({_url: 0})
                        else:
                            # Relative link: join it with the base URL
                            # (naive join, matching the original behavior).
                            rst = base_url + _url
                            if rst not in self.result_list:
                                print "[*][!] Found new link: " + rst
                                self.result_list.update({rst: 0})
        except Exception, error:
            print "[*] Exception caught; writing it to the log."
            if self.w3ascan is not None:
                self.w3ascan.log_create("Url: %s get Url Error! Source: %s" % (base_url, error), "Spider_module")

    def save(self):
        print "[*] Saving spider results"

def getPluginClass():
    return Spider_module

if __name__ == "__main__":
    t = Spider_module()
    t.start("aaa")