前言
最近在写代码,涉及了web爬取链接的方面,在百度过程中了解到了这篇文章:superSpider,突然就好奇平时常见的爬虫 工具和扫描器里的爬虫模块能力如何,所以来测试下。
主要测试1个自己手写的瞎眼爬虫,还有crawlergo、rad、burpsuite pro v202012、awvs 2019
文章里推荐的http://demo.aisec.cn/demo/站点打不开了所以就使用awvs的了;
测试站点:http://testphp.vulnweb.com
一 手写的基准爬虫
只抓取a标签下的href和script标签下的src;
from urllib.parse import urlparse,urljoin
from bs4 import BeautifulSoup
import requests
import validators
from queue import Queue
import threading
requests.packages.urllib3.disable_warnings()
class jsfinder():
def __init__(self,url,cookie=""):
self.baseUrl = self.return_entire_url(url)
self.headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36",
"cookie": cookie}
self.q = Queue()
self.crawed_list = set()
self.urlList = []
self.q.put(url)
self.spider_status = 1
def return_entire_url(self,url):
if url is not None:
if url.startswith('http') or urlparse(url).scheme:
return url.strip()
else:
if self.baseUrl == "":
self.baseUrl = "http://" + url
print(self.baseUrl)
return urljoin(self.baseUrl,url.strip())
else:
pass
def spider(self):
while(not self.q.empty() or self.spider_status):
url = self.q.get()
if url in self.crawed_list :
continue
print("requesting:",url)
try:
resp = requests.get(url=url, headers=self.headers, timeout=5, verify=False)
self.htmlParse(resp)
self.crawed_list.add(url)
except:
print("requests error:",url)
if self.spider_status == 1:
time.sleep(5)
self.spider_status = 0
print(self.q.qsize())
def htmlParse(self,response):
tempList = []
blacklist = ['#',None,'javascript:']
soup = BeautifulSoup(response.text.encode('utf-8'), 'html.parser')
for href in soup.find_all('a'):
#print(self.urlParse(href.get('href')))
tempList.append(href.get('href'))
for href in soup.find_all('script'):
#print(self.urlParse(href.get('src')))
tempList.append(href.get('src'))
tempList = list(set(tempList)-set(blacklist))
for i in tempList:
url = self.return_entire_url(i)
if validators.url(url):
print("get:",url)
#print(i,self.return_entire_url(i))
if url not in self.crawed_list :
self.urlList.append(url)
if urlparse(url).netloc in self.baseUrl:
self.q.put(url)
if __name__ == "__main__":
A = jsfinder("http://testphp.vulnweb.com")
t = threading.Thread(target=A.spider)
t.start()
t.join()
for i in list(set(A.urlList)):
print(i)
结果:
46个链接,夹杂着很多其他域名的链接,有很多带参数的链接
http://testphp.vulnweb.com/product.php?pic=3
http://testphp.vulnweb.com/cart.php
https://www.acunetix.com/blog/articles/prevent-sql-injection-vulnerabilities-in-php-applications/
http://testphp.vulnweb.com/hpp/
http://testphp.vulnweb.com/product.php?pic=7
http://testphp.vulnweb.com/guestbook.php
http://testphp.vulnweb.com/listproducts.php?cat=2
http://testphp.vulnweb.com/Details/network-attached-storage-dlink/1/
http://testphp.vulnweb.com/categories.php
http://testphp.vulnweb.com/artists.php
http://www.eclectasy.com/Fractal-Explorer/index.html
http://testphp.vulnweb.com/artists.php?artist=1
http://testphp.vulnweb.com/showimage.php?file=./pictures/5.jpg
http://testphp.vulnweb.com/showimage.php?file=./pictures/4.jpg
http://testphp.vulnweb.com/listproducts.php?artist=1
http://testphp.vulnweb.com/product.php?pic=1
http://testphp.vulnweb.com/showimage.php?file=./pictures/7.jpg
http://testphp.vulnweb.com/userinfo.php
http://testphp.vulnweb.com/product.php?pic=5
http://testphp.vulnweb.com/listproducts.php?artist=3
http://www.acunetix.com
http://testphp.vulnweb.com/showimage.php?file=./pictures/2.jpg
http://testphp.vulnweb.com/Details/color-printer/3/
http://testphp.vulnweb.com/listproducts.php?artist=2
http://testphp.vulnweb.com/disclaimer.php
http://testphp.vulnweb.com/login.php
http://testphp.vulnweb.com/listproducts.php?cat=1
http://testphp.vulnweb.com/artists.php?artist=2
http://testphp.vulnweb.com/showimage.php?file=./pictures/1.jpg
http://testphp.vulnweb.com/Details/web-camera-a4tech/2/
https://www.acunetix.com/vulnerability-scanner/php-security-scanner/
http://testphp.vulnweb.com/listproducts.php?cat=4
http://testphp.vulnweb.com/privacy.php
http://testphp.vulnweb.com/AJAX/index.php
http://testphp.vulnweb.com/listproducts.php?cat=3
https://www.acunetix.com/vulnerability-scanner/
http://testphp.vulnweb.com/signup.php
http://testphp.vulnweb.com/product.php?pic=2
http://testphp.vulnweb.com/showimage.php?file=./pictures/3.jpg
https://www.acunetix.com/
http://testphp.vulnweb.com/index.php
http://testphp.vulnweb.com?pp=12
http://testphp.vulnweb.com/Mod_Rewrite_Shop/
http://testphp.vulnweb.com/artists.php?artist=3
http://blog.mindedsecurity.com/2009/05/client-side-http-parameter-pollution.html
http://testphp.vulnweb.com/product.php?pic=4
二 crawlergo爬取
在官方示例代码上加了几行
#!/usr/bin/python3
# coding: utf-8
import simplejson
import subprocess
def main():
target = "http://testphp.vulnweb.com/"
cmd = ["/home/loser/MySimpleScanner-master-v2/tools/crawlergo", "-c", "/usr/bin/google-chrome", "-o", "json", target]
rsp = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
output, error = rsp.communicate()
# "--[Mission Complete]--" 是任务结束的分隔字符串
result = simplejson.loads(output.decode().split("--[Mission Complete]--")[1])
req_list = result["req_list"]
for req in req_list:
print(req)
#print(req_list[0])
if __name__ == '__main__':
main()
结果:
48条
{'url': 'http://testphp.vulnweb.com/', 'method': 'GET', 'headers': {'Spider-Name': 'crawlergo', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36'}, 'data': '', 'source': 'Target'}
{'url': 'https://testphp.vulnweb.com/', 'method': 'GET', 'headers': {'Spider-Name': 'crawlergo', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36'}, 'data': '', 'source': 'Target'}
{'url': 'http://testphp.vulnweb.com/artists.php', 'method': 'GET', 'headers': {'Referer': 'http://testphp.vulnweb.com/', 'Spider-Name': 'crawlergo', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36'}, 'data': '', 'source': 'DOM'}
{'url': 'http://testphp.vulnweb.com/index.php', 'method': 'GET', 'headers': {'Referer': 'http://testphp.vulnweb.com/', 'Spider-Name': 'crawlergo', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36'}, 'data': '', 'source': 'DOM'}
{'url': 'http://testphp.vulnweb.com/categories.php', 'method': 'GET', 'headers': {'Referer': 'http://testphp.vulnweb.com/', 'Spider-Name': 'crawlergo', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36'}, 'data': '', 'source': 'DOM'}
{'url': 'http://testphp.vulnweb.com/disclaimer.php', 'method': 'GET', 'headers': {'Referer': 'http://testphp.vulnweb.com/', 'Spider-Name': 'crawlergo', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36'}, 'data': '', 'source': 'DOM'}
{'url': 'http://testphp.vulnweb.com/guestbook.php', 'method': 'GET', 'headers': {'Referer': 'http://testphp.vulnweb.com/', 'Spider-Name': 'crawlergo', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36'}, 'data': '', 'source': 'DOM'}
{'url': 'http://testphp.vulnweb.com/AJAX/index.php', 'method': 'GET', 'headers': {'Referer': 'http://testphp.vulnweb.com/', 'Spider-Name': 'crawlergo', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36'}, 'data': '', 'source': 'DOM'}
{'url': 'http://testphp.vulnweb.com/cart.php', 'method': 'GET', 'headers': {'Referer': 'http://testphp.vulnweb.com/', 'Spider-Name': 'crawlergo', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36'}, 'data': '', 'source': 'DOM'}
{'url': 'http://testphp.vulnweb.com/login.php', 'method': 'GET', 'headers': {'Referer': 'http://testphp.vulnweb.com/', 'Spider-Name': 'crawlergo', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36'}, 'data': '', 'source': 'DOM'}
{'url': 'http://testphp.vulnweb.com/userinfo.php', 'method': 'GET', 'headers': {'Referer': 'http://testphp.vulnweb.com/', 'Spider-Name': 'crawlergo', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36'}, 'data': '', 'source': 'DOM'}
{'url': 'http://testphp.vulnweb.com/privacy.php', 'method': 'GET', 'headers': {'Referer': 'http://testphp.vulnweb.com/', 'Spider-Name': 'crawlergo', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36'}, 'data': '', 'source': 'DOM'}
{'url': 'http://testphp.vulnweb.com/hpp/', 'method': 'GET', 'headers': {'Referer': 'http://testphp.vulnweb.com/', 'Spider-Name': 'crawlergo', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36'}, 'data': '', 'source': 'DOM'}
{'url': 'http://testphp.vulnweb.com/Mod_Rewrite_Shop/', 'method': 'GET', 'headers': {'Referer': 'http://testphp.vulnweb.com/', 'Spider-Name': 'crawlergo', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36'}, 'data': '', 'source': 'DOM'}
{'url': 'http://testphp.vulnweb.com/search.php?test=query', 'method': 'POST', 'headers': {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Content-Type': 'application/x-www-form-urlencoded', 'Origin': 'http://testphp.vulnweb.com', 'Referer': 'http://testphp.vulnweb.com/', 'Spider-Name': 'crawlergo', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36'}, 'data': 'searchFor=Crawlergo', 'source': 'XHR'}
{'url': 'http://testphp.vulnweb.com/search.php?test=query', 'method': 'POST', 'headers': {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Content-Type': 'application/x-www-form-urlencoded', 'Origin': 'http://testphp.vulnweb.com', 'Referer': 'http://testphp.vulnweb.com/', 'Spider-Name': 'crawlergo', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36'}, 'data': 'searchFor=Crawlergo&goButton=go', 'source': 'XHR'}
{'url': 'http://testphp.vulnweb.com/signup.php', 'method': 'GET', 'headers': {'Referer': 'http://testphp.vulnweb.com/login.php', 'Spider-Name': 'crawlergo', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36'}, 'data': '', 'source': 'DOM'}
{'url': 'http://testphp.vulnweb.com/userinfo.php', 'method': 'POST', 'headers': {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9', 'Content-Type': 'application/x-www-form-urlencoded', 'Origin': 'http://testphp.vulnweb.com', 'Referer': 'http://testphp.vulnweb.com/login.php', 'Spider-Name': 'crawlergo', 'Upgrade-Insecure-Requests': '1', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36'}, 'data': 'uname=crawlergo%40gmail.com&pass=Crawlergo6.', 'source': 'XHR'}
{'url': 'http://testphp.vulnweb.com/listproducts.php?cat=1', 'method': 'GET', 'headers': {'Referer': 'http://testphp.vulnweb.com/categories.php', 'Spider-Name': 'crawlergo', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36'}, 'data': '', 'source': 'DOM'}
{'url': 'http://testphp.vulnweb.com/artists.php?artist=1', 'method': 'GET', 'headers': {'Referer': 'http://testphp.vulnweb.com/artists.php', 'Spider-Name': 'crawlergo', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36'}, 'data': '', 'source': 'DOM'}
{'url': 'http://testphp.vulnweb.com/comment.php?aid=1', 'method': 'GET', 'headers': {'Referer': 'http://testphp.vulnweb.com/artists.php', 'Spider-Name': 'crawlergo', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36'}, 'data': '', 'source': 'OpenWindow'}
{'url': 'http://testphp.vulnweb.com/AJAX/artists.php', 'method': 'GET', 'headers': {'Accept': '*/*', 'Referer': 'http://testphp.vulnweb.com/AJAX/index.php', 'Spider-Name': 'crawlergo', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.0 Safari/537.36'}, 'data': '', 'source': 'XHR'}
{'url': 'http://testphp.vulnweb.com/AJAX/categories.php', 'method': 'GET&#