0x00 准备工作
- 补天账号
- python3运行环境
- requests等第三方库
0x01 流程分析
分别查看专属SRC、企业SRC、公益SRC对应URL,发现没有变化。初步判断网站使用的是 Ajax,即异步的 JavaScript 和 XML。
进入公益SRC,查看不同页码对应的URL,仍然没有变化。
随机选取一个厂商,点击提交漏洞。发现URL发生变化,且找到了我们想要爬取的厂商名称和域名。
分析不同厂商提交漏洞页面的URL,发现每一个厂商都有其对应的 cid,关键是如何获取这一参数。
# Submit-page URL template; {c_id} is the company cid obtained from the listing API.
index_url = 'https://www.butian.net/Loo/submit?cid={c_id}'
Ajax是利用JavaScript在保证页面不被刷新、页面链接不被改变的情况下,与服务器进行数据交换并更新部分网页内容的技术。它实现了前后端的分离,降低了服务器直接渲染页面带来的压力。解析流程如下:
- 发送请求:XMLHttpRequest
- 解析内容:请求得到响应,触发 onreadystatechange 属性对应方法,返回相关内容。返回的内容可能是 HTML,也可能是 Json。如果返回的内容是 Json 的话,我们可以通过 JavaScript 进一步处理,对它进行解析和转化。
- 渲染网页: 例如通过 document.getElementById().innerHTML = XMLHttpRequest.responseText 对某元素内容进行更改,即DOM操作。
In a word,JavaScript 向服务器发送一个 Ajax 请求,请求得到响应后返回新的数据,数据通过 Ajax 加载,JavaScript 在后台调用 Ajax 函数接口得到数据,再对数据进行解析并渲染呈现。要想爬取页面信息,可以直接爬取 Ajax 接口获取数据。
0x02 脚本编写
一、基于页码,爬取厂商CID列表:
def get_company_id(data):
    """POST the listing form to the Butian reward API and return the JSON payload.

    data: form fields {'s': ..., 'p': page, 'token': ...}.
    Returns the decoded JSON dict on HTTP 200, otherwise None (after logging).
    """
    url = 'https://www.butian.net/Reward/pub'  # fix: `url` was undefined in the log calls below
    try:
        reponse = requests.post(url, data=data, timeout=(4, 20))
        if reponse.status_code == 200:
            return reponse.json()
        logging.error('get invalid status_code %s while scraping %s', reponse.status_code, url)
    except requests.RequestException:
        logging.error('error occurred while scraping %s', url, exc_info=True)
def scrape_index_company_id(page):
    """Request one page of the public-welfare SRC company listing."""
    form = {'s': '1', 'p': page, 'token': ''}  # token is empty for public SRC
    return get_company_id(form)
def scrape_parse_company_id(json):
    """Split a listing response into (company list, total page count)."""
    payload = json['data']
    return payload['list'], payload['count']
Ajax 的请求接口通常包含加密参数,如 token、sign 等,但此处 token 为空,故直接使用 requests 即可。当 token 不为空时,通常有两种方法解决:
- 深挖其中的逻辑,把 token 的逻辑完全找出来,再用 python 复现。
- 直接通过 selenium 模拟浏览器的方式绕过这个过程。
二、基于厂商CID,爬取厂商名称和域名:
def get_domain(url):
    """GET *url* with the shared session headers and return the HTML text.

    Returns None (after logging) on a non-200 status or a request failure.
    """
    try:
        resp = requests.get(url, headers=headers)
    except requests.RequestException:
        logging.error('error occurred while scraping %s', url, exc_info=True)
        return None
    if resp.status_code == 200:
        return resp.text
    logging.error('get invalid status_code %s while scraping %s', resp.status_code, url)
def scrape_index_domain(company_id):
    """Download the vulnerability-submit page for one company cid."""
    return get_domain(index_url.format(c_id=company_id))
def scrape_parse_domain(html):
    """Pull (company name, registered URL) out of a submit-page HTML document.

    Falls back to the placeholder "www.null.com" when the host field is absent.
    """
    doc = pq(html)
    name = doc('input#inputCompy.input-xlarge').attr('value')
    # Narrowed from a bare `except`: only a failed match should trigger the fallback.
    match = re.search('type="text" name="host"[^>]+value="([^\"]+)"', html)
    url = match.group(1) if match else "www.null.com"
    return name, url
三、保存厂商名称和域名:
def save_log(name, domain):
    """Append one `name<TAB>domain` record to target.txt.

    Uses a context manager so the handle is closed even if write() raises,
    and an explicit UTF-8 encoding since company names are Chinese.
    (Filename unified with the final script, which writes target.txt.)
    """
    with open("target.txt", 'a+', encoding='utf-8') as out:
        out.write(name + "\t" + domain + "\n")
- r:只读,不创建(若文件不存在则报错)
- r+:覆盖读写,不创建(若文件不存在则报错)
- w:只写,新建(将原文件内容清空)
- w+:读写,新建(将原文件内容清空)
- a:附加写(若原文件不存在则创建)
- a+:附加读写(若原文件不存在则创建)
0x03 多线程
# Spawn one worker thread per listing page (1..187), staggered to limit request rate.
threads = []
for page in range(1,188):
    time.sleep(10)  # throttle thread creation so requests are spread out
    thread = threading.Thread(target=main,args=(page,))
    thread.start()
    threads.append(thread)
# Block until every page worker has finished.
for thread in threads:
    thread.join()
0x04 多进程
# Process-pool variant: 4 workers map over listing pages 1..187.
pool = multiprocessing.Pool(4)
pages = range(1,188)
pool.map(main,pages)
pool.close()  # no further tasks will be submitted
pool.join()   # wait for all workers to drain
0x05 The_Fu11_Scr1pt
import re
import sys
import json
import time
import logging
import requests
import argparse
import threading
import tldextract
from pyquery import PyQuery as pq
from multiprocessing import Pool,Lock
count = 0  # running total of companies saved (updated by Normal / Multi_thread)
logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(levelname)s :%(message)s')
# Template for a company's vulnerability-submit page; cid comes from the listing API.
index_url = 'https://www.butian.net/Loo/submit?cid={c_id}'
def read_cookie():
    """Return the raw Cookie header value stored in cookie.txt next to this script."""
    return open('cookie.txt').read()
# Browser-like headers; the authenticated session cookie is read from disk at import time.
headers={
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
    'Connection': 'close',
    'Cookie': read_cookie(),
    'Host': 'www.butian.net',
    'Referer': 'https://www.butian.net/Reward/plan/1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:92.0) Gecko/20100101 Firefox/92.0'
}
def get_company_id(data):
    """POST the listing form to the Butian reward API and return the JSON payload.

    data: form fields {'s': ..., 'p': page, 'token': ...}.
    Returns the decoded JSON dict on HTTP 200, otherwise None (after logging).
    """
    url = 'https://www.butian.net/Reward/pub'  # fix: `url` was undefined in the log calls below
    try:
        reponse = requests.post(url, data=data, timeout=(4, 20))
        if reponse.status_code == 200:
            return reponse.json()
        logging.error('get invalid status_code %s while scraping %s', reponse.status_code, url)
    except requests.RequestException:
        logging.error('error occurred while scraping %s', url, exc_info=True)
def scrape_index_company_id(page):
    """Request one page of the public-welfare SRC company listing."""
    form = {'s': '1', 'p': page, 'token': ''}  # token is empty for public SRC
    return get_company_id(form)
def scrape_parse_company_id(json):
    """Split a listing response into (company list, total page count)."""
    payload = json['data']
    return payload['list'], payload['count']
def get_domain(url):
    """GET *url* with the shared session headers and return the HTML text.

    Returns None (after logging) on a non-200 status or a request failure.
    """
    try:
        resp = requests.get(url, headers=headers)
    except requests.RequestException:
        logging.error('error occurred while scraping %s', url, exc_info=True)
        return None
    if resp.status_code == 200:
        return resp.text
    logging.error('get invalid status_code %s while scraping %s', resp.status_code, url)
def scrape_index_domain(company_id):
    """Download the vulnerability-submit page for one company cid."""
    return get_domain(index_url.format(c_id=company_id))
def scrape_parse_domain(html):
    """Pull (company name, registered URL) out of a submit-page HTML document.

    Falls back to the placeholder "www.null.com" when the host field is absent.
    """
    doc = pq(html)
    name = doc('input#inputCompy.input-xlarge').attr('value')
    # Narrowed from a bare `except`: only a failed match should trigger the fallback.
    match = re.search('type="text" name="host"[^>]+value="([^\"]+)"', html)
    url = match.group(1) if match else "www.null.com"
    return name, url
def save_log(name, domain):
    """Append one `name<TAB>domain` record to target.txt.

    Context manager + explicit UTF-8: company names are Chinese, and the
    original leaked the file handle if write() raised.
    """
    with open("target.txt", 'a+', encoding='utf-8') as out:
        out.write(name + "\t" + domain + "\n")
def Normal():
    """Sequentially scrape every listing page, saving each company's name+domain."""
    page = 0
    global count
    while True:
        page = page +1
        json = scrape_index_company_id(page)  # one page of the company listing
        company_list,page_num = scrape_parse_company_id(json)
        print ("\033[0;32;40m[+]Scraping page: \033[0m" + str(page) + "\t \033[0;32;40mTotal number: \033[0m" + str(len(company_list)))
        for company in company_list:
            count = count + 1
            time.sleep(1)  # throttle the per-company detail requests
            company_id = company['company_id']
            html = scrape_index_domain(company_id)
            name,url = scrape_parse_domain(html)
            # Reduce whatever URL the vendor registered to its registrable domain.
            domain = tldextract.extract(url).registered_domain
            save_log(name,domain)
            print ("[+]Saving company ID: " + str(company_id) + "\t Name: " + name + "\t Domain: " + domain + "\t Count: " + str(count))
        if page >= page_num:
            break
def Multi_process(page):
    """Pool worker: scrape one listing page and save its companies.

    NOTE(review): the threading.Lock created here is local to a single call in
    a single process, so it does not actually serialize output between pool
    workers — prints from different processes may still interleave. Confirm
    whether cross-process ordering matters before relying on it.
    """
    time.sleep(3)  # stagger workers to limit request rate
    lock = threading.Lock()
    json = scrape_index_company_id(page)
    company_list,page_num = scrape_parse_company_id(json)
    lock.acquire()
    print ("\033[0;32;40m[+]Scraping page: \033[0m" + str(page) + "\t \033[0;32;40mTotal number: \033[0m" + str(len(company_list)))
    lock.release()
    for company in company_list:
        time.sleep(3)  # per-company throttle
        company_id = company['company_id']
        html = scrape_index_domain(company_id)
        name,url = scrape_parse_domain(html)
        domain = tldextract.extract(url).registered_domain
        save_log(name,domain)
        lock.acquire()
        print ("[+]Saving company ID: " + str(company_id) + "\t URL: " + url + "\t Domain: " + domain)
        lock.release()
def Multi_thread(page, _lock=threading.Lock()):
    """Thread worker: scrape one listing page and save its companies.

    _lock deliberately uses a mutable default so every thread shares ONE lock:
    the original built a fresh Lock inside each call, which synchronized
    nothing and left the `count` read-modify-write racy across threads.
    """
    time.sleep(8)  # stagger start-up to limit request rate
    global count
    print(f'Threading {threading.current_thread().name} is running')
    json = scrape_index_company_id(page)
    company_list, page_num = scrape_parse_company_id(json)
    print("\033[0;32;40m[+]Scraping page: \033[0m" + str(page) + "\t \033[0;32;40mTotal number: \033[0m" + str(len(company_list)))
    for company in company_list:
        with _lock:  # guard the shared counter
            count = count + 1
        time.sleep(8)  # per-company throttle
        company_id = company['company_id']
        html = scrape_index_domain(company_id)
        name, url = scrape_parse_domain(html)
        domain = tldextract.extract(url).registered_domain
        save_log(name, domain)
        print("[+]Saving company ID: " + str(company_id) + "\t URL: " + url + "\t Domain: " + domain + "\t Count: " + str(count))
    print(f'Threading {threading.current_thread().name} is ended')
def parser_error(errmsg):
    """argparse error hook: show the error plus a usage hint and exit non-zero.

    The original discarded errmsg and exited with status 0, so shells and
    callers could not distinguish a failed parse from success.
    """
    print("Error: " + errmsg)
    print("Usage: python " + sys.argv[0] + " [Options] use -h for help")
    sys.exit(1)
def parse_args():
    """Build the CLI parser and return the parsed arguments."""
    example = "\tExample: \r\npython " + sys.argv[0] + " -t"
    parser = argparse.ArgumentParser(epilog=example)
    parser.error = parser_error  # route parse failures through our own handler
    parser._optionals.title = "OPTIONS"
    parser.add_argument(
        '-t', '--type',
        help='Normal or Multi-process or Multi-thread. Normal :1 , Multi-process:2 , Multi-thread:3',
        default="1",
        required=False,
    )
    return parser.parse_args()
if __name__ == '__main__':
    # Dispatch on the -t flag: 1 = sequential, 2 = process pool, 3 = threads.
    args = parse_args()
    run_mode = args.type  # renamed from `type` (shadowed the builtin)
    if run_mode == "1":
        Normal()
    elif run_mode == "2":
        pool = Pool(4)
        pool.map(Multi_process, range(1, 188))
        pool.close()
        pool.join()
    elif run_mode == "3":
        workers = []
        for page in range(1, 188):
            time.sleep(6)  # stagger thread creation
            worker = threading.Thread(target=Multi_thread, args=(page,))
            worker.start()
            workers.append(worker)
        for worker in workers:
            worker.join()
    else:
        print("invalid type")
食用指北:python get_butian.py -h
=> python get_butian.py -t 1/2/3
0x06 Extension
一、反屏蔽:
from selenium import webdriver
from selenium.webdriver import ChromeOptions

# Hide the automation fingerprints Chrome exposes to anti-bot checks.
option = ChromeOptions()
# fix: option key is 'excludeSwitches' (the original 'excludeSwitchese' is silently ignored)
option.add_experimental_option('excludeSwitches', ['enable-automation'])
option.add_experimental_option('useAutomationExtension', False)
browser = webdriver.Chrome(options=option)
# fix: CDP command is 'Page.addScriptToEvaluateOnNewDocument' (capital D)
browser.execute_cdp_cmd(
    'Page.addScriptToEvaluateOnNewDocument',
    {'source': 'Object.defineProperty(navigator,"webdriver",{get:()=>undefined})'},
)
二、无头模式:
from selenium import webdriver
# fix: ChromeOptions lives in selenium.webdriver, not the selenium package root
from selenium.webdriver import ChromeOptions

# Run Chrome headless (no visible window) at a fixed viewport size.
option = ChromeOptions()
option.add_argument('--headless')
browser = webdriver.Chrome(options=option)
browser.set_window_size(1366, 768)
三、显式等待:
# fix: module is selenium.webdriver.common.by ('commom' was a typo -> ImportError)
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

wait = WebDriverWait(browser, 10)  # explicit wait with a 10 s ceiling
def scrape_page(url,condition,locator):
四、隐式等待
browser.implicitly_wait(10)
0x07 Extension_Script
进入公益SRC页面:
def scrape_page(url, condition, locator):
    """Open *url* and block until `condition(locator)` holds (bounded by `wait`)."""
    try:
        browser.get(url)
        wait.until(condition(locator))
    except TimeoutException:
        # fix: log message previously read 'sraping'
        logging.error('error occurred while scraping %s', url, exc_info=True)
def scrape_index():
    """Open the reward-plan page and wait for the 公益SRC tab to be clickable."""
    target = urljoin(base_url, '/Reward/plan/1')
    scrape_page(target, condition=EC.element_to_be_clickable,
                locator=(By.LINK_TEXT, '公益SRC'))
def parse_index():
    """Click through to the 公益SRC (public-welfare SRC) listing."""
    browser.find_element_by_link_text('公益SRC').click()
实现翻页:
def next_page():
    """Advance the listing to the next page via the pagination control."""
    next = browser.find_element_by_class_name('next')
    # Wait for the row buttons to be interactable before paging on.
    wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'a.btn')))
    next.click()
基于翻页,爬取厂商提交漏洞页面的URL:
def main():
    """Walk every listing page, printing each company's submit-page URL."""
    scrape_index()
    parse_index()
    while True:  # loops until next_page() fails on the last page
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR,'a.btn')))
        elements = browser.find_elements_by_css_selector('a.btn')
        for element in elements:
            href = element.get_attribute('href')
            print(href)
        time.sleep(5)  # throttle paging
        next_page()
基于URL,爬取厂商名称和域名:
print(href) => get_domain(href)
The_fu11_extensi0n_scr1pt:
import time
import logging
from selenium import webdriver
from urllib.parse import urljoin
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# Shared driver + explicit-wait handle for the whole crawl session.
browser = webdriver.Chrome()
wait = WebDriverWait(browser,10)
base_url = 'https://www.butian.net'
logging.basicConfig(level=logging.INFO,format='%(asctime)s-%(levelname)s:%(message)s')
def scrape_page(url, condition, locator):
    """Open *url* and block until `condition(locator)` holds (bounded by `wait`)."""
    try:
        browser.get(url)
        wait.until(condition(locator))
    except TimeoutException:
        # fix: log message previously read 'sraping'
        logging.error('error occurred while scraping %s', url, exc_info=True)
def scrape_index():
    """Open the reward-plan page and wait for the 公益SRC tab to be clickable."""
    target = urljoin(base_url, '/Reward/plan/1')
    scrape_page(target, condition=EC.element_to_be_clickable,
                locator=(By.LINK_TEXT, '公益SRC'))
def parse_index():
    """Click through to the 公益SRC (public-welfare SRC) listing."""
    browser.find_element_by_link_text('公益SRC').click()
def next_page():
    """Click the pagination 'next' control, then wait for the new rows to load."""
    # Renamed from `next` to avoid shadowing the builtin.
    next_button = browser.find_element_by_class_name('next')
    try:
        next_button.click()
        wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, 'a.btn')))
    except Exception as e:
        # fix: `except Exceptions` was a NameError (undefined name)
        print("Exception found", format(e))
def main():
    """Crawl every listing page, printing the submit-page URL of each company."""
    scrape_index()
    parse_index()
    while True:
        for anchor in browser.find_elements_by_css_selector('a.btn'):
            print(anchor.get_attribute('href'))
        time.sleep(5)
        next_page()

if __name__=='__main__':
    main()
此部分作为拓展,当 token 不为空时,可以将该脚本与前面的 get_domain、scrape_parse_domain、save_log 等函数进行串联,从而实现对厂商名称和域名的爬取。
0x08 Summary
Information gathering is the first and most critical step.