Get_Web_banner(批量获取网站banner)

环境:Python3

get_banner.py:

 

import chardet
import requests,re
from threading import Thread,activeCount
from sys import argv
from queue import Queue


# Silence urllib3 warnings; the requests in this script are sent with verify=False.
requests.packages.urllib3.disable_warnings()
# De-duplicated target URLs; filled in by the __main__ section of this script.
new_targets = []

def get_banner(url):
    """Fetch one site, print its banner and append it via ``write_info``.

    The banner consists of the HTTP status code, the page ``<title>`` and the
    ``Server``, ``Content-Type`` and ``X-Powered-By`` response headers.

    A 3xx answer that merely upgrades the same address to ``https://`` is
    followed and the banner of the HTTPS page is reported; any other 3xx
    answer is reported together with its ``Location`` header.

    Args:
        url: Host or URL to probe. ``http://`` is prepended when the value
            carries no scheme.

    Returns:
        None. Results are printed and written with ``write_info``; request
        failures are printed as ``[-]Error`` lines instead of being raised.
    """
    target = _normalize_target(url)
    try:
        req = requests.get(target, verify=False, allow_redirects=False, timeout=(5, 20))
        _fix_encoding(req)
        code = req.status_code

        # Only real 3xx codes are redirects; a substring test on '30' would
        # also match 130, 230, 430 or 530.
        if code // 100 != 3:
            title, server, content_type, x_powered_by = _extract_banner(req)
            write_info(target, code, title, server, content_type, x_powered_by)
            print('[+] {} {} {} {} {}'.format(code, target, title, server, x_powered_by))
            return

        # A 3xx response is not guaranteed to carry a Location header.
        location = req.headers.get('Location', '')
        # Drop the scheme as a prefix. str.strip('http://') would remove any
        # of the characters h, t, p, ':' and '/' from both ends of the host.
        https_url = 'https://' + target.split('://', 1)[1]

        # Ignore a trailing slash so 'https://host' and 'https://host/' both
        # count as a plain HTTPS upgrade of the same address.
        if location.rstrip('/') == https_url.rstrip('/'):
            req_30x = requests.get(https_url, verify=False, timeout=(5, 20))
            _fix_encoding(req_30x)
            code_30x = str(req_30x.status_code).strip()
            title_30x, server_30x, type_30x, x_powered_by_30x = _extract_banner(req_30x)
            print('[+] {} {} {} {} {} {} '.format(
                code_30x, target, title_30x, server_30x, type_30x, x_powered_by_30x))
            write_info(target, code_30x, title_30x, server_30x, type_30x, x_powered_by_30x)
        else:
            title = '302_redirection'
            print('[+] {} {} {} Location:{}'.format(code, target, title, location))
            write_info(target, code, title, location=location)
    except Exception as e:
        # Best-effort scan: report the failure for this target and carry on.
        print('[-]Error {} {} '.format(target, str(e)))


def _normalize_target(url):
    """Return *url* stripped, with ``http://`` prepended if it has no scheme."""
    url = url.strip()
    if url.startswith(('http://', 'https://')):
        return url
    return 'http://' + url


def _fix_encoding(resp):
    """Guess the body encoding when Content-Type declares no charset."""
    if 'charset' not in resp.headers.get('Content-Type', ' '):
        resp.encoding = chardet.detect(resp.content).get('encoding')


def _extract_banner(resp):
    """Return ``(title, server, content_type, x_powered_by)`` for a response."""
    found = re.search(r'<title>(.*?)</title>', resp.text, re.S)
    title = found.group(1).strip() if found else 'None'
    server = resp.headers.get('Server', '').strip()
    content_type = resp.headers.get('Content-Type', '').strip()
    x_powered_by = resp.headers.get('X-Powered-By', '').strip()
    return title, server, content_type, x_powered_by


def write_info(url,code,title='',server='',type='',x_power_by='',location=''):
    """Append one space-separated banner record to ``websites_banner.txt``.

    The record layout is
    ``code url title server type x_power_by location`` followed by a newline.
    Empty fields are written as empty strings, so consecutive spaces appear
    where a field is missing.

    Args:
        url: Address that was probed.
        code: HTTP status code (int or str).
        title: Page title.
        server: Value of the ``Server`` header.
        type: Value of the ``Content-Type`` header. The name shadows the
            builtin but is kept because it is part of the signature.
        x_power_by: Value of the ``X-Powered-By`` header.
        location: Redirect target, if any.
    """
    # Seven placeholders for seven values: with only six, `location` was
    # silently dropped from every record.
    record = '{} {} {} {} {} {} {}\n'.format(
        code, url, title, server, type, x_power_by, location)
    # Explicit UTF-8 so non-ASCII page titles do not fail on other locales.
    with open('websites_banner.txt', 'a', encoding='utf-8') as f:
        f.write(record)

if __name__ == '__main__':
    # Script entry point.
    # Usage: python3 get_banner.py urls.txt new_urls.txt
    #   urls.txt     - input file, one host/URL per line
    #   new_urls.txt - receives the de-duplicated list (appended)
    # Local import: only the entry point needs the thread pool.
    from concurrent.futures import ThreadPoolExecutor

    if len(argv) < 3:
        print('Usage:python3 get_banner.py urls.txt new_urls.txt')
    else:
        filename = argv[1]
        new_filename = argv[2]

        # Read-only access is enough for the input list.
        with open(filename, 'r') as f:
            # dict.fromkeys de-duplicates in O(n) and keeps first-seen order.
            # Blank lines are skipped so they are not probed as empty hosts.
            unique_urls = dict.fromkeys(
                line.strip() for line in f if line.strip())
        new_targets.extend(unique_urls)

        # Open the output once instead of once per URL.
        with open(new_filename, 'a+') as f:
            f.writelines(new_url + '\n' for new_url in new_targets)

        # A bounded pool of 10 workers replaces the busy-wait on
        # activeCount(); leaving the `with` block waits for all probes.
        with ThreadPoolExecutor(max_workers=10) as pool:
            for new_url in new_targets:
                pool.submit(get_banner, new_url)

使用方法:

1.在当前目录下的urls.txt中放入需要获取banner的url(可有http可无http)。

 

update:python3 get_banner.py urls.txt

 

2019.4.24 update:

1.对跳转进行优化,判断如果跳转后为https://加上原域名,则继续获取标题,并且添加上了headers。

2. Referer可根据爬取的情况自行修改。

2019.5.21 update:

加入多线程,修改使用方法。

 

Use:python3 get_banner.py urls.txt new_urls.txt

 

转载于:https://www.cnblogs.com/P1g3/p/10735233.html

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值