Modified webalive (checks whether sites are alive, making later deduplication easier)

Usage:

python38 whichAlive.py -f url.txt -t 5 --proxy 127.0.0.1:8080

python38 whichAlive.py -f url.txt -t 5 -d --try-again
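
url.txt is expected to contain one target per line, scheme included (requests needs the scheme), for example:

https://example.com
http://192.168.1.10:8080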

Modifications:

1. Improved the accuracy of title extraction.

2. When no title can be extracted, an MD5 hash of the response content is computed and stored in the title field (a standalone sketch of this fallback follows the list).
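
For reference, a minimal standalone sketch of the fallback from point 2 (the function name title_or_hash is illustrative; the real logic lives in __getwebtitle in the script below):

import hashlib

import chardet
from bs4 import BeautifulSoup

def title_or_hash(body: bytes) -> str:
    # decode with the detected encoding, falling back to UTF-8
    encoding = chardet.detect(body)['encoding'] or 'utf-8'
    text = body.decode(encoding, errors='replace')
    tag = BeautifulSoup(text, 'html.parser').find('title')
    if tag:
        return tag.get_text(strip=True)
    # no <title>: use the first 8 hex characters of the body's MD5 as a stand-in title
    return f"Hash-{hashlib.md5(body).hexdigest()[:8]}"

print(title_or_hash(b'<html><head><title>demo</title></head></html>'))  # demo
print(title_or_hash(b'{"status": "ok"}'))  # Hash-<first 8 hex characters>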

whichAlive.py code:

import argparse
import csv
import datetime
import hashlib
import os
import re
import socket
import time
import urllib
import urllib.parse
from concurrent.futures import ALL_COMPLETED, ThreadPoolExecutor, wait

import requests
import urllib3
from bs4 import BeautifulSoup
import chardet

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
DEBUG = False
TRYAGAIN = False

class whichAlive(object):
    def __init__(self, file, THREAD_POOL_SIZE=10, allow_redirect=False, PROXY={}):
        self.file = file
        self.filename = ''.join(file.split('/')[-1].split('.')[:-1])
        self.timenow = str(time.time()).split(".")[0]
        self.outfilename = f'{self.filename}{self.timenow}.csv'
        self.errorfilename = f'error_{self.filename}{self.timenow}.txt'
        self.urllist = self.__urlfromfile()
        self.tableheader = ['no', 'url', 'ip', 'state', 'state_code', 'title', 'server', 'length', 'other']
        self.HEADER = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        }
        self.THREAD_POOL_SIZE = THREAD_POOL_SIZE
        self.allurlnumber = len(self.urllist)
        self.completedurl = -1
        self.allow_redirect = allow_redirect
        self.PROXY = PROXY
        # make sure the output directories used below exist
        os.makedirs('result', exist_ok=True)
        os.makedirs('error', exist_ok=True)

    def run(self):
        self.completedurl += 1
        self.__writetofile(self.tableheader)
        tasklist = []
        start_time = datetime.datetime.now()
        t = ThreadPoolExecutor(max_workers=self.THREAD_POOL_SIZE)
        for k, url in enumerate(self.urllist):
            tasklist.append(t.submit(self.__scan, url, k+1))
        print(f'total {self.allurlnumber}')
        if wait(tasklist, return_when=ALL_COMPLETED):
            end_time = datetime.datetime.now()
            print(f'--------------------------------\nDONE, use {(end_time - start_time).seconds} seconds')
            print(f'outfile: {os.path.join(os.path.abspath(os.path.dirname(__file__)), "result", self.outfilename)}')

    def __scan(self, url, no, tryagainflag=False):
        def callback(no, url, ip, state, state_code, title, server, length, other):
            self.completedurl += 1
            thisline = [no, url, ip, state, state_code, title, server, length, other]
            nowpercent = '%.2f'%((self.completedurl/self.allurlnumber)*100)
            print(f'[{nowpercent}%] {url} {ip} {state} {title} {length}')
            self.__writetofile(thisline)

        ip = ''
        state = ''
        state_code = -1
        title = ''
        server = ''
        length = -1
        other = ''
        
        try:
            if DEBUG: print(f'[+] {no} {url}')
            u = urllib.parse.urlparse(url)
            ip = self.__getwebip(u.netloc.split(':')[0])
            
            if self.allow_redirect:
                r = requests.get(url=url, headers=self.HEADER, timeout=15, verify=False, proxies=self.PROXY, allow_redirects=True)
                
                # collect title/length/server for every hop in redirect order,
                # then append the values of the final response
                titles = [self.__getwebtitle(resp) for resp in r.history]
                lengths = [str(self.__getweblength(resp)) for resp in r.history]
                servers = [self.__getwebserver(resp) for resp in r.history]
                titles.append(self.__getwebtitle(r))
                lengths.append(str(self.__getweblength(r)))
                servers.append(self.__getwebserver(r))
                
                state = 'alive'
                state_code = '->'.join([str(i.status_code) for i in r.history] + [str(r.status_code)])
                title = '->'.join(titles)
                length = '->'.join(lengths)
                server = '->'.join(servers)
            else:
                r = requests.get(url=url, headers=self.HEADER, allow_redirects=False, timeout=15, verify=False, proxies=self.PROXY)
                state = 'alive'
                state_code = r.status_code
                title = self.__getwebtitle(r)
                length = self.__getweblength(r)
                server = self.__getwebserver(r)
            
            callback(no, url, ip, state, state_code, title, server, length, other)
        
        except requests.exceptions.ConnectTimeout as e:
            if DEBUG: print(f'[ConnectTimeout] {url} {e}')
            self.__errorreport(str(e))
            state = 'dead'
            callback(no, url, ip, state, state_code, title, server, length, 'ConnectTimeout')
        except requests.exceptions.ReadTimeout as e:
            if DEBUG: print(f'[ReadTimeout] {url} {e}')
            self.__errorreport(str(e))
            state = 'dead'
            callback(no, url, ip, state, state_code, title, server, length, 'ReadTimeout')
        except requests.exceptions.ConnectionError as e:
            if DEBUG: print(f'[ConnectionError] {url} {e}')
            self.__errorreport(str(e))
            state = 'dead'
            callback(no, url, ip, state, state_code, title, server, length, 'ConnectionError')
        except Exception as e:
            if DEBUG: print(f'[ERROR] {no} {url} {e}')
            self.__errorreport(str(e))
            if TRYAGAIN and not tryagainflag:
                self.__scan(url, no, True)
                return  # the retry reports its own result, so don't write a duplicate row
            callback(no, url, ip, state, state_code, title, server, length, str(e))

    def __getwebtitle(self, response):
        try:
            detected_encoding = chardet.detect(response.content)['encoding']
            if detected_encoding is None:
                detected_encoding = 'utf-8'  # fall back to UTF-8 when detection fails

            content = response.content.decode(detected_encoding, errors='replace')
            
            soup = BeautifulSoup(content, 'html.parser')
            title_tag = soup.find('title')
            
            if title_tag:
                return title_tag.get_text(strip=True)
            else:
                # no <title> found: hash the content and use that as the title
                content_hash = hashlib.md5(content.encode(detected_encoding, errors='replace')).hexdigest()
                return f'Hash-{content_hash[:8]}'  # first 8 hex characters of the MD5
        except Exception as e:
            if DEBUG: print(f'[getwebtitle ERROR] {e}')
            # content could not be decoded: return a hash of the raw bytes instead
            content_hash = hashlib.md5(response.content).hexdigest()
            return f'Hash-{content_hash[:8]}'

    def __getwebip(self, domain):
        try:
            ip = socket.getaddrinfo(domain, 'http')
            return ip[0][4][0]
        except:
            return ''

    def __getweblength(self, response):
        try:
            return len(response.content)
        except Exception as e:
            if DEBUG: print(f'[getweblength ERROR] {e}')
            return -1

    def __getwebserver(self, response):
        try:
            return response.headers.get('server') if response.headers.get('server') else ''
        except:
            return ''

    def __urlfromfile(self):
        with open(self.file, 'r') as f:
            return [line.strip() for line in f if line.strip()]  # skip blank lines

    def __writetofile(self, data: list):
        with open(f'result/{self.outfilename}', 'a', newline='', encoding='utf-8') as f:
            writer = csv.writer(f)
            writer.writerow(data)

    def __errorreport(self, message):
        with open(f'error/{self.errorfilename}', 'a', encoding='utf-8') as f:
            f.write(message + '\n')


if __name__ == '__main__':
    parser = argparse.ArgumentParser(usage='whichAlive usage')
    parser.add_argument('-f', '--file', default='url.txt', help='URL lists file.')
    parser.add_argument('--proxy', default='', help='Set proxy, such as 127.0.0.1:8080')
    parser.add_argument('-t', '--thread', default=10, type=int, help='Set max threads, default 10')
    parser.add_argument('-d', '--debug', default=False, action='store_true', help='print some debug information')
    parser.add_argument('--try-again', default=False, action='store_true', help='If some error, try again scan that url once', dest='tryagain')
    args = parser.parse_args()

    DEBUG = args.debug
    TRYAGAIN = args.tryagain

    w = whichAlive(
        file=args.file,
        THREAD_POOL_SIZE=args.thread,
        allow_redirect=True,
        PROXY={'http': args.proxy, 'https': args.proxy} if args.proxy else {}
    )
    w.run()

MySQL deduplication statements, applied to the whichAlive results (the result/<name><timestamp>.csv produced above):

Navicat import: for the encoding, choose the first option (system default encoding), and set the delimiter to none; this imports all of the whichAlive results.
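
If you would rather skip Navicat, here is a rough Python sketch of the same import using pymysql (the connection parameters, the results2 table layout and the file name are assumptions; adjust them to your environment):

import csv

import pymysql  # pip install pymysql

# assumed connection details, change to match your MySQL instance
conn = pymysql.connect(host='127.0.0.1', user='root', password='password',
                       database='scan', charset='utf8mb4')
cur = conn.cursor()

# table layout mirrors the whichAlive CSV header
cur.execute("""
    CREATE TABLE IF NOT EXISTS results2 (
        `no` VARCHAR(16), `url` VARCHAR(512), `ip` VARCHAR(64), `state` VARCHAR(16),
        `state_code` VARCHAR(64), `title` VARCHAR(512), `server` VARCHAR(128),
        `length` VARCHAR(64), `other` VARCHAR(512)
    )
""")

with open('result/url1700000000.csv', newline='', encoding='utf-8') as f:  # hypothetical output file
    rows = list(csv.reader(f))[1:]  # skip the CSV header row
cur.executemany("INSERT INTO results2 VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)", rows)
conn.commit()
conn.close()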


1. Add an auto-increment id to every row:

ALTER TABLE results2 ADD id INT(4) NOT NULL PRIMARY KEY AUTO_INCREMENT FIRST;


2. Remove duplicates based on the `ip`, `state_code`, `title`, `Length`, and `server` columns, keeping the row with the largest id (a pandas alternative is sketched after this list):

DELETE FROM results2 WHERE id NOT IN (SELECT id  FROM (SELECT MAX(id) id FROM results2 GROUP BY `ip`,`state_code`,`title`,`Length`,`server`) cc);
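
If you do not want to go through MySQL at all, the same deduplication can be done directly on the whichAlive CSV with pandas; keep='last' plays the role of keeping the largest id (file names here are illustrative):

import pandas as pd

df = pd.read_csv('result/url1700000000.csv')  # hypothetical whichAlive output
# drop rows that share ip, state_code, title, length and server, keeping the last occurrence
deduped = df.drop_duplicates(subset=['ip', 'state_code', 'title', 'length', 'server'], keep='last')
deduped.to_csv('result/url_deduped.csv', index=False)
print(f'{len(df)} rows -> {len(deduped)} rows')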
