A Lightweight URL-Based Malicious Page Detection Classifier

    This post collects code from my earlier research work and is kept here for later study. Malicious page detection is already a well-studied problem; this code follows ideas from several published papers, so I will not elaborate on the background and simply post the code.

#!/usr/bin/python
# -*- coding: utf-8 -*-

'''
__Author__: 沂水寒城
Function: extract URL features for training machine-learning models
'''
import os
import re
from urlparse import urlparse
from urllib import splitport
from publicsuffix import PublicSuffixList   # used by the domain_split() sketch at the end

RED_KEYWORDS=["account", "admin", "administrator","auth", "bank", "client", "confirm", "email", 
              "host","password", "pay", "private", "safe", "secure", "security", "sign", "user", 
              "validation", "verification", "icbc"]
PATH_KEYWORDS = ["www", "net", "com", "cn"]


def url_split(url):
    '''
    Split a URL into its component parts.
    '''
    if not url.startswith('http') and not url.startswith('ftp'):
        url = 'http://' + url
    parts = urlparse(url)
    # domain_split() is not part of this listing; see the sketch at the end of the post.
    server, host, top, hostname = domain_split(parts.netloc)
    host, port = splitport(host)
    if port is None:
        port = ''
    return {'scheme': parts.scheme, 'server': server, 'host': host,
            'port': port, 'top': top, 'path': parts.path, 'domain': host + top,
            'params': parts.params, 'query': parts.query, 'fragment': parts.fragment}
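# Example (assuming the domain_split() sketch at the end of this post):
#   url_split('http://www.example.com/a/b?q=1')
#   -> {'scheme': 'http', 'server': 'www', 'host': 'www.example',
#       'port': '', 'top': '.com', 'path': '/a/b', 'domain': 'www.example.com',
#       'params': '', 'query': 'q=1', 'fragment': ''}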


def url_split_new(url):
    '''
    Split a URL into protocol, hostname and path only.
    '''
    if not url.startswith('http'):
        url = 'http://' + url
    parts = urlparse(url)
    server, host, top, hostname = domain_split(parts.netloc)
    host, port = splitport(host)
    if port is None:
        port = ''
    return {'protocol': parts.scheme, 'hostname': hostname, 'path': parts.path}
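# Example (same domain_split() assumption as above):
#   url_split_new('www.example.com/a')
#   -> {'protocol': 'http', 'hostname': 'www.example.com', 'path': '/a'}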


# URL-to-path features
# NOTE: the parameterless feature functions below read the module-level
# globals new_result_dict (from url_split_new) and old_result_dict (from
# url_split); the caller is expected to populate them first, and they are
# not initialized anywhere in this excerpt.
def len_procotol2path_os(url):
    # Length of everything before the last path component.
    return len(os.path.split(url)[0])


def len_procotol2path():
    # Length of the protocol-plus-hostname prefix, e.g. 'http://www.example.com'.
    string = new_result_dict['protocol'] + '://' + new_result_dict['hostname']
    return len(string)


def ip_exist(url):
    # Whether the URL contains an IPv4-style dotted quad.
    compile_rule = re.compile(r'\d+\.\d+\.\d+\.\d+')
    result_list = re.findall(compile_rule, url)
    if result_list:
        return 1
    return 0

# Hostname features
def len_hostname():
    # Length of the hostname from new_result_dict.
    return len(new_result_dict['hostname'])


def www_exist(url):
    # Whether the URL contains 'www'.
    return 1 if 'www' in url else 0


def TLD_exist():
    # Whether the domain has three or more labels (i.e. a subdomain exists).
    Hostname = old_result_dict['domain']
    Hostname_list = Hostname.split('.')
    if len(Hostname_list) >= 3:
        return 1
    else:
        return 0


def SLD_decimal():
    # Whether the second-level domain label contains decimal digits.
    Hostname = old_result_dict['domain']
    Hostname_list = Hostname.split('.')
    SLD = Hostname_list[-2]
    compile_rule = re.compile(r'\d+')
    sld_list = re.findall(compile_rule, SLD)
    if len(sld_list) != 0:
        return 1
    else:
        return 0


def TLD_decimal():
    # Whether the third-level domain label contains decimal digits.
    Hostname = old_result_dict['domain']
    Hostname_list = Hostname.split('.')
    if len(Hostname_list) >= 3:
        TLD = Hostname_list[-3]
        compile_rule = re.compile(r'\d+')
        tld_list = re.findall(compile_rule, TLD)
        if len(tld_list) != 0:
            return 1
    return 0


def TLD_hex(url):
    # Whether the third-level domain label contains a hex-style string
    # (assuming the intended pattern is a 0x-prefixed hex number).
    result = url_split(url.strip())
    Hostname = result['domain']
    Hostname_list = Hostname.split('.')
    if len(Hostname_list) >= 3:
        TLD = Hostname_list[-3]
        compile_rule = re.compile(r'0x[0-9a-fA-F]+')
        tld_list = re.findall(compile_rule, TLD)
        if len(tld_list) != 0:
            return 1
    return 0


def len_path():
    # Length of the URL path.
    return len(new_result_dict['path'])


def num_directory_max():
    # Number of path directories and the length of the longest one.
    if new_result_dict['path'] == '/':
        return [0, 0]
    else:
        num_directory_list = new_result_dict['path'][1:].split('/')
        return [len(num_directory_list), max([len(d) for d in num_directory_list])]

def path_exist_date():
    # Whether the path contains a date-like pattern such as 2019-01-01.
    path = new_result_dict['path']
    compile_rule = re.compile(r'\d+-\d+-\d+')
    date_list = re.findall(compile_rule, path)
    if len(date_list) != 0:
        return 1
    else:
        return 0


def path_exist_hex():
    # Whether the path contains a hex-style string (same 0x-prefix
    # assumption as in TLD_hex).
    path = old_result_dict['path']
    compile_rule = re.compile(r'0x[0-9a-fA-F]+')
    path_list = re.findall(compile_rule, path)
    if len(path_list) != 0:
        return 1
    else:
        return 0


def geturlat(url):
    '''
    Check whether the URL contains symbols such as @, ?, -, _ or ~.
    '''
    re_script = re.compile(r'@|-|_|\?|~')
    return 1 if re_script.search(url) else 0


def geturldot(url):
    '''
    Count the number of dots in the URL.
    '''
    return url.count('.')


def get_url_length(url):
    '''
    Get the total length of the URL.
    '''
    return len(url)


def get_url_number_length(url):
    '''
    Get the length of the longest digit string in the URL.
    '''
    match = re.findall(r'\d+', url)
    if match:
        return max(len(m) for m in match)
    return 0


def get_red_keyword(url):
    '''
    Check whether the URL contains any sensitive keyword.
    '''
    url = url.lower()
    for key in RED_KEYWORDS:
        if url.find(key) != -1:
            return 1
    return 0


def get_path_key(url):
    '''
    Check whether the URL path contains any sensitive keyword.
    '''
    url_parse = urlparse(url)
    path = url_parse.path
    if path:
        for key in PATH_KEYWORDS:
            if path.lower().find(key) != -1:
                return 1
    return 0


def get_url_vector(url):
    '''
    Collect all URL features into one vector.
    '''
    heuristic_vector = []
    heuristic_vector.append(geturlat(url))
    heuristic_vector.append(geturldot(url))
    heuristic_vector.append(get_url_length(url))
    heuristic_vector.append(get_url_number_length(url))
    heuristic_vector.append(get_red_keyword(url))
    return heuristic_vector
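# Example: get_url_vector('http://www.baidu.com') -> [0, 2, 20, 0, 0]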


if __name__ == '__main__':
    test_url = 'http://www.baidu.com'
    print get_url_vector(test_url)
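
The listing calls domain_split() but never defines it; it presumably lived in another file of the original project. Below is a minimal sketch that is consistent with how url_split() and url_split_new() consume its return values, assuming the imported PublicSuffixList is what the original relied on. The exact split is an inference, not the author's code:

_psl = PublicSuffixList()

def domain_split(netloc):
    # Split a netloc into (server, host, top, hostname) such that
    # host + top reconstructs the hostname, e.g. 'www.example.com'
    # -> ('www', 'www.example', '.com', 'www.example.com').
    hostname = netloc.split(':')[0]             # ports are dropped here, so the
                                                # later splitport() call is a no-op
    suffix = _psl.get_public_suffix(hostname)   # e.g. 'example.com'
    top = suffix[suffix.find('.'):] if '.' in suffix else ''
    host = hostname[:-len(top)] if top else hostname
    server = hostname.split('.')[0]
    return server, host, top, hostname

The module docstring says these features are meant to train a machine-learning model, but the post stops at feature extraction. Here is a minimal sketch of that training step with scikit-learn, using hypothetical URLs and labels purely for illustration:

from sklearn.ensemble import RandomForestClassifier

# Hypothetical toy data: label 1 = malicious, 0 = benign.
train_urls = ['http://www.baidu.com', 'http://10.0.0.1/secure-bank-login.php']
train_labels = [0, 1]

X = [get_url_vector(u) for u in train_urls]
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X, train_labels)
print(clf.predict([get_url_vector('http://example.com/verify-account')]))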
