Extracting URL features

#!/usr/bin/env python
# encoding: utf-8
import string
import json
import urllib.request
from urllib.parse import urlparse
from bs4 import BeautifulSoup

# Fetch the raw HTML of a page
def crawlcontent(url):
    headers = ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36')
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]  # addheaders expects (name, value) tuples, not a set
    return opener.open(url).read().decode('utf-8')
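
The original import list also pulls in requests, which the script never uses. For reference, a minimal sketch of an equivalent fetch using requests with an explicit timeout (the timeout value here is an arbitrary choice, not from the original):

# Sketch: alternative fetcher using the requests package.
import requests

def crawlcontent_requests(url, timeout_s=10):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/64.0.3282.186 Safari/537.36'}
    resp = requests.get(url, headers=headers, timeout=timeout_s)
    resp.raise_for_status()  # raise on HTTP errors instead of returning an error page
    return resp.text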

# Save the fetched page as both .html and .txt (fetch once instead of twice)
def html(url):
    data = crawlcontent(url)
    with open("C:/Users/Administrator/PycharmProjects/URL/PhishTankURLs.html", "w+", encoding='utf-8') as f:
        f.write(data)
    with open("C:/Users/Administrator/PycharmProjects/URL/PhishTankURLs.txt", "w+", encoding='utf-8') as p:
        p.write(data)

# Collect the href attribute of every <a> tag on the page
def geturl(url):
    soup = BeautifulSoup(crawlcontent(url), 'html.parser')
    links = []
    for a in soup.findAll('a'):
        if 'href' in a.attrs:
            links.append(a.attrs['href'])
    return links
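
geturl returns the raw href values, which may be relative paths. A sketch that resolves them against the page URL with the standard-library urljoin (the helper name is new, not from the original):

from urllib.parse import urljoin

# Sketch: turn relative hrefs into absolute URLs.
def absolute_links(url):
    return [urljoin(url, href) for href in geturl(url)]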



# Extract the hostname
def hostname(url):
    return urlparse(url).netloc

# Extract the domain (heuristic: keep the last three labels of the hostname)
def domain(url):
    host = hostname(url)
    return '.'.join(host.split('.')[-3:])
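
Slicing off the last three labels is only a heuristic: it returns the full hostname for short names like www.baidu.com and mishandles multi-part suffixes such as .co.uk. If a third-party dependency is acceptable, the tldextract package resolves this against the public suffix list; a sketch (tldextract is not used by the original script):

import tldextract  # pip install tldextract

# Sketch: registered-domain extraction backed by the public suffix list.
def registered_domain(url):
    ext = tldextract.extract(url)  # splits into subdomain / domain / suffix
    return ext.registered_domain   # e.g. 'baidu.com' for http://www.baidu.com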

# URL length
def getLength(url):
    return len(url)

# Whether the URL contains the '@' symbol: yes (1) / no (0)
def aite(url):
    return 1 if '@' in url else 0

# Whether the path contains the hostname
def pathhasdomin(url):
    host = hostname(url)
    path = urlparse(url).path
    return 1 if host in path else 0

# Whether the URL contains a sensitive word
def sensitiveword(url):
    sensitive = ['secure', 'account', 'webscr', 'login', 'ebayiaphi', 'signin', 'banking', 'confirm']
    return 1 if any(word in url for word in sensitive) else 0

# Whether the hostname carries an explicit port
def mistakePort(url):
    return 1 if ':' in hostname(url) else 0
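
Note that this flags every explicit port, including legitimate ones such as :443 on HTTPS. A sketch that only flags ports differing from the scheme's default (the default-port table is an assumption, not from the original):

from urllib.parse import urlparse

# Sketch: flag only non-default ports.
def nonstandard_port(url):
    parsed = urlparse(url)
    if parsed.port is None:
        return 0
    default = {'http': 80, 'https': 443}.get(parsed.scheme)
    return 0 if parsed.port == default else 1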

# Strip everything up to and including '//'
def removeURLHeader(url):
    return url.split('//', 1)[-1]

# Extract the path between the first '/' and the last '/'
def getPath(url):
    url = removeURLHeader(url)
    if '/' in url:
        startpos = url.index('/') + 1
        endpos = url.rindex('/')
        return url[startpos:endpos]
    return ""

# Whether the path contains a brand name: yes (1) / no (0)
def brandname(url):
    path = getPath(url)
    brandnamelist = ['53.com', 'Chase', 'Microsoft', 'ANZ', 'Citibank', 'Paypal', 'AOL', 'eBay', 'USBank', 'Banamex',
                     'E-Gold', 'Visa', 'Bankofamerica', 'Google', 'Warcraft', 'Barclays', 'HSBC', 'Westpac',
                     'battle.net', 'LIoyds', 'Yahoo']
    return 1 if any(brand in path for brand in brandnamelist) else 0

# Extract the file name after the last '/' (empty when the URL has no path)
def getFile(url):
    url = removeURLHeader(url)
    if '/' not in url:
        return ""
    return url.rsplit('/', 1)[-1]

# Extract the second-level domain (the label after the first '.')
def getUrlSubDomain(url):
    host = hostname(url)
    return '.'.join(host.split('.', 2)[1:-1])

# Count of digits in the URL
def getDigitsCount(url):
    return sum(1 for c in url if c in string.digits)

# Count of uppercase letters in the URL
def getCountUpcase(url):
    return sum(1 for c in url if c in string.ascii_uppercase)

# Count of prefix characters ('_' and '-') in the URL
def getPrefixCount(url):
    return sum(1 for c in url if c in ('_', '-'))

# Number of dot-separated terms in the hostname
def termcout(url):
    return len(hostname(url).split('.'))

# Number of digit<->letter transitions in the URL (scheme stripped)
def ZhuanHuanPingci(url):
    s = removeURLHeader(url)
    count = 0
    for i in range(len(s) - 1):
        digit_then_letter = s[i] in string.digits and s[i + 1] in string.ascii_letters
        letter_then_digit = s[i] in string.ascii_letters and s[i + 1] in string.digits
        if digit_then_letter or letter_then_digit:
            count += 1
    return count
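
The pairwise scan can also be expressed as a single regular expression that counts the zero-width boundaries between a digit and a letter; a sketch (re.ASCII keeps \d consistent with string.digits):

import re

# Sketch: each zero-width match marks one digit<->letter boundary.
_TRANSITION = re.compile(r'(?<=\d)(?=[A-Za-z])|(?<=[A-Za-z])(?=\d)', re.ASCII)

def transition_count(url):
    return len(_TRANSITION.findall(url.split('//', 1)[-1]))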

# Whether the hostname belongs to a whitelist of known normal domains
def targetword(url):
    host = hostname(url)
    normalDomain = ['paypal.com', 'aol.com', 'qq.com', 'made-in-china.com', 'google.com', 'facebook.com', 'yahoo.com',
                    'live.com', 'dropbox.com', 'wellsfargo.com', 'cmr.no', 'academia.edu', 'regions.com',
                    'shrinkthislink.com', 'maximumasp.com', 'popularenlinea.com', 'readydecks.com', 'meezanbank.com',
                    'vencorex.com', 'ketthealth.com', 'obhrmanager.com', 'bluehost.com', 'msubillings.edu',
                    'genxgame.com', 'gripeezoffer.com', 'bek-intern.de', 'ebay.com', 'chase.com', 'revoluza.com',
                    'dhl.com', 'flexispy.com', 'att.com', 'uwsp.edu', 'match.com', 'alnoorhospital.com', 'ourtime.com']
    # Suffix match so that e.g. 'www.paypal.com' matches 'paypal.com'
    return 1 if any(host == d or host.endswith('.' + d) for d in normalDomain) else 0


if __name__ == '__main__':
    url = "http://www.baidu.com"
    features = [{
        'hostname': hostname(url),
        'url_length': getLength(url),
        'has_at_symbol': aite(url),
        'path_contains_domain': pathhasdomin(url),
        'hostname_length': len(hostname(url)),
        'domain': domain(url),
        'has_sensitive_word': sensitiveword(url),
        'wrong_port': mistakePort(url),
        'brand_name': brandname(url),
        'file_name': getFile(url),
        'subdomain': getUrlSubDomain(url),
        'digit_count': getDigitsCount(url),
        'uppercase_count': getCountUpcase(url),
        'prefix_count': getPrefixCount(url),
        'hostname_term_count': termcout(url),
        'digit_letter_transitions': ZhuanHuanPingci(url),
        'has_target_word': targetword(url),
    }]
    content = [{'URL': url, 'features': features}]
    with open('C:/Users/Administrator/PycharmProjects/URL/url_features.json', 'w', encoding='utf-8') as fp:
        json.dump(content, fp, indent=4, ensure_ascii=False)
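
For the example URL, the dumped file should contain JSON roughly like the following (values worked out by hand from the functions above, so treat them as illustrative and verify by running the script):

[
    {
        "URL": "http://www.baidu.com",
        "features": [
            {
                "hostname": "www.baidu.com",
                "url_length": 20,
                "has_at_symbol": 0,
                "path_contains_domain": 0,
                "hostname_length": 13,
                "domain": "www.baidu.com",
                "has_sensitive_word": 0,
                "wrong_port": 0,
                "brand_name": 0,
                "file_name": "",
                "subdomain": "baidu",
                "digit_count": 0,
                "uppercase_count": 0,
                "prefix_count": 0,
                "hostname_term_count": 3,
                "digit_letter_transitions": 0,
                "has_target_word": 0
            }
        ]
    }
]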

Reposted from: https://my.oschina.net/u/4088088/blog/3016327
