# Crawler using a search engine (e.g. Baidu) to enumerate indexed pages of a site.
# coding=utf-8
import re
import requests

sites = []
for i in range(0, 10):  # first 10 result pages as an example
    offset = i * 10  # Baidu pages results via pn=<offset>, 10 hits per page
    # search query: site:xxx.com.cn, paged by offset
    url = 'https://www.baidu.com/s?wd=site:xxx.com.cn&pn=%s' % offset
    # FIX: use .text (decoded str) instead of .content (bytes) so the
    # str regex below works on both Python 2 and Python 3
    response = requests.get(url).text
    # pull the displayed URL out of each result entry
    baidudomain = re.findall('style="text-decoration:none;">(.*?)/', response)
    sites += list(baidudomain)

site = list(set(sites))  # set() de-duplicates the collected URLs
print(site)
print("\nThe number of sites is %d" % len(site))
for s in site:
    print(s)
# Python modules used by the link-finder below (使用Python模块)
from __future__ import division
import lxml
from lxml.html import fromstring
import requests
import re
import mechanize
import operator
import sys
import os
from time import sleep
class SameFileError(Exception):
    """Crawler-specific exception (declared here; not raised in the visible code)."""
class NoneTypeError(Exception):
    """Raised by getLinks when a fetched page yields no response data."""
# Module-level state shared by the crawler functions below.
# NOTE(review): `global` at module scope is a no-op, and `formlist` is never
# defined or used anywhere in this file — candidate for removal.
global formlist
reqlist = []  # de-duplicated form signatures (action+method+param) seen by Findsubmit
feature_hub = []  # placeholder for collected URL features; not populated in this file
_Root = os.getcwd()  # base directory; link results are written under _Root\Results\
def _requestData(url):
    """GET ``url`` with browser-like headers and a 5s timeout.

    Returns (status_code, url, response_text) on success, or the sentinel
    triple ('err ', url, None) when the request fails — callers test the
    third element for None.
    """
    headers = {
        'Connection': 'keep-alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:55.0) Gecko/20100101 Firefox/55.0',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Accept-Encoding': 'gzip, deflate',
        # 'Cookie': '',
        'Upgrade-Insecure-Requests': '1'
    }
    try:
        req = requests.get(url, headers=headers, timeout=5)
    except requests.exceptions.RequestException:
        # FIX: was a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit; catch only request failures.
        return 'err ', url, None
    return req.status_code, url, req.text
def getLinks(self):
    """Fetch the page at URL ``self`` and extract its outgoing links.

    ``self`` is a plain URL string despite the name (this is a free
    function, not a method).
    Returns (absolute_links, page_html) on success, or (host, None) when
    the fetch or parse fails.
    """
    tags = ['a', 'iframe', 'frame']  # elements whose link targets we keep
    try:
        resType, resHost, resData = _requestData(self)
        if not resData:
            raise NoneTypeError
        doc = lxml.html.document_fromstring(resData)
        # resolve relative hrefs against the request URL up front
        doc.make_links_absolute(resHost)
    except Exception:
        # FIX: the original `except Exception,NoneTypeError` is Py2 syntax
        # that rebound the NoneTypeError *class* to the caught exception
        # object — it was always a catch-all; make that explicit.
        return resHost, None
    trueLinks = []
    for element, attribute, link, pos in doc.iterlinks():
        if element.tag in tags:
            trueLinks.append(link)
    return trueLinks, resData  # links are guaranteed absolute
def correct_url(url):
    """Return ``url`` with a scheme, prepending 'http://' when absent.

    FIX: the original substring test (`'http://' not in url`) mangled
    https URLs into 'http://https://...'; test the prefix instead.
    """
    url = url.strip()
    if not url.startswith(('http://', 'https://')):
        url = 'http://' + url
    return url
def middle_name(url):
    """Extract the site's base name from a URL.

    'http://www.example.com' -> 'example'; returns None when no dotted
    label can be found.

    FIX: the original used str.strip('www.') / strip('http://'), which
    strip *character sets*, not prefixes — after removing the scheme the
    'www.' prefix survived, so every www URL yielded 'www'.  The regex
    also used an unescaped '.', matching any character.
    """
    url_tidy = re.sub(r'^https?://', '', url.strip())
    if url_tidy.startswith('www.'):
        url_tidy = url_tidy[len('www.'):]
    # first dotted label, e.g. 'example' out of 'example.com'
    match = re.match(r'([-\w]+)\.', url_tidy)
    if match is None:
        return None
    return match.group(1)
def getdiffer(list1, list2):
    """Return the length of the shorter list, or False when the two
    lengths differ by more than 5 (too dissimilar to be worth comparing)."""
    short_len = min(len(list1), len(list2))
    long_len = max(len(list1), len(list2))
    if long_len - short_len > 5:
        return False
    return short_len
def str_compare(str1, str2, accuracy=0.80):
    """Return True when str1 and str2 agree position-by-position on more
    than ``accuracy`` of str1's characters.

    Strings whose lengths differ by more than 5 are never similar.
    FIX: an empty ``str1`` used to raise ZeroDivisionError (0/0); now two
    empty strings compare equal, and an empty vs non-empty pair does not.
    Also replaced the py2-only ``xrange`` with ``range``.
    """
    total = len(str1)
    shorter = min(total, len(str2))
    if abs(total - len(str2)) > 5:
        return False
    if total == 0:
        return len(str2) == 0
    score = 0
    for i in range(shorter):
        if str1[i] == str2[i]:
            score += 1
    # float() keeps true division regardless of the interpreter's defaults
    return score / float(total) > accuracy
def feature_match(link):
    """Return True when ``link`` is similar to any URL already recorded
    in the global ``url_old`` list (per str_compare's threshold)."""
    global url_old
    return any(str_compare(seen, link) for seen in url_old)
def feature_catch(link):
    """Placeholder hook for extracting a feature from ``link``; not implemented."""
    pass
def feature_filter(link):
    """Return True when ``link`` matches an already-known URL feature.

    Currently a thin wrapper over feature_match, kept as a separate hook
    so further filters can be layered in later.
    """
    return True if feature_match(link) else False
def single_url(url):
# 获取单一url入口
# try:
global url_ad
global url_old
global middle
url = correct_url(url)
# 获取页面上链接、数据
url_links, data = getLinks(url)
if data is None:
return
for link in url_links:
sys.stdout.write('!')
if link == url:
continue
if link in url + '/index':
continue
if 'javascript' in link:
continue
if link in url_old:
continue
if middle not in link:
continue
if feature_filter(link):
continue
try:
print "\n",link
except Exception:
pass
with open(_Root + "\\Results\\" + middle + "_links.txt","a") as f:
f.write(link+"\n")
# if link not in url_old and link not in url_add and 'http://www.xxx.com' in link:
# print link
# Findsubmit(link)
url_add.append(link)
url_old.append(link) # 因为已经加入到add,所以算是已知url,就加入old里。
# except Exception, e:
# print e
# pass
def Findsubmit(link):
    """Open ``link`` in a mechanize browser and probe each form's text inputs.

    Every TextControl in every form yields a request signature
    (action + method + param name); unseen signatures are appended to the
    global ``reqlist`` and handed to testxss().
    """
    global reqlist
    try:
        br = mechanize.Browser()  # initiating the browser
        br._factory.is_html = True  # force HTML parsing even on odd content types
        br.addheaders = [('User-agent',
                          'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
        br.open(str(link), timeout=15)
        if br.forms():
            for form in list(br.forms()):
                for control in form.controls:
                    if 'TextControl' in str(control):
                        param = str(control.name)
                        reqstr = form.action + form.method + param
                        if reqstr not in reqlist:  # de-duplicate across pages
                            reqlist.append(reqstr)
                            testxss(form.action, form.method, param)
    except Exception as e:
        # FIX: `except Exception, e` is py2-only syntax; `as e` works on
        # py2.6+ and py3.  Failures here are non-fatal: report and move on.
        print(e)
def testxss(action, method, param):
method = method.lower()
headers = {'content-type': 'application/json',
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:22.0) Gecko/20100101 Firefox/22.0'}
if method == "get":
print "=" * 10 + "get start" + "=" * 10
url = action + "/?" + param + "=test1234"
print url
# response = requests.get(url,headers=headers)
# print response.text
print "=" * 10 + "get end" + "=" * 10
if method == "post":
data = {'{0}'.format(param): "test"}
print "=" * 10 + "post start" + "=" * 10
print action
print data
# response = requests.post(action,data=data,headers=headers)
# print response.text
print "=" * 10 + "post end" + "=" * 10
def findlink(input, level=2):
    """Breadth-first crawl starting at ``input``, up to ``level`` levels deep.

    Initializes and drives the module globals used by single_url:
      url_new -- the links queued for the current level
      url_old -- every URL already crawled or queued (de-duplication)
      url_add -- links newly discovered during the current level
      middle  -- the site's base name, used to stay on-site
    """
    global url_new
    global url_old
    global url_add
    global middle
    url_new = []
    url_old = []
    url_add = []
    url = input  # e.g. 'http://www.xxx.com'
    middle = middle_name(url)
    url_new.append(url)
    for level_i in range(level):
        # FIX: reset url_add each level.  Previously it accumulated across
        # levels (and ended up aliased to url_new), so every earlier link
        # was re-fetched on every later level.  Also replaced the py2-only
        # xrange/index loop with direct iteration over a snapshot.
        url_add = []
        for url_new_i in list(url_new):
            url_old.append(url_new_i)
            sleep(0.5)  # throttle: be polite to the target server
            single_url(url_new_i)  # appends discoveries to url_add
        url_new = url_add  # the next level crawls this level's discoveries
if __name__ == '__main__':
    # NOTE(review): command-line handling is disabled; the target and crawl
    # depth are hard-coded below.  Re-enable the block to take argv input:
    # try:
    #     url=sys.argv[1]
    # except Exception:
    #     print "Usage: python findlinks.py www.example.com"
    #     exit()
    # # url = 'http://www.xxx.com'
    # Crawl 'xx.com.cn', following links up to 10 levels deep.
    findlink('xx.com.cn',10)
# Dictionary-based subdomain brute force (字典方式).
# -*- coding: utf-8 -*-
import requests

def verify(protocol, ip, port):
    """Probe '<protocol>://<word>.<ip>:<port>' for every word in ./dic.txt.

    Prints each URL that answers HTTP 200 (redirects are not followed).
    Returns None.
    """
    def get_pass_dict():
        # one candidate subdomain per line in the wordlist
        with open('./dic.txt', 'r') as f:
            return [line.strip('\n') for line in f]
    for word in get_pass_dict():
        url = protocol + '://' + word + '.' + ip + ':' + str(port)
        try:
            # verify=False: ignore TLS certificate errors on probed hosts
            response = requests.get(url, verify=False, timeout=5,
                                    allow_redirects=False)
        except requests.exceptions.RequestException:
            # FIX: one unreachable candidate used to abort the whole scan;
            # skip it and keep probing.  (Also removed the redundant
            # f.close() inside the `with` block.)
            continue
        if response.status_code == 200:
            print(url)

if __name__ == '__main__':
    res = verify('http', 'baidu.com', '80')
    print(res)  # verify() returns None; kept to match the original snippet
# 其他方式 (other approaches):