python,DNS,MySQL,爬虫

9 篇文章 0 订阅
3 篇文章 0 订阅

如题


任务内容
1.用Domain得到A(ip)记录(DNS请求)
2.用得到的ip地址来获得地理位置信息
2.1用爬虫来获得地理位置信息
2.2用查询数据表来获得地理位置信息
最后将Domain,A记录,NS记录,地理位置信息存入MySQL


直接上代码

import DNS
import sys
import requests
import re
import time
import MySQLdb
from time import ctime

global num
#global db
#db = MySQLdb.connect("数据库ip","数据库登陆名","数据库密码","数据库名",charset = "utf8")
'''
1. Read the URLs from the file URL.txt and build the query tree;
2. Run a DNS query to get the IP;
3. Split the IP on its dots and keep the first three numbers;
4. If ip_f <= 93, search the IP table for the address; if ip_f > 93, crawl the address from ip138;
5. Save url, ip, address;
'''

def findip(url):
    """Resolve a domain name with a DNS A query.

    Args:
        url: Domain name to look up.

    Returns:
        The ``data`` field of the last record in the answer section, as a
        string, or ``None`` when ``url`` is empty or no answer came back.
    """
    print(url)
    if not url:
        # An empty name (e.g. a blank line read at end of file) cannot be
        # resolved; report it the same way as an empty answer section.
        print("IP NOT FOUND\n")
        return None
    DNS.DiscoverNameServers()
    reqobj = DNS.Request(url)
    # Query the domain itself. Passing sys.argv[0] here would look up the
    # script's own path instead of ``url``.
    answerjob = reqobj.req(name=url, qtype=DNS.Type.A)
    if not answerjob.answers:
        print("IP NOT FOUND\n")
        return None
    # The last answer wins when several records are returned.
    # NOTE(review): the answer section may also contain CNAME records --
    # confirm that the last entry is always the A record.
    return "%s" % answerjob.answers[-1]['data']

def findNS(url):
    """Look up the NS records of a domain name and print each one.

    Args:
        url: Domain name to look up.

    Returns:
        The ``data`` field of the last NS record in the answer section, as a
        string, or ``None`` when ``url`` is empty or no answer came back.
    """
    if not url:
        # Nothing to resolve for an empty name.
        print("NS :NOT FOUND")
        return None
    DNS.DiscoverNameServers()
    reqobj = DNS.Request(url)
    # Query the domain itself. Passing sys.argv[0] here would look up the
    # script's own path instead of ``url``.
    answerjob = reqobj.req(name=url, qtype=DNS.Type.NS)
    if not answerjob.answers:
        print("NS :NOT FOUND")
        return None
    NS_record = None
    for item in answerjob.answers:
        NS_record = "%s" % item['data']
        print("NS : %s" % NS_record)
    # Every record is printed, but only the last one is returned.
    return NS_record

def delpoint_Findadd(ip):
    """Split a dotted IPv4 string and dispatch to the matching locator.

    Addresses whose first number is greater than 93 are resolved with
    ``findadre_Craw``; all others are resolved with ``buildtree``.

    Args:
        ip: IPv4 address in ``a.b.c.d`` form.

    Returns:
        Whatever the chosen locator returns, or ``None`` when ``ip`` is not
        four dot-separated integers.
    """
    parts = ip.split(".") if ip else []
    try:
        octets = [int(part) for part in parts]
    except ValueError:
        octets = []
    if len(octets) != 4:
        # Malformed input now reaches the error message instead of raising
        # ValueError from the parsing step.
        print("NOT FOUND IP OR DELETE POINR ERROR")
        return None

    ip_0, ip_1, ip_2 = octets[0], octets[1], octets[2]
    if ip_0 > 93:
        print("%s > 93" % ip_0)
        return findadre_Craw(ip)
    print("%s <= 93" % ip_0)
    return buildtree(ip, ip_0, ip_1, ip_2)



def buildtree(ip, ip_0, ip_1, ip_2, table_path="/home/wangjunx/Desktop/adre.txt"):
    """Find the location text for an address in the local IP range table.

    Every table line is read as ``<start ip> <end ip> <address text>`` and
    decoded as GB2312. A line matches when the first three numbers of the
    target lie between the first three numbers of the start and end
    addresses (inclusive); the first matching line wins.

    Args:
        ip: Unused; kept so existing callers keep working.
        ip_0: First number of the target address.
        ip_1: Second number of the target address.
        ip_2: Third number of the target address.
        table_path: Path of the range table file.

    Returns:
        The address text of the first matching line, or ``None`` when no
        line matches.
    """
    target = (ip_0, ip_1, ip_2)
    beg = time.time()
    # Binary mode plus an explicit decode keeps the GB2312 handling the same
    # on every platform; ``with`` closes the file even if a line fails.
    with open(table_path, 'rb') as table:
        # Iterating the file removes the hard-coded line count and the
        # crash that followed reading past the end of the file.
        for linenum, raw in enumerate(table):
            line = raw.decode("gb2312").rstrip("\r\n")
            fields = line.split(None, 2)
            if len(fields) < 2:
                continue  # blank or truncated line
            try:
                range_start = tuple(int(p) for p in fields[0].split(".")[:3])
                range_end = tuple(int(p) for p in fields[1].split(".")[:3])
            except ValueError:
                continue  # not a numeric address pair
            if len(range_start) != 3 or len(range_end) != 3:
                continue
            # Tuple comparison is lexicographic, so ranges that cross a
            # second- or first-number boundary are handled uniformly.
            if range_start <= target <= range_end:
                ip_address = fields[2] if len(fields) > 2 else u""
                print("USING WAY:------->IP TABLE")
                print("IP_LINE and IP_PRAGRAPH IS:  %s %s" % (linenum, line))
                print("IP_address is : %s" % ip_address)
                end = time.time()
                print("USING TIME= %s \n" % (end - beg))
                return ip_address
    # No range contained the target; do not fall back to the last line read.
    return None



def findadre_Craw(ip):
    start = time.time()
    url = "http://www.ip138.com/ips138.asp?ip="+ip+"&action=2"

    adre_get = requests.get(url, timeout = 100)
    adre_get.raise_for_status()
    adre_get.encoding = adre_get.apparent_encoding
    keywords = re.compile(r'''<td align="center"><ul class="ul1"><li>(.*?)</li><li>''', re.U|re.S)
    ADRESS = keywords.findall(adre_get.text)
    Domain_local = ''.join(ADRESS)[5:]
    end = time.time()
    print "IP_address is",Domain_local
    print "USING WAY:-------> CRAWURL-->(IP138)"
    print "USING TIME : ", end - start, "\n"
    return Domain_local
    #return Domain_local
# NOTE: the triple-quoted block below is a bare string literal, so
# InsertIntoMySQL is NOT defined at runtime -- the function is disabled.
# As written it opens two connections in a row (the second assignment to
# ``db`` replaces the first), inserts one row into Domain_info with ctime()
# as the insert time, and commits. Remove the surrounding quotes and fill in
# the connection placeholders to enable it.
'''

def InsertIntoMySQL(url,ip,NS,ip_address):
    db = MySQLdb.connect("MySQL IP","USER NAME","PASSWORD","DATABASE NAME",charset = "utf8")
    db = MySQLdb.connect("localhost","USER NAME","PASSWORD","DATANAME NAME",charset = "utf8")
    cursor = db.cursor()
    cursor.execute('INSERT INTO Domain_info (Domain, A_record, Domain_local, NS_record, Insert_time) values ( %s, %s, %s, %s, %s)', [url, ip, ip_address, NS,  ctime()])
    db.commit()
    return
'''

def main(url_path="/home/wangjunx/Desktop/URL.txt",
         ip_path="/home/wangjunx/Desktop/ip.txt"):
    """Resolve every domain listed in ``url_path`` and locate its address.

    For each non-blank line the A record and NS records are looked up; when
    an A record is found, its location is resolved with ``delpoint_Findadd``.
    Afterwards the contents of ``ip_path`` and the total run time are
    printed.

    Args:
        url_path: Text file with one domain per line.
        ip_path: File that is truncated at start and echoed at the end.
    """
    started = time.time()

    # Opening in 'w' mode truncates the file, as before.
    # NOTE(review): nothing is ever written to this file, so the listing
    # printed below is always empty -- confirm whether results belong here.
    open(ip_path, 'w').close()

    num = 1
    with open(url_path, 'r') as url_file:
        # Iterate the file itself instead of a hard-coded line count, so
        # short files do not produce lookups for empty names.
        for line in url_file:
            url = line.strip()
            if not url:
                continue
            print("NUM =  %s" % num)
            ip = findip(url)
            NS = findNS(url)
            if ip:
                print(ip)
                ip_address = delpoint_Findadd(ip)
                # Storage is disabled while InsertIntoMySQL is commented out:
                # InsertIntoMySQL(url, ip, NS, ip_address)
            else:
                print("ERROR MAIN() \n")
            num = num + 1

    print("------------------------------------")
    with open(ip_path, 'r') as saved:
        print("-----------SHOW IP.TXT -------------")
        print(saved.readlines())
    # Separate names for the timestamps: reusing one name for both the start
    # time and the file handle made the subtraction below fail.
    finished = time.time()
    print("------------------------------------")
    print("PROGRAM TOTAL TIME: %s Seconds" % (finished - started))

# Run the lookup pipeline only when this file is executed as a script.
if __name__=='__main__':
    main()

运行截图
这里写图片描述


当然在数据库中存入了数据(无图)


虽然ip表单查询速度和爬虫速度相差不大,但是确实减少了网络资源的使用


就酱


  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值