如题
任务内容
1. 通过DNS请求,由域名(Domain)查询A记录(IP地址)和NS记录
2.用得到的ip地址来获得地理位置信息
2.1 IP第一段大于93时,用爬虫(ip138)来获得地理位置信息
2.2 IP第一段不大于93时,用查询本地IP数据表来获得地理位置信息
3. 最后将Domain、A记录、NS记录、地理位置信息存入MySQL
直接上代码
import DNS
import sys
import requests
import re
import time
import MySQLdb
from time import ctime
# NOTE: a `global` statement at module scope has no effect; main() assigns
# its own local `num`.
global num
# Disabled: a shared module-level MySQL connection (placeholder credentials).
#global db
#db = MySQLdb.connect("database ip","database login name","database password","table name",charset = "utf8")
'''
1. Read URLs from file URL.txt and build the query tree;
2. Query DNS to get the IP;
3. Strip the dots and take the first three numbers of the IP;
4. If ip_f <= 93, search the tree (local table) for the IP address; if ip_f > 93, crawl the IP address from ip138;
5. Save url, ip, address;
'''
def findip(url):
    """Resolve a domain name to an IPv4 address with a DNS A query.

    Args:
        url: Domain name to look up.

    Returns:
        The data of the last A-type record in the answer section, formatted
        as a string, or None when the name is empty or no A record was
        returned.
    """
    print(url)
    if not url:
        # Nothing to ask the resolver for (e.g. a blank input line).
        print("IP NOT FOUND\n")
        return None
    DNS.DiscoverNameServers()
    reqobj = DNS.Request(url)
    # Ask for the domain being processed, not sys.argv[0] (the script's own
    # file name), which would never resolve to the wanted host.
    answerjob = reqobj.req(name=url, qtype=DNS.Type.A)
    # The answer section of an A query may also carry CNAME records whose
    # data is a host name rather than an address; keep only A records.
    # Records without a 'typename' key are accepted unchanged.
    a_records = ["%s" % item['data'] for item in answerjob.answers
                 if item.get('typename', 'A') == 'A']
    if not a_records:
        print("IP NOT FOUND\n")
        return None
    return a_records[-1]
def findNS(url):
    """Look up the NS records of a domain name and print each one.

    Args:
        url: Domain name to look up.

    Returns:
        The data of the last NS-type record in the answer section, formatted
        as a string, or None when the name is empty or no NS record was
        returned.
    """
    if not url:
        # Nothing to ask the resolver for (e.g. a blank input line).
        print("NS :NOT FOUND")
        return None
    DNS.DiscoverNameServers()
    reqobj = DNS.Request(url)
    # Ask for the domain being processed, not sys.argv[0] (the script's own
    # file name).
    answerjob = reqobj.req(name=url, qtype=DNS.Type.NS)
    # Keep only NS-type answers; records without a 'typename' key are
    # accepted unchanged.
    ns_records = ["%s" % item['data'] for item in answerjob.answers
                  if item.get('typename', 'NS') == 'NS']
    if not ns_records:
        print("NS :NOT FOUND")
        return None
    for NS_record in ns_records:
        print("NS : %s" % NS_record)
    return ns_records[-1]
def delpoint_Findadd(ip):
    """Split a dotted address and dispatch to the matching location lookup.

    Addresses whose first octet is greater than 93 are looked up with
    findadre_Craw(); all others are looked up with buildtree().

    Args:
        ip: Address string in "a.b.c.d" form. Only the first three fields
            have to be integers.

    Returns:
        Whatever the selected lookup returns, or None (after printing an
        error message) when `ip` does not contain three dots or one of the
        first three fields is not an integer.
    """
    table_max_first_octet = 93
    try:
        # maxsplit=3 keeps anything after the third dot in one trailing
        # field, which is not inspected here.
        fields = ip.split('.', 3)
        if len(fields) != 4:
            raise ValueError("expected three dots in %r" % (ip,))
        ip_0, ip_1, ip_2 = [int(field) for field in fields[:3]]
    except (AttributeError, ValueError):
        # AttributeError covers a non-string argument such as None.
        print("NOT FOUND IP OR DELETE POINR ERROR")
        return None

    if ip_0 > table_max_first_octet:
        print("%s > 93" % ip_0)
        return findadre_Craw(ip)
    print("%s <= 93" % ip_0)
    return buildtree(ip, ip_0, ip_1, ip_2)
def _first_three_octets(address):
    """Return the first three fields of an "a.b.c.d" string as an int tuple."""
    parts = address.split(".")
    if len(parts) != 4:
        raise ValueError("not a dotted quad: %r" % (address,))
    return tuple(int(part) for part in parts[:3])


def buildtree(ip, ip_0, ip_1, ip_2, path="/home/wangjunx/Desktop/adre.txt"):
    """Find the location text for an address in the local range table.

    Each table row is "<start ip> <end ip> <location text>", encoded as
    gb2312. A row matches when (ip_0, ip_1, ip_2) lies between the first
    three octets of its start and end addresses, inclusive; the fourth octet
    is not compared.

    Args:
        ip: Full dotted address. Unused; kept so existing callers still work.
        ip_0: First octet of the address.
        ip_1: Second octet of the address.
        ip_2: Third octet of the address.
        path: Location of the range table file.

    Returns:
        The location text of the first matching row, or None when no row
        matches.
    """
    import io  # local import: io.open decodes the table while reading it

    wanted = (int(ip_0), int(ip_1), int(ip_2))
    beg = time.time()
    with io.open(path, "r", encoding="gb2312") as table:
        for linenum, raw in enumerate(table):
            row = raw.rstrip("\r\n")
            fields = row.split(None, 2)
            if len(fields) < 2:
                continue  # blank or truncated row
            try:
                low = _first_three_octets(fields[0])
                high = _first_three_octets(fields[1])
            except ValueError:
                continue  # row does not start with two dotted addresses
            # Tuple comparison is lexicographic, so ranges that span several
            # first or second octets are matched as well.
            if low <= wanted <= high:
                ip_address = fields[2] if len(fields) == 3 else ""
                print("USING WAY:------->IP TABLE")
                print("IP_LINE and IP_PRAGRAPH IS:  %s %s" % (linenum, row))
                print("IP_address is : %s" % ip_address)
                print("USING TIME= %s \n" % (time.time() - beg))
                return ip_address
    return None
def findadre_Craw(ip):
    """Look up the location text for an address by scraping ip138.

    Args:
        ip: Dotted address string placed into the ip138 query URL.

    Returns:
        The text extracted from the result page ('' when the page does not
        contain the expected markup), or None when the HTTP request fails.
    """
    start = time.time()
    url = "http://www.ip138.com/ips138.asp?ip=" + ip + "&action=2"
    try:
        adre_get = requests.get(url, timeout=100)
        adre_get.raise_for_status()
    except requests.RequestException as err:
        # One unreachable page must not abort a whole batch of lookups.
        print("CRAWURL-->(IP138) REQUEST FAILED: %s" % err)
        return None
    # Decode the body with the encoding detected from its content.
    adre_get.encoding = adre_get.apparent_encoding
    keywords = re.compile(r'''<td align="center"><ul class="ul1"><li>(.*?)</li><li>''', re.U|re.S)
    ADRESS = keywords.findall(adre_get.text)
    # NOTE(review): [5:] presumably drops a fixed 5-character label in front
    # of the location text - confirm against the live page.
    Domain_local = ''.join(ADRESS)[5:]
    end = time.time()
    print("IP_address is %s" % Domain_local)
    print("USING WAY:-------> CRAWURL-->(IP138)")
    print("USING TIME :  %s \n" % (end - start))
    return Domain_local
'''
def InsertIntoMySQL(url,ip,NS,ip_address):
db = MySQLdb.connect("MySQL IP","USER NAME","PASSWORD","DATABASE NAME",charset = "utf8")
db = MySQLdb.connect("localhost","USER NAME","PASSWORD","DATANAME NAME",charset = "utf8")
cursor = db.cursor()
cursor.execute('INSERT INTO Domain_info (Domain, A_record, Domain_local, NS_record, Insert_time) values ( %s, %s, %s, %s, %s)', [url, ip, ip_address, NS, ctime()])
db.commit()
return
'''
def main(url_path="/home/wangjunx/Desktop/URL.txt",
         out_path="/home/wangjunx/Desktop/ip.txt"):
    """Resolve every domain listed in the URL file and print the results.

    For each non-blank line the A record and the NS record are looked up;
    when an A record is found its location is looked up as well. Finally the
    contents of the output file and the total running time are printed.

    Args:
        url_path: Text file with one domain name per line.
        out_path: Output file. It is truncated and later displayed.
    """
    started = time.time()
    # NOTE(review): nothing is ever written to out_path, so the dump below is
    # always empty - confirm what was meant to be stored there.
    with open(url_path, 'r') as url_file, open(out_path, 'w'):
        num = 1
        for line in url_file:
            url = line.strip()
            if not url:
                continue  # skip blank lines instead of resolving ''
            print("NUM =  %s" % num)
            ip = findip(url)
            # Results are only printed; the MySQL insert is disabled.
            findNS(url)
            if ip:
                print(ip)
                delpoint_Findadd(ip)
                #InsertIntoMySQL(url,ip,NS,ip_address)
            else:
                print("ERROR MAIN() \n")
            num += 1
            print("------------------------------------")
    with open(out_path, 'r') as saved:
        print("-----------SHOW IP.TXT -------------")
        print(saved.readlines())
    finished = time.time()
    print("------------------------------------")
    # The start time lives in its own variable so it is still a number here.
    print("PROGRAM TOTAL TIME: %s Seconds" % (finished - started))


if __name__ == '__main__':
    main()
运行截图
当然在数据库中存入了数据(无图)
虽然本地IP表查询与爬虫查询的速度相差不大,但查表确实减少了网络资源的使用。
就酱