程序主要功能
读取文件内的域名(约 1,000–10,000 个),并将每个域名、其 IP 地址以及该 IP 对应的地理位置存入数据库
程序实现
Python 2(MySQLdb 库、pyDNS 库、requests 库),MySQL
实现过程
1.从文件头开始按行读取域名(url)
2.将域名进行DNS查询,返回A记录(IP)
3.将返回的IP地址作为变量,以
adre_url = "http://www.ip138.com/ips138.asp?ip=" + ip + "&action=2" 的方式来访问IP138,获得地理位置信息,再从返回的页面中爬取相应的地理位置字段(adre)
4.将(URL, IP, adre)作为变量存入数据库,并同时在linux终端输出
程序运行得到结果如图:
1.存储完数据后的MySQL
2.linux终端
代码:
import DNS
import MySQLdb
import sys
import requests
import re
import time
import os
# Resolve a domain name to an IP address through a DNS A-record query.
def FindIp(url):
    """Return the first A-record address of ``url`` as a string.

    Args:
        url: The domain name to resolve (one line from the input file).

    Returns:
        The address as a string, or ``None`` when ``url`` is empty, the
        lookup fails, or the response contains no A record.
    """
    # An empty name cannot be looked up; report "not found" instead of
    # sending a meaningless query.
    if not url:
        return None
    DNS.DiscoverNameServers()
    reqobj = DNS.Request(url)
    try:
        # Query the domain that was passed in. No ``name=`` override is
        # given, so the request uses the name supplied to DNS.Request.
        answerobj = reqobj.req(qtype=DNS.Type.A)
    except DNS.DNSError:
        # A timeout or server failure for one domain should not abort the
        # whole batch; the caller treats None as "NotFound".
        return None
    for item in answerobj.answers:
        # The answer section may also contain CNAME records whose data is a
        # host name, so only accept genuine A records.
        if item.get('typename') == 'A':
            return "%s" % (item['data'],)
    return None
# Use the IP as the query key and scrape the location field from the
# corresponding ip138.com result page.
def FindAdre(ip):
    """Return the geographic location text that ip138.com reports for ``ip``.

    Args:
        ip: The IP address as a string.

    Returns:
        ``0`` when ``ip`` is empty/falsy; ``None`` (after printing "error")
        when the HTTP request fails; otherwise the matched location fields
        joined into a single string (empty if the page has no match).
    """
    if not ip:
        return 0
    adre_url = "http://www.ip138.com/ips138.asp?ip=" + ip + "&action=2"
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        adre_get = requests.get(adre_url, timeout=10)
        adre_get.raise_for_status()
    except requests.RequestException:
        # Best-effort lookup: report the failure and let the caller go on
        # with the next domain.
        print("error")
        return None
    # The page is served in GB2312; set it explicitly so .text decodes the
    # Chinese location names correctly.
    adre_get.encoding = "gb2312"
    keyword = re.compile(r'''<td align="center"><ul class="ul1"><li>(.*?)</li><li>''', re.U|re.S)
    # findall returns a list; join it into one string so it can be printed
    # and stored directly.
    return ''.join(keyword.findall(adre_get.text))
# Store one result row in MySQL. The database and the UIA table (with an
# auto-increment primary key ``id``) are expected to exist already.
def MySQLSave(url, ip, adre):
    """Insert one ``(url, ip, adre)`` row into the ``UIA`` table.

    A connection is opened for the call and is always closed again, even
    when the insert fails, so repeated calls do not accumulate open
    connections.

    Args:
        url: The domain name.
        ip: The resolved IP address.
        adre: The location text for the IP.
    """
    # charset="utf8" is needed so the Chinese location text is stored
    # correctly; the table should be created with the same charset.
    db = MySQLdb.connect("Ip_adress","User_name","Password","DataBase_name",charset = "utf8")
    try:
        cursor = db.cursor()
        try:
            # Parameterized query: the driver escapes the values.
            cursor.execute('INSERT INTO UIA (url, ip, adress) values ( %s, %s, %s)', [url, ip, adre])
            db.commit()
        finally:
            cursor.close()
    finally:
        db.close()
    return
def main(url_path="URL.txt", ip_path="ip.txt", max_count=993):
    """Resolve every domain in ``url_path`` and record the results.

    For each line the domain is resolved with ``FindIp``; when an IP is
    found its location is fetched with ``FindAdre``, the row is printed and
    saved with ``MySQLSave``, and the IP is appended to ``ip_path``. When no
    IP is found, ``NULL`` is written to ``ip_path`` and ``NotFound`` is
    printed. The elapsed time is printed at the end.

    Args:
        url_path: Text file with one domain per line.
        ip_path: Output file that receives one IP (or ``NULL``) per input
            line.
        max_count: Maximum number of lines to process; ``None`` means no
            limit. Processing always stops at the end of the file.
    """
    start = time.time()
    # ``with`` closes both files even if a lookup or the database raises.
    with open(url_path, 'r') as url_file, open(ip_path, 'w') as ip_file:
        for num, line in enumerate(url_file, 1):
            if max_count is not None and num > max_count:
                break
            # strip() removes the newline (including "\r\n") without
            # eating a real character when the last line has no newline.
            url = line.strip()
            # A blank line still counts as a row so ip.txt stays aligned
            # with the input file, but no DNS query is made for it.
            ip = FindIp(url) if url else None
            if ip:
                adre = FindAdre(ip)
                # Keep the raw IPs in a side file for later use.
                ip_file.write(ip + '\n')
                print("%-4d %-30s %-40s %-30s" % (num, url, ip, adre))
                MySQLSave(url, ip, adre)
            else:
                ip_file.write("NULL" + '\n')
                print("%-4d %-30s %-40s" % (num, url, "NotFound"))
    end = time.time()
    # Total running time in seconds.
    print("TIME:  %s S" % (end - start,))
# Run the batch lookup only when the file is executed as a script.
if __name__ == "__main__":
    main()