代码如下
'''
更新TLD对应的whois服务器并获取IP,更新表
==================================
author:wud
date:2018/1/17
ver:1.0
'''
import ConfigParser
import urllib2
import urllib
import MySQLdb
import re
from time import sleep
def GetTLDfromMysql():
    """Dump every TLD that has no WHOIS server yet into ``TLD.txt``.

    Connects to MySQL with the module-level settings ``ip``, ``user``,
    ``passwd``, ``database`` and ``port``, selects the ``TLD`` column of the
    ``whois_tld_addr`` rows whose ``whois_addr`` is NULL, and writes the row
    count on the first line of ``TLD.txt`` followed by one TLD per line.

    ``TLD.txt`` is truncated before the query runs, so after a failure it is
    left empty instead of holding the list from an earlier run.  Failures are
    reported on stdout and do not propagate to the caller.
    """
    with open("TLD.txt", 'w') as tld_file:
        db = None
        try:
            db = MySQLdb.connect(ip, user, passwd, database, port=int(port), charset="utf8")
            print("数据库连接成功_1")
            cursor = db.cursor()
            try:
                cursor.execute('select TLD from whois_tld_addr where whois_addr is NULL')
                rows = cursor.fetchall()
            finally:
                cursor.close()
            # First line is the row count; the reader uses it to know how
            # many TLD lines follow.
            tld_file.write("%d\n" % len(rows))
            for row in rows:
                tld_file.write("%s\n" % row[0])
        except Exception as exc:
            # Best-effort step: report and carry on.  ``Exception`` (not a bare
            # ``except``) so Ctrl-C and SystemExit still stop the script, and
            # the cause is printed because the fixed message alone would hide
            # non-connection errors such as a failed query.
            print("连接数据库失败_1")
            print(exc)
        finally:
            if db is not None:
                db.close()
def CrawTLDWhoisSer():
    """Look up the WHOIS server of every TLD listed in ``TLD.txt`` on IANA.

    ``TLD.txt`` must hold a count on its first line followed by one TLD per
    line (the layout ``GetTLDfromMysql`` writes).  For each TLD the page
    ``https://www.iana.org/domains/root/db/<tld>.html`` is downloaded and the
    text between ``<b>WHOIS Server:</b>`` and the next ``</p>`` is extracted.
    Each hit is written to ``test.txt`` as
    ``<tld> whois_servre is --> <server>``; progress is printed to stdout.

    A failed download is reported and skipped so that one bad TLD does not
    abort the whole run.
    """
    # Compiled once: the pattern is identical for every page.
    whois_pattern = re.compile(r'''<b>WHOIS Server:</b>(.*?)</p>''', re.U | re.S)

    # 'w' so the results never mix with a longer file left by an earlier run
    # and the file does not have to exist beforehand.
    with open("TLD.txt", 'r') as tld_file, open("test.txt", 'w') as result_file:
        try:
            total = int(tld_file.readline())
        except ValueError:
            # Empty or malformed header (e.g. the database step failed):
            # nothing to crawl.
            total = 0
        print("没有whois_server的TLD数量是--> %d" % total)

        # ``remaining`` is the number of TLDs still to process after this one.
        for remaining in range(total - 1, -1, -1):
            line = tld_file.readline()
            if not line:
                # Fewer TLD lines than the header announced.
                break
            tld = line.strip()
            # IANA page names carry no leading dot; lstrip copes with TLDs
            # stored as ".com" as well as "com".
            url = "https://www.iana.org/domains/root/db/" + tld.lstrip(".") + ".html"
            try:
                # urllib.urlopen is the Python 2 API used by the rest of the file.
                html_text = urllib.urlopen(url).read()
            except Exception as exc:
                # ``Exception`` rather than a bare ``except`` so Ctrl-C can
                # still interrupt the crawl.
                print("爬虫出错")
                print(exc)
                print(remaining)
                sleep(0.1)
                continue

            whois_server = ''.join(whois_pattern.findall(html_text))
            if whois_server:
                result_file.write("%s whois_servre is --> %s\n" % (tld, whois_server))
                print("保存成功")
            else:
                print("没找到对应的whois_server")
            # Pause between successful requests to avoid hammering the site.
            sleep(1)
            print(remaining)
'''
def getContent(url, headers):
random_header = random.choice(headers)
req = urllib2.Request(url)
req.add_header("User-Agent", random_header)
req.add_header("GET", url)
req.add_header("Host", "blog.csdn.net")
req.add_header("Referer", "http://www.csdn.net/")
content = urllib2.urlopen(req).read()
return content
'''
def main():
    """Run the two pipeline stages in order: database export, then IANA crawl."""
    for stage in (GetTLDfromMysql, CrawTLDWhoisSer):
        stage()
if __name__ == '__main__':
    # Read the MySQL connection settings from the [section1] section of
    # Cncertwhoisdatabase.conf and bind them as module-level names, which
    # GetTLDfromMysql() reads when it connects.
    conf = ConfigParser.ConfigParser()
    conf.read("Cncertwhoisdatabase.conf")
    ip, user, passwd, port, database = (
        conf.get("section1", option)
        for option in ("ip", "user", "passwd", "port", "database")
    )
    main()