Obtaining each TLD's whois server by crawling IANA

The script pulls the TLDs that still lack a whois server from MySQL, fetches each one's page in the IANA root zone database, and extracts the "WHOIS Server" field.

The code is as follows:

# -*- coding:utf-8 -*-
'''
Update each TLD's whois server, fetch its IP, and update the table
==================================
author: wud
date: 2018/1/17
ver: 1.0
'''

import ConfigParser
import urllib2
import urllib
import MySQLdb
import re
from time import sleep
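
# Note: this script targets Python 2 -- ConfigParser, urllib/urllib2,
# MySQLdb, and print-statement syntax are all Python 2 era; under
# Python 3 the rough equivalents would be configparser, urllib.request,
# and a driver such as PyMySQL.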

def GetTLDfromMysql():
    '''Fetch the TLDs that do not yet have a whois_server from the database'''
    f = open("TLD.txt", 'w')
    try:
        db = MySQLdb.connect(ip, user, passwd, database, port=int(port), charset="utf8")
        if db:
            print "database connection OK (step 1)"
            cursor = db.cursor()
            cursor.execute('select TLD from whois_tld_addr where whois_addr is NULL')
            TLD_info = cursor.fetchall()
            TLD_size = len(TLD_info)
            print >> f, TLD_size  # first line: how many TLDs follow
            for i in TLD_info:
                print >> f, i[0]  # one TLD per line, e.g. ".com"
            db.close()
    except Exception:
        print "failed to connect to the database (step 1)"
    finally:
        f.close()

def CrawTLDWhoisSer():
    '''Fetch each TLD's whois_server from IANA'''
    f = open("TLD.txt", 'r')
    f1 = open("test.txt", 'r+')  # 'r+' requires test.txt to exist already
    count = f.readline()
    remaining = int(count)
    print "number of TLDs without a whois_server -->", remaining
    while remaining:
        tld = f.readline()
        # tld looks like ".com\n": drop the leading dot and trailing newline
        url = "https://www.iana.org/domains/root/db/" + tld[1:-1] + ".html"
        try:
            html_text = urllib.urlopen(url).read()  # fetch the page
            try:
                # IANA root-db pages carry a fragment like:
                #   <b>WHOIS Server:</b> whois.verisign-grs.com</p>
                whoisSer_keywords = re.compile(r'''<b>WHOIS Server:</b>(.*?)</p>''', re.U | re.S)
                whoisSer = ''.join(whoisSer_keywords.findall(html_text)).strip()
                if whoisSer:
                    print >> f1, tld[:-1], "whois_server is -->", whoisSer
                    print "saved"
                else:
                    print "no whois_server found on the page"
            except Exception:
                print "error while matching the page content"
            remaining -= 1
            sleep(1)  # throttle to avoid a ban
            print remaining
        except Exception:
            print "crawler error"
            remaining -= 1
            print remaining
            sleep(0.1)
    f1.close()
    f.close()
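
# Illustrative sketch of a line appended to test.txt above
# (the server name is just an example):
#   .com whois_server is --> whois.verisign-grs.com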
'''
Unused helper, kept for reference: fetch a URL while presenting a randomly
chosen User-Agent (enabling it would also need "import random"; the
Host/Referer values are leftovers from a different scraping target).
def getContent(url, headers):
    random_header = random.choice(headers)
    req = urllib2.Request(url)
    req.add_header("User-Agent", random_header)
    req.add_header("Host", "blog.csdn.net")
    req.add_header("Referer", "http://www.csdn.net/")
    content = urllib2.urlopen(req).read()
    return content
'''
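
# A minimal runnable sketch of what the commented-out helper above was
# aiming for: fetch a URL with a custom User-Agent via urllib2. The
# function name and default header value are illustrative, not from the
# original script.
def getContentWithUA(url, user_agent="Mozilla/5.0"):
    req = urllib2.Request(url)
    req.add_header("User-Agent", user_agent)
    return urllib2.urlopen(req).read()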

def main():
    GetTLDfromMysql()
    CrawTLDWhoisSer()

if __name__ == '__main__':
    conf = ConfigParser.ConfigParser()
    conf.read("Cncertwhoisdatabase.conf")
    ip = conf.get("section1", "ip")
    user = conf.get("section1", "user")
    passwd = conf.get("section1", "passwd")
    port = conf.get("section1", "port")
    database = conf.get("section1", "database")
    main()
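
The script expects a Cncertwhoisdatabase.conf file alongside it, read via ConfigParser. A minimal sketch of its layout, with placeholder values (the real host, credentials, and database name do not appear in the original):

[section1]
ip = 127.0.0.1
user = root
passwd = your_password
port = 3306
database = whois_db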