Obtaining each TLD's whois server by crawling IANA

The script pulls the TLDs that still lack a whois server from MySQL, fetches each one's page in the IANA root zone database, and extracts the "WHOIS Server" field.

The code is as follows:

# -*- coding:utf-8 -*-
'''
Update each TLD's whois server, fetch its IP, and update the table
==================================
author: wud
date: 2018/1/17
ver: 1.0
'''

import ConfigParser
import urllib2
import urllib
import MySQLdb
import re
from time import sleep
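
# Note: this script targets Python 2 -- ConfigParser, urllib/urllib2,
# MySQLdb, and print-statement syntax are all Python 2 era; under
# Python 3 the rough equivalents would be configparser, urllib.request,
# and a driver such as PyMySQL.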

def GetTLDfromMysql():
    '''Fetch the TLDs that do not yet have a whois_server from the database'''
    f = open("TLD.txt", 'w')
    try:
        db = MySQLdb.connect(ip, user, passwd, database, port=int(port), charset="utf8")
        if db:
            print "database connection OK (step 1)"
            cursor = db.cursor()
            cursor.execute('select TLD from whois_tld_addr where whois_addr is NULL')
            TLD_info = cursor.fetchall()
            TLD_size = len(TLD_info)
            print >> f, TLD_size  # first line: how many TLDs follow
            for i in TLD_info:
                print >> f, i[0]  # one TLD per line, e.g. ".com"
            db.close()
    except Exception:
        print "failed to connect to the database (step 1)"
    finally:
        f.close()

def CrawTLDWhoisSer():
    '''Fetch each TLD's whois_server from IANA'''
    f = open("TLD.txt", 'r')
    f1 = open("test.txt", 'r+')  # 'r+' requires test.txt to exist already
    count = f.readline()
    remaining = int(count)
    print "number of TLDs without a whois_server -->", remaining
    while remaining:
        tld = f.readline()
        # tld looks like ".com\n": drop the leading dot and trailing newline
        url = "https://www.iana.org/domains/root/db/" + tld[1:-1] + ".html"
        try:
            html_text = urllib.urlopen(url).read()  # fetch the page
            try:
                # IANA root-db pages carry a fragment like:
                #   <b>WHOIS Server:</b> whois.verisign-grs.com</p>
                whoisSer_keywords = re.compile(r'''<b>WHOIS Server:</b>(.*?)</p>''', re.U | re.S)
                whoisSer = ''.join(whoisSer_keywords.findall(html_text)).strip()
                if whoisSer:
                    print >> f1, tld[:-1], "whois_server is -->", whoisSer
                    print "saved"
                else:
                    print "no whois_server found on the page"
            except Exception:
                print "error while matching the page content"
            remaining -= 1
            sleep(1)  # throttle to avoid a ban
            print remaining
        except Exception:
            print "crawler error"
            remaining -= 1
            print remaining
            sleep(0.1)
    f1.close()
    f.close()
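
# Illustrative sketch of a line appended to test.txt above
# (the server name is just an example):
#   .com whois_server is --> whois.verisign-grs.com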
'''
Unused helper, kept for reference: fetch a URL while presenting a randomly
chosen User-Agent (enabling it would also need "import random"; the
Host/Referer values are leftovers from a different scraping target).
def getContent(url, headers):
    random_header = random.choice(headers)
    req = urllib2.Request(url)
    req.add_header("User-Agent", random_header)
    req.add_header("Host", "blog.csdn.net")
    req.add_header("Referer", "http://www.csdn.net/")
    content = urllib2.urlopen(req).read()
    return content
'''
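
# A minimal runnable sketch of what the commented-out helper above was
# aiming for: fetch a URL with a custom User-Agent via urllib2. The
# function name and default header value are illustrative, not from the
# original script.
def getContentWithUA(url, user_agent="Mozilla/5.0"):
    req = urllib2.Request(url)
    req.add_header("User-Agent", user_agent)
    return urllib2.urlopen(req).read()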

def main():
    GetTLDfromMysql()
    CrawTLDWhoisSer()

if __name__ == '__main__':
    conf = ConfigParser.ConfigParser()
    conf.read("Cncertwhoisdatabase.conf")
    ip = conf.get("section1", "ip")
    user = conf.get("section1", "user")
    passwd = conf.get("section1", "passwd")
    port = conf.get("section1", "port")
    database = conf.get("section1", "database")
    main()
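
The script expects a Cncertwhoisdatabase.conf file alongside it, read via ConfigParser. A minimal sketch of its layout, with placeholder values (the real host, credentials, and database name do not appear in the original):

[section1]
ip = 127.0.0.1
user = root
passwd = your_password
port = 3306
database = whois_db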