Subdomain Crawler
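A small Tkinter GUI: it scrapes haosou.com's site: search results for subdomains of a given domain, then looks up each subdomain's IP address and physical location on ip.chinaz.com and lists the results in the window.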

# -*- coding:utf-8 -*-
from tkinter import *
from bs4 import BeautifulSoup
import re
import urllib.request

root = Tk()
root.title('Subdomain Crawler')

frame = Frame(root)
frame.pack(padx=10, pady=10)

# Fixed window size
w = 800
h = 600
root.geometry('%dx%d+0+0' % (w, h))
Label(frame,
      text='Enter a root domain, e.g. baidu.com',
      justify=LEFT,
      compound=LEFT,
      font=("华康少女字体", 14)
      ).grid(row=0, column=0)
def getIP(site):
    # Escape the domain so it can be embedded in a regex safely;
    # re.escape handles '.' and any other metacharacters.
    site_pattern = re.escape(site)

    # Search haosou.com for pages under the target domain, spoofing a
    # browser User-Agent so the request is not rejected as a bot.
    urlPage = 'http://www.haosou.com/s?src=360sou_newhome&q=site:' + site
    req = urllib.request.Request(urlPage)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0')
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')
    # Parse the approximate result count; the pattern stays in Chinese
    # because it matches the text of haosou's own result page.
    pageStr = re.search(u'找到相关结果约(.*?)个', html)
    if pageStr is None:
        print('No result count found for ' + site)
        return
    # Keep only the digits: the count may contain separator characters.
    page = ''.join(c for c in pageStr.group(1) if c in '0123456789')
    page = int(page) // 10  # roughly ten results per page
    print('Total Page: ' + str(page))
    
    # Only crawl the first few result pages.
    if page > 6:
        page = 6
    newItems = []
    for p in range(1, page):
        urlDomain = 'http://www.haosou.com/s?src=360sou_newhome&q=site:' + site + '&pn=' + str(p)
        req = urllib.request.Request(urlDomain)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0')
        res = urllib.request.urlopen(req)
        html = res.read().decode('utf-8')
        # Each result entry carries the subdomain inside
        # ...linkinfo"><cite>sub.example.com...
        pattern = re.compile(r'linkinfo"><cite>(.+?\.' + site_pattern + ')')
        items = re.findall(pattern, html)
        
        # Deduplicate across result pages
        for item in items:
            if item not in newItems:
                newItems.append(item)
    
    print('SubDomain Count: ' + str(len(newItems)))
    for item in newItems:
        print(item)
    # Look up each subdomain's IP and location on ip.chinaz.com.
    for item in newItems:
        urlIP = 'http://ip.chinaz.com/' + item
        req = urllib.request.Request(urlIP)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0')
        res = urllib.request.urlopen(req)
        html = res.read().decode('utf-8')
        soup = BeautifulSoup(html, "lxml")
        try:
            # The result row on ip.chinaz.com; the class names (and the
            # cell layout) may change, so parse defensively.
            panel = soup.find(class_="WhwtdWrap bor-b1s col-gray03")
            ip = panel.contents
            ipss = ip[1].string + '   ' + ip[3].string + '     ' + ip[7].string
            theLB.insert(END, ipss)
        except (AttributeError, IndexError, TypeError):
            # Skip any entry whose page we cannot parse.
            continue
def check():
    theLB.delete(0, END)
    # Header row for the output list
    ipss = 'Domain/IP' + '   ' + 'Resolved IP' + '     ' + 'IP location'
    theLB.insert(END, ipss)
    getIP(v.get())
    
v = StringVar()
Entry(frame, textvariable=v, width=25, font=("华康少女字体", 14)).grid(row=1, column=0)

Button(frame, text='Start', command=check, bg="green", fg="white").grid(row=1, column=1)

sb = Scrollbar(root)
sb.pack(side=RIGHT, fill=Y)

theLB = Listbox(root, yscrollcommand=sb.set, width=w, font=("华康少女字体", 14))
theLB.pack(side=LEFT, fill=BOTH)

sb.config(command=theLB.yview)

mainloop()
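
The script needs beautifulsoup4 plus the lxml parser installed (pip install beautifulsoup4 lxml), and it stands or falls with two scraping assumptions: haosou's result markup (the linkinfo"><cite> fragment) and chinaz's table classes, both of which may have changed since this was written. As a minimal sketch, here is the subdomain-extraction step in isolation; the sample HTML is a made-up fragment shaped like the markup the crawler expects, not captured haosou output:

import re

site = 'baidu.com'  # hypothetical input domain
pattern = re.compile(r'linkinfo"><cite>(.+?\.' + re.escape(site) + ')')

# Made-up fragment imitating one search-result entry (an assumption).
sample = '<p class="res-linkinfo"><cite>news.baidu.com/world</cite></p>'
print(pattern.findall(sample))  # -> ['news.baidu.com']

If the pattern stops matching, inspect the live result page and adjust the literal text before the capture group.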