# -*- coding:utf-8 -*-
from tkinter import *
from bs4 import BeautifulSoup
import re
import urllib.request
# --- main window setup -------------------------------------------------
root = Tk()
root.title('这是一个二级域名获取爬虫')

frame = Frame(root)
frame.pack(padx=10, pady=10)

# Fixed window size (an earlier revision derived it from a background
# image; the hard-coded values are kept). ``w`` is also reused below for
# the result listbox width.
w = 800
h = 600
root.geometry('%dx%d+0+0' % (w, h))

# Prompt label. NOTE: grid() returns None, so the original chained
# ``Label(...).grid(...)`` bound theLabel to None instead of the widget;
# create the widget first, then place it.
theLabel = Label(frame,
                 text="请输入顶级域名,如:baidu.com",
                 justify=LEFT,
                 compound=LEFT,
                 font=("华康少女字体", 14))
theLabel.grid(row=0, column=0)
def getIP(site):
    """Crawl haosou.com search results for subdomains of *site*, then look
    up each subdomain's IP and physical location on ip.chinaz.com and
    insert one "domain ip location" line per host into the global
    listbox ``theLB``.

    :param site: top-level domain to query, e.g. ``"baidu.com"``.
    """
    ua = ('Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) '
          'Gecko/20100101 Firefox/50.0')
    # Escape all regex metacharacters in the domain. The original
    # hand-rolled replace('.', '\.') only handled dots and used an
    # invalid escape sequence in a non-raw string.
    siteFormat1 = re.escape(site)

    urlPage = 'http://www.haosou.com/s?src=360sou_newhome&q=site:' + site
    req = urllib.request.Request(urlPage)
    req.add_header('User-Agent', ua)
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')

    # Total hit count appears as "找到相关结果约<N>个" in the result page.
    pageStr = re.search(u'找到相关结果约(.*?)个', html)
    if pageStr is None:
        # No results, or the page layout changed -- nothing to crawl.
        print('Total Page: 0')
        return
    # Keep only ASCII digits (drops thousands separators like ",").
    digits = ''.join(c for c in pageStr.group(1) if c in '0123456789')
    if not digits:
        print('Total Page: 0')
        return

    page = int(digits) / 10          # ~10 hits per result page
    print('Total Page: ' + str(page))
    if page > 6:
        page = 6                     # cap the crawl at 6 result pages

    # Each hit's URL sits inside <cite> under class "linkinfo"; the
    # pattern is loop-invariant, so compile it once.
    pattern = re.compile(r'linkinfo"><cite>(.+?\.' + siteFormat1 + ')')
    newItems = []
    for p in range(1, int(page)):
        urlDomain = ('http://www.haosou.com/s?src=360sou_newhome&q=site:'
                     + site + '&pn=' + str(p))
        req = urllib.request.Request(urlDomain)
        req.add_header('User-Agent', ua)
        res = urllib.request.urlopen(req)
        html = res.read().decode('utf-8')
        items = re.findall(pattern, html)
        # De-duplicate while preserving discovery order.
        for item in items:
            if item not in newItems:
                newItems.append(item)

    print('SubDomain Count: ' + str(len(newItems)))
    for item in newItems:
        print(item)

    # Resolve each subdomain via ip.chinaz.com and push a summary row
    # into the listbox.
    for item in newItems:
        urlIP = 'http://ip.chinaz.com/' + item
        req = urllib.request.Request(urlIP)
        req.add_header('User-Agent', ua)
        res = urllib.request.urlopen(req)
        html = res.read().decode('utf-8')
        soup = BeautifulSoup(html, "lxml")
        try:
            panel = soup.find(class_="WhwtdWrap bor-b1s col-gray03")
            ip = panel.contents
            # contents[1]/[3]/[7] hold domain, resolved IP and location
            # cells of the result row (assumed from the page layout --
            # verify if chinaz changes its markup).
            ipss = ip[1].string + ' ' + ip[3].string + ' ' + ip[7].string
            theLB.insert(END, ipss)
        except (AttributeError, IndexError, TypeError):
            # Row missing or layout changed for this host: skip it
            # (was a bare ``except:`` that swallowed everything).
            continue
def check():
    """Start-button callback: reset the result list, insert the header
    row, then crawl the domain typed into the entry field."""
    theLB.delete(0, END)
    header = ' '.join(("域名/IP", "获取的IP地址", "IP的物理地址"))
    theLB.insert(END, header)
    getIP(v.get())
# --- input row, result listbox and main loop ---------------------------
v = StringVar()
# Create the entry before calling grid(): grid() returns None, so the
# original chained ``Entry(...).grid(...)`` bound ``e`` to None instead
# of the widget.
e = Entry(frame, textvariable=v, width=25, font=("华康少女字体", 14))
e.grid(row=1, column=0)
Button(frame, text='开始获取', command=check, bg="green", fg="white").grid(row=1, column=1)

# Scrollable listbox that receives the crawl results.
sb = Scrollbar(root)
sb.pack(side=RIGHT, fill=Y)
theLB = Listbox(root, yscrollcommand=sb.set, width=w, font=("华康少女字体", 14))
theLB.pack(side=LEFT, fill=BOTH)
sb.config(command=theLB.yview)   # two-way link: scrollbar <-> listbox

mainloop()