"""
1、通过百度搜索 toutiao.com ,爬取一页的子域名信息
2、随意输入需要查询的域名,得到子域名
3、可以爬取百度搜索中的多页内容
4、使用多线程,加快子域名爬取速度
5、子域名写入到subdomain/domain.txt
"""
百度搜索
我们需要的值是ad.toutiao.com
google hack 语法
链接有什么特点
首先我们的目的是自定义一个域名得到相关的子域名
import requests
# Grab the first Baidu results page for "site:toutiao.com" and dump the raw HTML.
search_url = "https://www.baidu.com/s?wd=site:toutiao.com"
# A browser User-Agent keeps Baidu from serving the bot/captcha page.
request_headers = {
    "user-agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
}
response = requests.get(search_url, headers=request_headers)
print(response.text)
实现输出
寻找值
下一步解析数据
import requests
import re
# Same request as before, but keep the HTML in a variable for the parsing step.
search_url = "https://www.baidu.com/s?wd=site:toutiao.com"
# Browser-like User-Agent so Baidu returns a normal results page.
request_headers = {
    "user-agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
}
response = requests.get(search_url, headers=request_headers)
print(response.text)
html_str = response.text
寻找规律
正则表达式——(.*?) 闪亮登场:
class="c-showurl c-color-gray" style="text-decoration:none;position:relative;">cc.toutiao.com/
class="c-showurl c-color-gray" .*?>cc.toutiao.com/
class="c-showurl c-color-gray" .*?>(.*?)<
import requests
import re
# Fetch the results page, then pull the display URLs out of the snippets.
search_url = "https://www.baidu.com/s?wd=site:toutiao.com"
request_headers = {
    "user-agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
}
html_str = requests.get(search_url, headers=request_headers).text
# Each result card shows its host in a "c-showurl" span, e.g. "ad.toutiao.com/".
showurl_pattern = re.compile('class="c-showurl c-color-gray".*?>(.*?)<')
results = showurl_pattern.findall(html_str)
print(results)
进行数据清洗
import requests
import re
from urllib.parse import urlencode
# Fetch, parse, and clean: keep only host-like entries and strip any path part.
search_url = "https://www.baidu.com/s?wd=site:toutiao.com"
request_headers = {
    "user-agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
}
html_str = requests.get(search_url, headers=request_headers).text
# Display URLs as shown by Baidu, e.g. "cc.toutiao.com/".
showurl_pattern = re.compile('class="c-showurl c-color-gray".*?>(.*?)<')
for hit in showurl_pattern.findall(html_str):
    # A real sub-domain entry contains a dot and ends with a "/" path separator;
    # everything after the first "/" is dropped.
    if "." in hit and "/" in hit:
        print(hit.split('/')[0])
自定义查询子域名
import requests
import re
from urllib.parse import urlencode
#查询子域名
def scan_domain(domain):
    """Scrape the first Baidu results page for ``site:<domain>``.

    Returns a list of sub-domain strings (the display URLs with their
    path component stripped). Requires network access to baidu.com.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
    }
    html = requests.get("https://www.baidu.com/s?wd=site:%s" % domain, headers=headers).text
    # Each result card exposes its host in a "c-showurl" span.
    hits = re.findall('class="c-showurl c-color-gray".*?>(.*?)<', html)
    # Keep host-like entries only and cut everything after the first "/".
    return [hit.split('/')[0] for hit in hits if "." in hit and "/" in hit]
if __name__ == "__main__":
    # Ask the user which domain to enumerate, then show what Baidu reveals.
    target = input("Input scan domain:")
    print(scan_domain(target))
接下来爬取多页
翻页观察链接特点
import requests
import re
from urllib.parse import urlencode
#查询子域名
def scan_domain(domain, pages):
    """Collect sub-domains from the first *pages* Baidu result pages.

    Queries ``site:<domain>`` once per page (Baidu paginates with
    ``pn = page_index * 10``) and returns the accumulated list of hosts,
    duplicates included. Requires network access to baidu.com.
    """
    # Headers are identical for every page, so build them once up front.
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
    }
    collected = []
    for page_index in range(pages):
        page_url = "https://www.baidu.com/s?wd=site:%s&pn=%d" % (domain, page_index * 10)
        html = requests.get(page_url, headers=headers).text
        # Display URLs sit in "c-showurl" spans; strip the path, keep the host.
        for hit in re.findall('class="c-showurl c-color-gray".*?>(.*?)<', html):
            if "." in hit and "/" in hit:
                collected.append(hit.split('/')[0])
    return collected
if __name__ == "__main__":
    # Prompt for the target domain and how many Baidu result pages to walk.
    target = input("Input scan domain:")
    page_count = int(input("input scrapy page:"))
    print(scan_domain(target, page_count))
多线程:
import requests
import re
from urllib.parse import urlencode
import threading
#查询子域名
def scan_domain(domain, page):
    """Fetch one Baidu results page for ``site:<domain>`` (thread worker).

    *page* is the zero-based page index; Baidu paginates with
    ``pn = page * 10``. Prints the request URL and the sub-domains it
    found, and also returns them as a list. Requires network access.
    """
    url = "https://www.baidu.com/s?wd=site:%s&pn=%d" % (domain, page * 10)
    print(url)
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36"
    }
    html = requests.get(url, headers=headers).text
    # Hosts appear as display URLs inside "c-showurl" spans.
    hits = re.findall('class="c-showurl c-color-gray".*?>(.*?)<', html)
    sub_domains = [hit.split('/')[0] for hit in hits if "." in hit and "/" in hit]
    print(sub_domains)
    return sub_domains
if __name__ == "__main__":
    # Prompt for the target domain and the number of result pages; each page
    # is fetched by its own thread (scan_domain prints its own findings).
    domain = input("Input scan domain:")
    pages = int(input("input scrapy page:"))
    # sub_domains = scan_domain(domain,pages)
    threads = []
    for page in range(pages):
        t = threading.Thread(target=scan_domain, args=(domain, page))
        threads.append(t)
    for t in threads:
        t.start()
    # Bug fix: join every worker. Without this the main thread falls through
    # immediately after starting the threads, so any follow-up step (e.g.
    # writing the results to subdomain/domain.txt, as the file header
    # promises) would run before the scans have finished.
    for t in threads:
        t.join()
(未完)