#python 2
#-*-coding:utf-8-*-
import requests
import re
# Root domain whose subdomains are looked up through a Baidu "site:" search.
key = 'qq.com'
# Every raw regex capture from every fetched page (duplicates included).
sites = []
# Captures the text between the result link's inline style attribute and the
# next "/" in the page source.
match = 'style="text-decoration:none;">(.*?)/'

# The pn query value steps through 0, 10, ..., 470 (48 requests in total).
# NOTE(review): pn is presumably Baidu's result offset at 10 hits per page.
for offset in range(0, 480, 10):
    url = "http://www.baidu.com.cn/s?wd=site:" + key + "&cl=3&pn=%s" % offset
    page = requests.get(url).content
    sites.extend(re.findall(match, page))

# set() removes duplicates; the resulting order is arbitrary.
site = list(set(sites))
# Single-argument print(...) behaves the same as the Python 2 print statement.
print("The number of sites is %d" % len(site))
for name in site:
    print(name)
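# Python 3: the data returned by requests.get(...).content is bytes, so it must be decoded to str before it is passed to re.findall.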
#python 3
import requests
import re
# Root domain whose subdomains are looked up through a Baidu "site:" search.
key = 'qq.com'
# Every raw regex capture from every fetched page (duplicates included).
sites = []
# Captures the text between the result link's inline style attribute and the
# next "/" in the page source.
match = 'style="text-decoration:none;">(.*?)/'

# pn values 0, 10, ..., 470 -- one request per value, 48 requests in total.
offsets = [page_index * 10 for page_index in range(48)]
for offset in offsets:
    url = "http://www.baidu.com.cn/s?wd=site:" + key + "&cl=3&pn=%s" % offset
    raw = requests.get(url).content
    # .content is bytes in Python 3; decode it so the str pattern can be used.
    text = raw.decode('utf8')
    sites.extend(re.findall(match, text))

# set() removes duplicates; the resulting order is arbitrary.
site = list(set(sites))
print("The number of sites is %d" % len(site))
for entry in site:
    print(entry)
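# Unfortunately, Baidu has recently become stricter about scrapers: the request needs a layer of disguise (a browser-like User-Agent header) before the domains can be scraped. The workaround is shown below.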
#python 3
import requests
import re
# Root domain to look for with a Baidu "inurl:" search.
key = 'jd.com'
# Every raw regex capture from every fetched page (duplicates included).
sites = []
# Sent with every request so that it carries a desktop Chrome User-Agent
# instead of the default one used by the requests library.
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36',
}
# Captures the text between the result link's inline style attribute and the
# next closing </b> tag; captures may still contain an opening <b>.
match = 'style="text-decoration:none;">(.*?)</b>'


def inurl_page_url(domain, page_index):
    """Return the Baidu ``inurl:`` search URL for one result page.

    Args:
        domain: Domain text placed after ``inurl:`` in the query.
        page_index: Zero-based result page number.

    Returns:
        The search URL with ``pn`` set to ``page_index * 10``.
    """
    # Without a pn value every request returns the same first page, so the
    # loop index must be turned into a result offset (10 hits per page).
    return ("https://www.baidu.com/s?ie=UTF-8&wd=inurl%3A" + domain
            + "&pn=%d" % (page_index * 10))


if __name__ == "__main__":
    for page_index in range(20):
        url = inurl_page_url(key, page_index)
        # A timeout keeps one stalled connection from hanging the whole scan.
        response = requests.get(url, headers=head, timeout=10).content
        subdomains = re.findall(match, response.decode('utf8'))
        print(subdomains)
        sites.extend(subdomains)

    # set() removes duplicates; the resulting order is arbitrary.
    site = list(set(sites))
    print("The number of sites is %d" % len(site))
    for name in site:
        print(name)
import requests
import re
# Sent with every request so that it carries a desktop Chrome User-Agent
# instead of the default one used by the requests library.
head = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36',
}
key = 'jd.com'  # Put the root domain to search for here.
# Unique matching host names, in the order they were first seen.
lst = []
# Captures the text between the result link's inline style attribute and the
# next closing </b> tag; captures may still contain an opening <b>.
match = 'style="text-decoration:none;">(.*?)</b>'


def paged_search_url(domain, page):
    """Return the Baidu ``inurl:`` search URL for a one-based result page.

    Args:
        domain: Domain text used for both the ``wd`` and ``oq`` parameters.
        page: One-based result page number (page 1 is the first page).

    Returns:
        The search URL with ``pn`` set to ``(page - 1) * 10``.
    """
    # pn is a result offset, not a page number: passing the page number
    # itself makes successive requests overlap almost completely.
    offset = (page - 1) * 10
    return "https://www.baidu.com/s?wd=inurl:{}&pn={}&oq={}&ie=utf-8".format(
        domain, offset, domain)


def merge_hits(labels, domain, found):
    """Add new labels that mention ``domain`` to ``found``, preserving order.

    Args:
        labels: Raw regex captures; an embedded ``<b>`` tag is removed.
        domain: Only labels containing this text are kept.
        found: List that is extended in place with labels not yet present.

    Returns:
        The same ``found`` list, for convenience.
    """
    for label in labels:
        label = label.replace('<b>', '')
        if domain in label and label not in found:
            found.append(label)
    return found


if __name__ == "__main__":
    for page in range(1, 20):  # result pages 1-19
        url = paged_search_url(key, page)
        print(url)
        # A timeout keeps one stalled connection from hanging the whole scan.
        response = requests.get(url, headers=head, timeout=10).content
        subdomains = re.findall(match, response.decode())
        merge_hits(subdomains, key, lst)
    print(lst)