有时需要通过搜索引擎批量获取具有某种特征的 URL 地址,人工一页页翻太麻烦,于是写了个脚本(基于百度搜索)自动抓取,并在拿到每个 URL 后顺带获取该网页的 title。
使用环境:
python3
requests
bs4
lxml
import json
import re
import sys
from urllib.parse import quote

import requests
from bs4 import BeautifulSoup
def _extract_result_url(data_tools):
    """Return the ``url`` value embedded in a result's ``data-tools`` attribute."""
    try:
        parsed = json.loads(data_tools)
    except (TypeError, ValueError):
        parsed = None
    if isinstance(parsed, dict) and parsed.get('url'):
        return str(parsed['url'])
    # Fallback for attribute values that are not valid JSON: keep the legacy
    # slicing, which assumes the text ends with `"url":"<value>"}`.
    text = str(data_tools)
    return text[text.find('"url"'):][7:-2]


def get_url(google_hack, start, stop):
    """Search Baidu for ``google_hack`` and print each hit's final URL and title.

    Every ``<div>`` on a result page that carries a ``data-tools`` attribute is
    treated as one hit. The hit's URL is fetched (redirects are followed by
    ``requests``) and ``"<final_url> <title>"`` is printed for it.

    Args:
        google_hack: Search expression, e.g. ``inurl:asp?id=1``. It is
            percent-encoded before being placed in the query string.
        start: 1-based number of the first result page (int or numeric str).
        stop: Number of result pages to walk, counted from ``start`` (int or
            numeric str). NOTE(review): the command-line usage text calls this
            "stop_page", but the arithmetic treats it as a page count --
            confirm which is intended.

    Returns:
        None. Results and per-hit error messages are written to stdout.

    Raises:
        ValueError: If ``start`` or ``stop`` cannot be converted to ``int``.
        requests.RequestException: If a Baidu result page cannot be fetched.
    """
    headers = {
        'Host': 'www.baidu.com',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36',
    }
    index_url = 'https://www.baidu.com'
    # Encode the expression so characters such as '&', '#' or '+' cannot
    # break out of the `wd` / `oq` query parameters.
    query = quote(google_hack, safe='')

    first_offset = (int(start) - 1) * 10
    end_offset = (int(start) + int(stop) - 1) * 10
    # `pn` is the result offset (10 per page); never request a negative one,
    # which is what start=0 would otherwise produce.
    for offset in range(max(first_offset, 0), end_offset, 10):
        target_url = (
            index_url + '/s?wd=' + query + '&pn=' + str(offset)
            + '&oq=' + query
            + '&tn=93063693_hao_pg&ie=utf-8&usm=1&rsv_pq=93cdb6350000eadd'
            + '&rsv_t=150bff5LzGew8qDr0ARHTq%2BNBvCwnE7s0KgrfxwcY5Sqc4xAsDyOFQIo%2FUOfuybbSkFMa5Cz'
            + '&rsv_jmp=slow'
        )
        search_resp = requests.get(target_url, headers=headers, timeout=15)
        page = BeautifulSoup(search_resp.content, 'lxml')
        for div in page.find_all('div'):
            data_tools = div.get('data-tools')
            if not data_tools:
                continue
            try:
                hit_url = _extract_result_url(data_tools)
                # A timeout keeps one unresponsive site from stalling the run.
                hit_resp = requests.get(hit_url, timeout=15)
                # apparent_encoding can be None when detection fails.
                encoding = hit_resp.apparent_encoding or 'utf-8'
                html = hit_resp.content.decode(encoding, 'replace')
                print(hit_resp.url + ' ' + get_title(html))
            except Exception as exc:
                # Deliberately broad: a single bad hit (invalid URL, network
                # error, unknown codec) must not abort the whole crawl.
                print(exc)
# Matches the first <title> element regardless of tag case or attributes.
_TITLE_RE = re.compile(r'<title\b[^>]*>(.*?)</title\s*>', re.IGNORECASE | re.DOTALL)


def get_title(data):
    """Return the text of the first ``<title>`` element in ``data``.

    Args:
        data: HTML document text.

    Returns:
        The title with surrounding whitespace stripped, or ``''`` when
        ``data`` is not a string or contains no complete ``<title>`` element.
    """
    if not isinstance(data, str):
        return ''
    found = _TITLE_RE.search(data)
    if found is None:
        return ''
    return found.group(1).strip()
if __name__ == '__main__':
    # Command-line entry point: <script> google_hack start_page stop_page
    msg = '''
python3 %s google_hack start_page stop_page
example:
python3 %s "inurl:asp?id=1" 0 20
'''%(sys.argv[0],sys.argv[0])
    if len(sys.argv) < 4:
        print(msg)
        # sys.exit rather than the `exit` helper, which the site module only
        # installs for interactive use and may be absent.
        sys.exit(1)
    google_hack, start, stop = sys.argv[1:4]
    # Show the usage text instead of a traceback when the page arguments
    # are not integers.
    try:
        int(start)
        int(stop)
    except ValueError:
        print(msg)
        sys.exit(1)
    get_url(google_hack, start, stop)
使用效果图
转载于:https://blog.51cto.com/z2ppp/1979149