我会这样做:
import grequests
from bs4 import BeautifulSoup
def get_urls_from_response(r):
    """
    Extract the link targets of all anchor tags in an HTTP response body.

    Args:
        r: A response object exposing the page markup as ``r.text``, or
            ``None``. ``grequests.map`` yields ``None`` for requests that
            failed, so ``None`` is accepted and treated as "no links".

    Returns:
        A list of the non-empty ``href`` values found on ``<a>`` tags, in
        document order. Values are returned exactly as written in the page
        (they are not resolved against the page URL).
    """
    if r is None:
        return []
    # Name the parser explicitly: otherwise bs4 picks whichever parser happens
    # to be installed (and warns), so results can differ between machines.
    soup = BeautifulSoup(r.text, 'html.parser')
    hrefs = (link.get('href') for link in soup.find_all('a'))
    # Anchors without an href yield None; drop those and empty targets.
    return [href for href in hrefs if href]
def print_url(args):
    """
    Print the URL stored under the ``'url'`` key of ``args``.

    Args:
        args: A mapping that contains a ``'url'`` entry.

    Raises:
        KeyError: If ``args`` has no ``'url'`` key.
    """
    # Function-call form prints the same text on Python 2 and Python 3;
    # the bare ``print x`` statement is a syntax error on Python 3.
    print(args['url'])
def recursive_urls(urls, seen=None):
    """
    Recursively crawl every URL reachable from the given starting URLs.

    Each level of recursion fetches one batch of not-yet-visited URLs
    concurrently, prints the URL of every response received, collects the
    links found in those pages and recurses on them.

    Args:
        urls: Iterable of URLs to fetch. ``None`` and empty entries are
            ignored.
        seen: Optional set of URLs that have already been requested. It is
            updated in place and shared across the recursive calls so that
            link cycles terminate and no page is fetched twice. Callers
            normally omit it.

    Returns:
        None. The crawl ends when a batch yields no unvisited URLs.
    """
    if seen is None:
        seen = set()

    # Keep only real, unvisited URLs; mark them visited before fetching so
    # duplicates inside the same batch are also collapsed.
    pending = []
    for url in urls:
        if url and url not in seen:
            seen.add(url)
            pending.append(url)
    if not pending:
        return

    def report(response, *args, **kwargs):
        # Adapter for the requests 'response' hook: hand print_url the
        # dict shape it expects. Returns None so the response is untouched.
        print_url({'url': response.url})

    # 'response' is the hook event supported by current requests releases;
    # the legacy 'args' event is rejected there.
    rs = [grequests.get(url, hooks=dict(response=report)) for url in pending]
    responses = grequests.map(rs)

    next_urls = []
    for response in responses:
        # grequests.map puts None in place of requests that failed.
        if response is None:
            continue
        next_urls.extend(get_urls_from_response(response))
    recursive_urls(next_urls, seen)
我没有测试过这段代码,但总体思路就是这样.
请注意,为了提升性能,我使用的是 grequests 而不是 requests. grequests 基本上就是 gevent + requests;根据我的经验,它在这类任务上要快得多,因为链接是通过 gevent 异步获取的.
编辑:下面是不使用递归的同一算法:
import grequests
from bs4 import BeautifulSoup
def get_urls_from_response(r):
    """
    Extract the link targets of all anchor tags in an HTTP response body.

    Args:
        r: A response object exposing the page markup as ``r.text``, or
            ``None``. ``grequests.map`` yields ``None`` for requests that
            failed, so ``None`` is accepted and treated as "no links".

    Returns:
        A list of the non-empty ``href`` values found on ``<a>`` tags, in
        document order. Values are returned exactly as written in the page
        (they are not resolved against the page URL).
    """
    if r is None:
        return []
    # Name the parser explicitly: otherwise bs4 picks whichever parser happens
    # to be installed (and warns), so results can differ between machines.
    soup = BeautifulSoup(r.text, 'html.parser')
    hrefs = (link.get('href') for link in soup.find_all('a'))
    # Anchors without an href yield None; drop those and empty targets.
    return [href for href in hrefs if href]
def print_url(args):
    """
    Print the URL stored under the ``'url'`` key of ``args``.

    Args:
        args: A mapping that contains a ``'url'`` entry.

    Raises:
        KeyError: If ``args`` has no ``'url'`` key.
    """
    # Function-call form prints the same text on Python 2 and Python 3;
    # the bare ``print x`` statement is a syntax error on Python 3.
    print(args['url'])
def recursive_urls(urls):
    """
    Crawl every URL reachable from the given starting URLs, level by level.

    This is the iterative form of the crawl (the name is kept so existing
    callers keep working). Each pass fetches one batch of not-yet-visited
    URLs concurrently, prints the URL of every response received, and uses
    the links found in those pages as the next batch.

    Args:
        urls: Iterable of starting URLs. ``None`` and empty entries are
            ignored.

    Returns:
        None. The loop ends when a batch yields no unvisited URLs.
    """
    seen = set()

    def report(response, *args, **kwargs):
        # Adapter for the requests 'response' hook: hand print_url the
        # dict shape it expects. Returns None so the response is untouched.
        print_url({'url': response.url})

    while True:
        # Keep only real, unvisited URLs; without this, link cycles would
        # make the loop run forever and pages would be fetched repeatedly.
        pending = []
        for url in urls:
            if url and url not in seen:
                seen.add(url)
                pending.append(url)
        if not pending:
            break

        # 'response' is the hook event supported by current requests
        # releases; the legacy 'args' event is rejected there.
        rs = [grequests.get(url, hooks=dict(response=report))
              for url in pending]
        responses = grequests.map(rs)

        urls = []
        for response in responses:
            # grequests.map puts None in place of requests that failed.
            if response is None:
                continue
            urls.extend(get_urls_from_response(response))
if __name__ == "__main__":
    # Script entry point: replace the placeholder with the real seed URLs.
    start_urls = ["INITIAL_URLS"]
    recursive_urls(start_urls)