def crawl_sitemap(url):
    """Download a sitemap XML file and crawl every <loc> link it lists.

    Parameters
    ----------
    url : str
        URL of the sitemap XML file.

    Returns
    -------
    list[str]
        The downloaded HTML of each page linked from the sitemap.
        (The original discarded each page into an overwritten local;
        returning the collected pages is backward-compatible since the
        original implicitly returned None and callers ignore it.)
    """
    # Download the sitemap itself; the 2 is presumably a retry count for
    # download_page -- TODO confirm against download_page's definition.
    sitemap = download_page(url, 2)
    # Extract every <loc>...</loc> link from the sitemap XML.
    # Raw string avoids any escape-sequence surprises in the pattern.
    links = re.findall(r'<loc>(.*?)</loc>', sitemap)
    # Download each linked page, collecting the results instead of
    # clobbering a single throwaway variable on every iteration.
    return [download_page(link, 2) for link in links]
if __name__ == '__main__':
    # Earlier candidate URLs, kept for reference (they were dead
    # assignments -- immediately overwritten in the original):
    # url = "https://www.meetup.com/"
    # url = 'https://zhidao.baidu.com/question/2073804096754701028.html'
    # Fix: the original string had an accidental trailing space.
    url = 'http://example.webscraping.com/sitemap.xml'
    crawl_sitemap(url)
    # page_buf = download_page(url, 2 , '127.0.0.1:8087')
# Sample output from a run (kept as a comment so the file parses):
#
# downloading: http://example.webscraping.com/sitemap.xml
# downloading: http://example.webscraping.com/view/Afghanistan-1
# downloading: http://example.webscraping.com/view/Aland-Islands-2
# downloading: http://example.webscraping.com/view/Albania-3
# download failed: timed out
# downloading: http://example.webscraping.com/view/Algeria-4
# downloading: http://example.webscraping.com/view/American-Samoa-5
# downloading: http://example.webscraping.com/view/Andorra-6
# download failed: timed out
# downloading: http://example.webscraping.com/view/Angola-7
# downloading: http://example.webscraping.com/view/Anguilla-8
# downloading: http://example.webscraping.com/view/Antarctica-9
# download failed: timed out
# downloading: http://example.webscraping.com/view/Antigua-and-Barbuda-10
# download failed: timed out
# downloading: http://example.webscraping.com/view/Argentina-11
# downloading: http://example.webscraping.com/view/Armenia-12
# downloading: http://example.webscraping.com/view/Aruba-13
# downloading: http://example.webscraping.com/view/Australia-14
# downloading: http://example.webscraping.com/view/Austria-15