我明白了。 使用美丽的汤递归URL解析的代码:
import requests
import urllib2
from bs4 import BeautifulSoup
link_set = set()
give_url = raw_input("Enter url:\t")
def magic(give_url, link_set, count):
# print "______________________________________________________"
#
# print "Count is: " + str(count)
# count += 1
# print "THE URL IT IS SCRAPPING IS:" + give_url
page = urllib2.urlopen(give_url)
page_content = page.read()
with open('page_content.html', 'w') as fid:
fid.write(page_content)
response = requests.get(give_url)
html_data = response.text
soup = BeautifulSoup(html_data)
list_items = soup.find_all('a')
for each_item in list_items:
html_link = each_item.get('href')
if(html_link is None):
pass
else:
if(not (html_link.startswith('http') or html_link.startswith('https'))):
link_set.add(give_url + html_link)
else:
link_set.add(html_link)
# print "Total links in the given url are: " + str(len(link_set))
magic(give_url,link_set,0)
link_set2 = set()
link_set3 = set()
for element in link_set:
link_set2.add(element)
count = 1
for element in link_set:
magic(element,link_set3,count)
count += 1
for each_item in link_set3:
link_set2.add(each_item)
link_set3.clear()
count = 1
print "Total links scraped are: " + str(len(link_set2))
for element in link_set2:
count +=1
print "Element number " + str(count) + "processing"
print element
print "\n"
有很多错误,所以我要求你们都请告诉我在哪里可以提高代码。