# Collect all URLs on the current page with a regular expression, store them
# in a set (deduplicating), then walk that set with a graph-style depth-first
# traversal to crawl the whole site.
from urllib import request
from bs4 import BeautifulSoup as bs
import re
import time
url = "http://xxxxx.jinan.cn/"
visited = set()
def get_local_pages(url):
    """Fetch *url* and return a set of absolute same-site ``.html`` links on it.

    Returns an empty set when the page cannot be opened, so the caller can
    iterate the result unconditionally (the original returned ``None`` here,
    which crashed the recursive caller).
    """
    try:
        time.sleep(1)  # throttle: be polite to the server
        web = request.urlopen(url=url)
    except Exception:  # narrowed from a bare except; still best-effort
        print("Open url", url, "failed error!")
        return set()
    soup = bs(web.read(), 'html.parser')
    tags = soup.find_all(name="a")  # <a> tags carry the hyperlinks
    # Capture just the path with a group (non-greedy) instead of slicing the
    # raw href="..." match by hard-coded character offsets.
    href_re = re.compile(r'href="(/.+?\.html)"')
    pages = set()
    for tag in tags:
        match = href_re.search(str(tag))
        if not match:
            continue
        path = match.group(1)
        # Skip links into the "jnyzh" section. The original compared the
        # 4-char slice page[7:11] with the 5-char string "jnyzh", which can
        # never be equal, so the filter silently admitted everything.
        if path.lstrip("/").startswith("jnyzh"):
            continue
        pages.add("http://xxxxx.jinan.cn/" + path.lstrip("/"))
    print(pages)
    return pages
def dfs(pages):
    """Depth-first crawl: visit every URL reachable from the set *pages*.

    Uses the module-level ``visited`` set so each page is fetched at most once.
    """
    global visited
    # The original guard was ``if pages is set()`` — an identity comparison
    # against a freshly created empty set, which is never true. Truthiness
    # also covers None (a failed fetch) and the empty set.
    if not pages:
        return
    for page in pages:
        if page in visited:
            continue
        print("Visiting", page)
        visited.add(page)
        # Bind the children to a new name instead of rebinding ``pages``
        # while it is being iterated.
        child_pages = get_local_pages(page)
        dfs(child_pages)
if __name__ == "__main__":
    # Seed the crawl with the site root and walk every reachable page.
    dfs(get_local_pages(url))