Depth-first search (DFS) is an algorithm for traversing or searching a tree or graph. It walks the nodes along the depth of the tree, exploring each branch as far as possible. When every edge incident to a node v has been explored, or the node fails the search condition, the search backtracks to the node from which v was discovered. The process repeats until all nodes have been visited.
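As a minimal sketch of the traversal itself (the adjacency list below is a made-up example, separate from the crawler that follows), a recursive DFS can be written like this:

# Minimal recursive DFS over a hypothetical adjacency-list graph.
graph = {
    'A': ['B', 'C'],
    'B': ['D'],
    'C': ['D'],
    'D': [],
}

def dfs(node, visited=None):
    if visited is None:
        visited = set()
    visited.add(node)
    print(node)                      # "visit" the node
    for neighbor in graph[node]:
        if neighbor not in visited:  # only descend into unvisited neighbors
            dfs(neighbor, visited)

dfs('A')  # prints A B D C, one possible depth-first order

The crawler below applies the same idea to web pages: it keeps a list of URLs and always takes the most recently discovered one (LIFO), so it follows each chain of links as deep as possible before moving on.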
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}

# Depth-first URL container: a LIFO stack of URLs waiting to be crawled.
class DFS:
    def __init__(self):
        # URLs waiting to be crawled (used as a stack)
        self.dfs = []
        # URLs that have already been crawled
        self.crawled = []

    # Store a URL only if it has not been crawled or queued yet.
    def save_url(self, url):
        if url not in self.crawled and url not in self.dfs:
            self.dfs.append(url)

    # Take the most recently added URL (LIFO -> depth-first).
    def get_url(self):
        url = self.dfs.pop()
        self.crawled.append(url)
        return url


class Crawl:
    def __init__(self):
        self.dfs = DFS()

    def crawler(self):
        url = 'http://www.xbiquge.la/'
        self.dfs.save_url(url)
        # Keep going until no unvisited URLs remain on the stack.
        while self.dfs.dfs:
            new_url = self.dfs.get_url()
            print(new_url)
            try:
                res = requests.get(url=new_url, headers=headers, timeout=3)
                ele = etree.HTML(res.content.decode())
                urls = ele.xpath("//a/@href")
                print(ele.xpath("//title/text()"))
            except Exception:
                # Skip URLs that fail to download or parse.
                continue
            for i in urls:
                if self.check_url(i):
                    self.dfs.save_url(i)
            print(self.dfs.dfs)

    # Only keep absolute http/https links that are not .exe downloads.
    def check_url(self, url):
        return (url.startswith('http://') or url.startswith('https://')) and not url.endswith('.exe')


if __name__ == '__main__':
    crawl = Crawl()
    crawl.crawler()
Breadth-first search (BFS) is one of the simplest graph search algorithms and serves as the prototype for many important graph algorithms; Dijkstra's single-source shortest-path algorithm and Prim's minimum-spanning-tree algorithm both use ideas similar to breadth-first search. BFS is a blind (uninformed) search: its goal is to systematically expand and check every node in the graph in order to find the result. In other words, it does not consider where the result is likely to be, but searches the whole graph exhaustively until the result is found.
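As a minimal sketch (again over a made-up adjacency list, separate from the crawler below), an iterative BFS with a FIFO queue looks like this:

from collections import deque

# Minimal iterative BFS over a hypothetical adjacency-list graph.
graph = {
    'A': ['B', 'C'],
    'B': ['D'],
    'C': ['D'],
    'D': [],
}

def bfs(start):
    visited = {start}
    queue = deque([start])
    while queue:
        node = queue.popleft()       # take the oldest node (FIFO)
        print(node)                  # "visit" the node
        for neighbor in graph[node]:
            if neighbor not in visited:
                visited.add(neighbor)
                queue.append(neighbor)

bfs('A')  # prints A B C D, level by level

The crawler below follows the same pattern; it uses a plain list and pops from the front with pop(0) to stay symmetric with the DFS version (collections.deque with popleft() would be the more efficient queue).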
import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'
}

# Breadth-first URL container: a FIFO queue of URLs waiting to be crawled.
class BFS:
    def __init__(self):
        # URLs waiting to be crawled (used as a queue)
        self.bfs = []
        # URLs that have already been crawled
        self.crawled = []

    # Store a URL only if it has not been crawled or queued yet.
    def save_url(self, url):
        if url not in self.crawled and url not in self.bfs:
            self.bfs.append(url)

    # Take the oldest URL (FIFO -> breadth-first).
    def get_url(self):
        url = self.bfs.pop(0)
        self.crawled.append(url)
        return url


class Crawl:
    def __init__(self):
        self.bfs = BFS()

    def crawler(self):
        url = 'http://www.baidu.com'
        self.bfs.save_url(url)
        # Keep going until no unvisited URLs remain in the queue.
        while self.bfs.bfs:
            new_url = self.bfs.get_url()
            print(new_url)
            try:
                res = requests.get(url=new_url, headers=headers, timeout=3)
                ele = etree.HTML(res.content.decode())
                urls = ele.xpath("//a/@href")
                print(ele.xpath("//title/text()"))
            except Exception:
                # Skip URLs that fail to download or parse.
                continue
            for i in urls:
                if self.check_url(i):
                    self.bfs.save_url(i)
            print(self.bfs.bfs)

    # Only keep absolute http/https links that are not .exe downloads.
    def check_url(self, url):
        return (url.startswith('http://') or url.startswith('https://')) and not url.endswith('.exe')


if __name__ == '__main__':
    crawl = Crawl()
    crawl.crawler()
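The only real difference between the two crawlers is how get_url removes URLs from the container: pop() takes the most recently added URL (a stack, so the crawl goes depth-first down each chain of links), while pop(0) takes the oldest one (a queue, so the crawl goes breadth-first, level by level from the start page).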