爬虫基础之深度优先，广度优先策略

最新推荐文章于 2024-07-01 18:09:20 发布

青云--

最新推荐文章于 2024-07-01 18:09:20 发布

阅读量1.4k

点赞数 3

文章标签：爬虫、深度优先、广度优先

本文链接：https://blog.csdn.net/yinjun3215/article/details/108370389

版权

1.深度优先策略之递归方法：

import re
import requests

# HTTP request headers passed to requests.get(); the only entry is a desktop
# browser User-Agent string.
headers = {
    'User-Agent': (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41"
    ),
}

def get_html(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address to request, sent with the module-level ``headers``.
        timeout: Seconds to wait for the server before giving up, so one
            unresponsive host cannot stall the whole crawl.

    Returns:
        The decoded response text, or ``""`` when the request fails.
    """
    try:
        res = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Best-effort crawl: a page that cannot be fetched counts as empty.
        # Only request errors are caught, so Ctrl+C and programming errors
        # are no longer silently swallowed.
        return ""
    return res.text


def get_son_url(url):
    """Return every double-quoted ``href`` value captured from the page at *url*.

    The page source is obtained through ``get_html``. The result is a list of
    the captured strings in the order they match, and is empty when nothing
    matches.
    """
    # re.S lets "." span newlines, so a tag split over several lines still matches.
    href_pattern = re.compile('<a.*?href="(.*?)".*?>', re.S)
    page_source = get_html(url)
    return href_pattern.findall(page_source)

def deep_path(url, max_depth=3):
    """Crawl depth-first from *url* by recursion, printing each level visited.

    Relies on the module-level ``deepdict``, which maps every discovered URL
    to its level and doubles as the visited set. The caller must store the
    level of *url* in ``deepdict`` before calling.

    Args:
        url: Page to visit; ``deepdict[url]`` is its level.
        max_depth: Deepest level that is still visited. Defaults to 3, the
            limit that was previously hard-coded.
    """
    depth = deepdict[url]
    if depth > max_depth:
        return
    print("\t"*depth,"当前层级:%d" % depth)

    if depth == max_depth:
        # Children of this page would lie beyond the limit. Skip the download,
        # and do not mark its links as seen, so they can still be visited if
        # they turn up later at a shallower level.
        return

    # Follow each absolute http/https link that has not been seen yet.
    for sonurl in get_son_url(url):
        if sonurl.startswith('http') and sonurl not in deepdict:
            deepdict[sonurl] = depth + 1
            deep_path(sonurl, max_depth)

if __name__ == '__main__':
    # Start page for the crawl (the query string is percent-encoded).
    url = "https://www.baidu.com/s?wd=%E6%AD%A6%E6%B1%89%E5%85%89%E8%B0%B7"

    # deepdict maps each discovered URL to its crawl level and is read as a
    # global by deep_path; the start page is level 1.
    deepdict = {url: 1}

    deep_path(url)

2.广度优先策略之队列方法：

import re
import requests

# HTTP request headers passed to requests.get(); the only entry is a desktop
# browser User-Agent string.
headers = {
    'User-Agent': (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41"
    ),
}
# Fetch the source of a web page
def get_html(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address to request, sent with the module-level ``headers``.
        timeout: Seconds to wait for the server before giving up, so one
            unresponsive host cannot stall the whole crawl.

    Returns:
        The decoded response text, or ``""`` when the request fails.
    """
    try:
        res = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Best-effort crawl: a page that cannot be fetched counts as empty.
        # Only request errors are caught, so Ctrl+C and programming errors
        # are no longer silently swallowed.
        return ""
    return res.text

# Collect the child URLs of a page
def get_son_url(url):
    """Return every double-quoted ``href`` value captured from the page at *url*.

    The page source is obtained through ``get_html``. The result is a list of
    the captured strings in the order they match, and is empty when nothing
    matches.
    """
    # re.S lets "." span newlines, so a tag split over several lines still matches.
    href_pattern = re.compile('<a.*?href="(.*?)".*?>', re.S)
    page_source = get_html(url)
    return href_pattern.findall(page_source)
# Breadth-first crawl
def vast_path(url, max_depth=3):
    """Crawl breadth-first from *url*, printing the level of each page visited.

    Relies on the module-level ``deepdict``, which maps every discovered URL
    to its level and doubles as the visited set. The caller must store the
    level of *url* in ``deepdict`` before calling.

    Args:
        url: Start page; ``deepdict[url]`` is its level.
        max_depth: Deepest level that is still visited. Pages at this level
            are printed but not downloaded. Defaults to 3, the limit that was
            previously hard-coded.
    """
    # deque gives O(1) removal from the front; list.pop(0) shifts every
    # remaining element on each dequeue.
    from collections import deque

    pending = deque([url])  # FIFO queue, seeded with the start page
    while pending:
        current = pending.popleft()
        depth = deepdict[current]
        print("\t" * depth,'当前层级:%d'%depth)

        if depth >= max_depth:
            continue

        for sonurl in get_son_url(current):
            # Keep only absolute http/https links that have not been seen yet.
            if sonurl.startswith('http') and sonurl not in deepdict:
                deepdict[sonurl] = depth + 1
                pending.append(sonurl)

if __name__ == '__main__':
    # Start page for the crawl (the query string is percent-encoded).
    url = "https://www.baidu.com/s?wd=%E6%AD%A6%E6%B1%89%E5%85%89%E8%B0%B7"

    # deepdict maps each discovered URL to its crawl level and is read as a
    # global by vast_path; the start page is level 1.
    deepdict = {url: 1}

    vast_path(url)

3.深度优先策略之栈方法：

import re
import requests

# HTTP request headers passed to requests.get(); the only entry is a desktop
# browser User-Agent string.
headers = {
    'User-Agent': (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41"
    ),
}
# Fetch the source of a web page
def get_html(url, timeout=10):
    """Fetch *url* and return the response body as text.

    Args:
        url: Address to request, sent with the module-level ``headers``.
        timeout: Seconds to wait for the server before giving up, so one
            unresponsive host cannot stall the whole crawl.

    Returns:
        The decoded response text, or ``""`` when the request fails.
    """
    try:
        res = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Best-effort crawl: a page that cannot be fetched counts as empty.
        # Only request errors are caught, so Ctrl+C and programming errors
        # are no longer silently swallowed.
        return ""
    return res.text

# Collect the child URLs of a page
def get_son_url(url):
    """Return every double-quoted ``href`` value captured from the page at *url*.

    The page source is obtained through ``get_html``. The result is a list of
    the captured strings in the order they match, and is empty when nothing
    matches.
    """
    # re.S lets "." span newlines, so a tag split over several lines still matches.
    href_pattern = re.compile('<a.*?href="(.*?)".*?>', re.S)
    page_source = get_html(url)
    return href_pattern.findall(page_source)
# Depth-first crawl using an explicit stack
def vast_path(url):
    """Crawl from *url* with an explicit stack, printing each level visited.

    Despite the function name, pages are taken from the END of the list
    (last in, first out), so the traversal order is depth-first.

    Relies on the module-level ``deepdict``, which maps every discovered URL
    to its level and doubles as the visited set. The caller must store the
    level of *url* in ``deepdict`` before calling. Pages at level 3 are
    printed but their links are not fetched.
    """
    stack = [url]  # a plain list used as a stack: append pushes, pop() pops
    while stack:
        current = stack.pop()
        level = deepdict[current]
        print("\t" * level,'当前层级:%d'%level)

        if level >= 3:
            continue

        for link in get_son_url(current):
            # Skip anything that is not an absolute http/https link.
            if not link.startswith('http'):
                continue
            # Skip links that were already discovered.
            if link in deepdict:
                continue
            deepdict[link] = level + 1  # a child sits one level below its parent
            stack.append(link)

if __name__ == '__main__':
    # Start page for the crawl (the query string is percent-encoded).
    url = "https://www.baidu.com/s?wd=%E6%AD%A6%E6%B1%89%E5%85%89%E8%B0%B7"

    # deepdict maps each discovered URL to its crawl level and is read as a
    # global by vast_path; the start page is level 1.
    deepdict = {url: 1}

    vast_path(url)

万水千山总是情，点个关注行不行。

青云--

关注

3
点赞
踩
7

收藏

觉得还不错? 一键收藏
0
评论
爬虫基础之深度优先，广度优先策略

1.深度优先递归方式;import reimport requestsheaders = { 'User-Agent':"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36 Edg/85.0.564.41"}def get_html(url): try: res= requests.ge
复制链接

扫一扫