Linux 下的 Shell 版 URL 爬虫

最新推荐文章于 2022-03-07 22:48:51 发布

光年之外～

最新推荐文章于 2022-03-07 22:48:51 发布

阅读量295

点赞数

分类专栏：理论

本文链接：https://blog.csdn.net/qq_41671415/article/details/118656585

版权

理论专栏收录该内容

25 篇文章 0 订阅

订阅专栏

这个脚本采用广度优先搜索（BFS）策略，从用户输入的URL开始，探测指定层数的链接，避免已知路径，探索网站的整体结构。通过剔除特定域名并进行URL去重，最终将结果保存到httpbug.txt文件中，为网站爬虫提供了一种基础实现。

摘要由CSDN通过智能技术生成

小制作，大佬勿喷。原理就是链式反应：先抓取起始页面中的链接，再逐层抓取这些链接指向的页面。有兴趣的可以在其基础上进行改版增强。

#!/bin/bash
#
# Interactive input stage of a breadth-first URL crawler.
#
# Prompts for a start URL, a crawl depth and a site to exclude, re-prompting
# on invalid input for a limited number of attempts.
#
# Sets:    url    - start URL (single word beginning with http:// or https://)
#          deep   - crawl depth, normalised to a base-10 non-negative integer
#          except - free-form text naming a site to exclude (may be empty)
# Exits:   status 1 when every attempt was invalid, so that later code never
#          runs with an unusable URL or depth.
echo "请注意此脚本是广度优先工作！！！"

# A start URL must be one whitespace-free word with an http(s) scheme.
url_pattern='^https?://[^[:space:]]+$'
# The depth must be a plain non-negative decimal integer.
depth_pattern='^[0-9]+$'

i=3   # retries left after the current attempt (four attempts in total)
while (( i >= 0 )); do
  # Example input: https://www.zstack.io/ 2
  read -r -p "输入一个url和探测层数,空格隔开: " url deep
  read -r -p "输入要排除的网站：" except

  # Validation: both the URL and the depth must be well-formed to proceed.
  if [[ ! "$url" =~ $url_pattern ]]; then
    echo -e "请输入正确url\n"
  elif [[ ! "$deep" =~ $depth_pattern ]]; then
    echo -e "请输入正确的探测层数\n"
  else
    # Force base 10 so input such as "08" is not rejected as bad octal later.
    deep=$((10#$deep))
    break
  fi

  echo "还有$i次机会"
  if (( i == 0 )); then
    # Out of attempts: stop here instead of continuing with bad input.
    echo "输入无效次数过多，脚本退出" >&2
    exit 1
  fi
  i=$((i - 1))
done
# Crawl stage (breadth-first).
#
# Reads:   url    - start URL
#          deep   - number of levels to follow below the start page
#          except - optional fixed string; links containing it are dropped
# Writes:  httpbug.txt - sorted, de-duplicated list of every link found
#
# Level 0 fetches the start page. Each further level fetches only the links
# that are known but have not been fetched yet, so no page is requested twice
# and no newly discovered link is skipped.

out_file=httpbug.txt

# Link matcher. The original character class is kept and extended with
# '-', '_', '~', '%' and ':' so that hyphenated hosts, ports and
# percent-encoded paths are no longer cut short.
link_re='https?://[a-zA-Z0-9\.+\/*_~%:-]*'

# Sites that are always dropped from the result.
builtin_excluded='w3c|google.com|w3.org'

# Print every link found in the HTML read from stdin, one per line.
extract_links() {
  grep -E -o -- "$link_re"
}

# Filter stdin, removing built-in exclusions and the user-supplied site.
drop_excluded() {
  if [[ -n "${except:-}" ]]; then
    grep -E -v -- "$builtin_excluded" | grep -F -v -- "$except"
  else
    grep -E -v -- "$builtin_excluded"
  fi
}

# Without a usable depth, fall back to fetching the start page only.
if [[ ! "${deep:-}" =~ ^[0-9]+$ ]]; then
  echo "探测层数无效，仅抓取第一层" >&2
  deep=0
fi
deep=$((10#$deep))

# Private scratch space, removed on every exit path.
work_dir=$(mktemp -d) || { echo "无法创建临时目录" >&2; exit 1; }
trap 'rm -rf -- "$work_dir"' EXIT
visited="$work_dir/visited"     # URLs already fetched (sorted)
frontier="$work_dir/frontier"   # URLs to fetch at the current level
page="$work_dir/page"           # body of the start page

# First layer: fetch the start page and seed the result file.
if ! curl -s "$url" > "$page"; then
  echo "无法访问 $url" >&2
  exit 1
fi
extract_links < "$page" | drop_excluded | LC_ALL=C sort -u > "$out_file"
printf '%s\n' "$url" > "$visited"

# One pass per level, counting deep down to zero.
while (( deep > 0 )); do
  # Frontier = known links minus fetched links (both files are C-sorted).
  LC_ALL=C comm -23 "$out_file" "$visited" > "$frontier"
  [[ -s "$frontier" ]] || break   # nothing new to explore

  echo -e "第$deep层遍历\n"

  while IFS= read -r link; do
    printf '%s,目前层数%s\n' "$link" "$deep"
    # Append straight away so an interrupted run still keeps its findings.
    curl -m 3 -s "$link" < /dev/null | extract_links | drop_excluded >> "$out_file"
    echo "-------"
  done < "$frontier"

  # Record this level as fetched, then de-duplicate the result file once.
  LC_ALL=C sort -u -o "$visited" "$visited" "$frontier"
  LC_ALL=C sort -u -o "$out_file" "$out_file"

  deep=$((deep - 1))
done

echo "内容存在httpbug.txt"
# Earlier experiment kept for reference (extracts relative href paths):
#curl  -s  https://www.zstack.io/cases/ | egrep -o "href=".*/\"$""  |egrep -v ".*//.*"| egrep -o "\/.*\/"