Python 爬虫：提取整个页面中所有链接的通用方法

提取整个页面中所有链接的通用方法

Python
from html.parser import HTMLParser
from urllib import parse


class LinkFinder(HTMLParser):
    """Collect the same-site hyperlinks found in an HTML document.

    Pass markup to ``feed()``; every ``<a href="...">`` whose resolved URL
    belongs to the site given by ``base_url`` is stored in ``links``.

    Args:
        base_url: Root URL of the site; only links under it are kept.
        page_url: URL of the page being parsed. Relative hrefs are resolved
            against it (falling back to ``base_url`` when it is empty).
    """

    def __init__(self, base_url, page_url):
        super().__init__()
        self.base_url = base_url
        self.page_url = page_url
        self.links = set()

    def _is_internal(self, url):
        """Return True when *url* is on the same site as ``base_url``."""
        base = parse.urlsplit(self.base_url)
        if not base.netloc:
            # base_url is not an absolute URL, so there is no host to
            # compare; keep the plain substring test as a fallback.
            return self.base_url in url
        target = parse.urlsplit(url)
        # Compare parsed components instead of searching for a substring, so
        # "https://evil.example/?r=<base_url>" and "<host>.evil.example"
        # are not mistaken for internal links.
        return (
            target.scheme == base.scheme
            and target.netloc.lower() == base.netloc.lower()
            and target.path.startswith(base.path.rstrip('/'))
        )

    def handle_starttag(self, tag, attrs):
        """Record the target of every opening ``<a>`` tag seen by ``feed()``."""
        if tag != 'a':
            return
        for attribute, value in attrs:
            # ``<a href>`` without a value arrives as None; skip it and
            # empty hrefs, which carry no target.
            if attribute != 'href' or not value:
                continue
            try:
                url = parse.urljoin(self.page_url or self.base_url, value.strip())
                internal = self._is_internal(url)
            except ValueError:
                # A malformed href must not abort parsing of the whole page.
                continue
            if internal:
                self.links.add(url)

    def page_links(self):
        """Return the set of collected links."""
        return self.links

    def error(self, message):
        """Ignore parser errors (no-op override)."""
        pass


if __name__ == '__main__':
    # Imported here so that LinkFinder stays importable without the
    # third-party dependency, which only this demo needs.
    import requests

    page_url = "https://www.csai.cn/baoxian/"
    finder = LinkFinder("https://www.csai.cn", page_url)
    # Without a timeout requests may block forever on an unresponsive host.
    r = requests.get(page_url, timeout=10)
    r.raise_for_status()
    finder.feed(r.text)
    urls = sorted(finder.page_links())
    print(urls)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from html . parser import HTMLParser
from urllib import parse
import requests
 
 
class LinkFinder(HTMLParser):
    """Collect the same-site hyperlinks found in an HTML document.

    Pass markup to ``feed()``; every ``<a href="...">`` whose resolved URL
    belongs to the site given by ``base_url`` is stored in ``links``.

    Args:
        base_url: Root URL of the site; only links under it are kept.
        page_url: URL of the page being parsed. Relative hrefs are resolved
            against it (falling back to ``base_url`` when it is empty).
    """

    def __init__(self, base_url, page_url):
        super().__init__()
        self.base_url = base_url
        self.page_url = page_url
        self.links = set()

    def _is_internal(self, url):
        """Return True when *url* is on the same site as ``base_url``."""
        base = parse.urlsplit(self.base_url)
        if not base.netloc:
            # base_url is not an absolute URL, so there is no host to
            # compare; keep the plain substring test as a fallback.
            return self.base_url in url
        target = parse.urlsplit(url)
        # Compare parsed components instead of searching for a substring, so
        # "https://evil.example/?r=<base_url>" and "<host>.evil.example"
        # are not mistaken for internal links.
        return (
            target.scheme == base.scheme
            and target.netloc.lower() == base.netloc.lower()
            and target.path.startswith(base.path.rstrip('/'))
        )

    def handle_starttag(self, tag, attrs):
        """Record the target of every opening ``<a>`` tag seen by ``feed()``."""
        if tag != 'a':
            return
        for attribute, value in attrs:
            # ``<a href>`` without a value arrives as None; skip it and
            # empty hrefs, which carry no target.
            if attribute != 'href' or not value:
                continue
            try:
                url = parse.urljoin(self.page_url or self.base_url, value.strip())
                internal = self._is_internal(url)
            except ValueError:
                # A malformed href must not abort parsing of the whole page.
                continue
            if internal:
                self.links.add(url)

    def page_links(self):
        """Return the set of collected links."""
        return self.links

    def error(self, message):
        """Ignore parser errors (no-op override)."""
        pass
 
 
if __name__ == '__main__':
    # Demo: fetch one page and print the internal links found on it.
    page_url = "https://www.csai.cn/baoxian/"
    finder = LinkFinder("https://www.csai.cn", page_url)
    # Without a timeout requests may block forever on an unresponsive host.
    r = requests.get(page_url, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error page for links.
    r.raise_for_status()
    finder.feed(r.text)
    # page_links() already returns a set; sort it for deterministic output.
    urls = sorted(finder.page_links())
    print(urls)
 

效果如图




  • zeropython 微信公众号 5868037 QQ号 5868037@qq.com QQ邮箱
  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值