提取整个页面的链接（通用）
from html.parser import HTMLParser
from urllib import parse


class LinkFinder(HTMLParser):
    """Collect the same-site links found in ``<a href=...>`` tags.

    Feed HTML text through ``feed()``; every anchor whose resolved URL lives
    under ``base_url`` is stored and can be read back with ``page_links()``.

    Args:
        base_url: Site root, optionally with a path prefix
            (e.g. ``"https://www.csai.cn"``). Only links under it are kept.
        page_url: URL of the page being parsed. Relative hrefs are resolved
            against it; ``base_url`` is used when it is empty.
    """

    def __init__(self, base_url, page_url):
        super().__init__()
        self.base_url = base_url
        self.page_url = page_url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        """Record the target of an opening ``<a>`` tag.

        ``HTMLParser.feed()`` calls this for each opening tag it encounters.

        Args:
            tag: Tag name as reported by ``HTMLParser``.
            attrs: ``(name, value)`` pairs; ``value`` is ``None`` for an
                attribute written without a value (``<a href>``).
        """
        if tag != 'a':
            return
        for attribute, value in attrs:
            if attribute != 'href':
                continue
            href = (value or '').strip()
            if not href:
                # A valueless or blank href names no target.
                continue
            try:
                # Relative links are relative to the page, not the site root.
                url = parse.urljoin(self.page_url or self.base_url, href)
                internal = self._is_internal(url)
            except ValueError:
                # Malformed href (e.g. "http://["): skip it instead of
                # aborting the parse of the whole page.
                continue
            if internal:
                self.links.add(url)

    def _is_internal(self, url):
        """Return True when *url* shares scheme, host and path prefix with ``base_url``."""
        base = parse.urlsplit(self.base_url)
        target = parse.urlsplit(url)
        # Compare parsed components: a substring test would also accept
        # foreign URLs that merely contain base_url (query strings,
        # look-alike hosts such as "www.csai.cn.evil.example").
        return (
            target.scheme == base.scheme
            and target.netloc.lower() == base.netloc.lower()
            and target.path.startswith(base.path)
        )

    def page_links(self):
        """Return the set of links collected so far."""
        return self.links

    def error(self, message):
        """Ignore parser errors.

        NOTE(review): presumably kept for older Python versions whose parser
        base class expects subclasses to provide ``error()`` -- confirm.
        """
        pass


def main():
    """Fetch the sample page and print its same-site links."""
    # Imported here so LinkFinder stays importable without the third-party
    # HTTP client; only the command-line entry point needs it.
    import requests

    start_url = "https://www.csai.cn/baoxian/"
    finder = LinkFinder("https://www.csai.cn", start_url)
    # requests applies no timeout unless one is given explicitly.
    r = requests.get(start_url, timeout=10)
    # Do not scrape links out of an HTTP error page.
    r.raise_for_status()
    finder.feed(r.text)
    # page_links() is already a set; sort it for a stable, readable output.
    urls = sorted(finder.page_links())
    print(urls)


if __name__ == '__main__':
    main()
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
|
from
html
.
parser
import
HTMLParser
from
urllib
import
parse
import
requests
class LinkFinder(HTMLParser):
    """Collect the same-site links found in ``<a href=...>`` tags.

    Feed HTML text through ``feed()``; every anchor whose resolved URL lives
    under ``base_url`` is stored and can be read back with ``page_links()``.

    Args:
        base_url: Site root, optionally with a path prefix
            (e.g. ``"https://www.csai.cn"``). Only links under it are kept.
        page_url: URL of the page being parsed. Relative hrefs are resolved
            against it; ``base_url`` is used when it is empty.
    """

    def __init__(self, base_url, page_url):
        super().__init__()
        self.base_url = base_url
        self.page_url = page_url
        self.links = set()

    def handle_starttag(self, tag, attrs):
        """Record the target of an opening ``<a>`` tag.

        ``HTMLParser.feed()`` calls this for each opening tag it encounters.

        Args:
            tag: Tag name as reported by ``HTMLParser``.
            attrs: ``(name, value)`` pairs; ``value`` is ``None`` for an
                attribute written without a value (``<a href>``).
        """
        if tag != 'a':
            return
        for attribute, value in attrs:
            if attribute != 'href':
                continue
            href = (value or '').strip()
            if not href:
                # A valueless or blank href names no target.
                continue
            try:
                # Relative links are relative to the page, not the site root.
                url = parse.urljoin(self.page_url or self.base_url, href)
                internal = self._is_internal(url)
            except ValueError:
                # Malformed href (e.g. "http://["): skip it instead of
                # aborting the parse of the whole page.
                continue
            if internal:
                self.links.add(url)

    def _is_internal(self, url):
        """Return True when *url* shares scheme, host and path prefix with ``base_url``."""
        base = parse.urlsplit(self.base_url)
        target = parse.urlsplit(url)
        # Compare parsed components: a substring test would also accept
        # foreign URLs that merely contain base_url (query strings,
        # look-alike hosts such as "www.csai.cn.evil.example").
        return (
            target.scheme == base.scheme
            and target.netloc.lower() == base.netloc.lower()
            and target.path.startswith(base.path)
        )

    def page_links(self):
        """Return the set of links collected so far."""
        return self.links

    def error(self, message):
        """Ignore parser errors.

        NOTE(review): presumably kept for older Python versions whose parser
        base class expects subclasses to provide ``error()`` -- confirm.
        """
        pass
if __name__ == '__main__':
    # Demo: fetch one page and print the same-site links found on it.
    start_url = "https://www.csai.cn/baoxian/"
    finder = LinkFinder("https://www.csai.cn", start_url)
    # requests applies no timeout unless one is given explicitly, so a
    # stalled server would otherwise hang the script indefinitely.
    r = requests.get(start_url, timeout=10)
    # Do not scrape links out of an HTTP error page.
    r.raise_for_status()
    finder.feed(r.text)
    # page_links() is already a set; sort it for a stable, readable output.
    urls = sorted(finder.page_links())
    print(urls)
|