from urllib.request import urlopen
from bs4 import BeautifulSoup
# Fetch the Kevin Bacon wiki page with urllib and print the target of every
# hyperlink on it.
# NOTE: this request sends no custom headers. Per the notes that follow this
# snippet, it is the variant that fails on this site with
# "HTTP Error 302 ... infinite loop".
with urlopen('http://wiki.hk.wjbk.site/wiki/Kevin_Bacon') as html:
    # BeautifulSoup reads the whole response here, so the connection can be
    # closed as soon as parsing returns.
    bs = BeautifulSoup(html, 'html.parser')

for link in bs.find_all('a'):
    # Skip <a> tags that carry no href attribute.
    if 'href' in link.attrs:
        print(link.attrs['href'])
报错:
HTTPError: HTTP Error 302: The HTTP server returned a redirect error that would lead to an infinite loop.
The last 30x error message was:
Moved Temporarily
解决方法:改用 requests 发送请求,并在请求头(headers)中加上浏览器的 User-Agent
from bs4 import BeautifulSoup
import requests
# Fetch the same wiki page with requests, sending a browser-like User-Agent
# header, then print the target of every hyperlink on the page.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
}

# requests has no default timeout; without one an unresponsive server would
# block this script forever.
res = requests.get(
    "http://wiki.hk.wjbk.site/wiki/Kevin_Bacon",
    headers=headers,
    timeout=10,
)
bs = BeautifulSoup(res.text, 'html.parser')

for link in bs.find_all('a'):
    # Skip <a> tags that carry no href attribute.
    if 'href' in link.attrs:
        print(link.attrs['href'])
'''
输出结果:
/w/index.php?title=Kevin_Bacon&redirect=no
#mw-head
#p-search
https://wiki.hk.wjbk.site/baike-File:Kevin_Bacon_Comic-Con_2012.jpg
https://wiki.hk.wjbk.site/baike-%E7%BE%8E%E5%9C%8B
https://wiki.hk.wjbk.site/baike-%E8%B3%93%E5%A4%95%E6%B3%95%E5%B0%BC%E4%BA%9E%E5%B7%9E
https://wiki.hk.wjbk.site/baike-%E8%B2%BB%E5%9F%8E
https://wiki.hk.wjbk.site/baike-%E5%A7%AC%E5%A8%9C%C2%B7%E8%96%9B%E5%9F%9F
/w/index.php?title=%E6%B8%BE%E8%BA%AB%E6%98%AF%E5%8B%81&action=edit&redlink=1
https://wiki.hk.wjbk.site/baike-%E8%AA%B0%E6%AE%BA%E4%BA%86%E7%94%98%E8%BF%BA%E8%BF%AA
https://wiki.hk.wjbk.site/baike-%E8%BB%8D%E5%AE%98%E8%88%87%E9%AD%94%E9%AC%BC
https://wiki.hk.wjbk.site/baike-%E9%A9%9A%E6%BF%A4%E9%A7%AD%E6%B5%AA
/w/index.php?title=%E9%BB%91%E7%8D%84%E9%A2%A8%E9%9B%B2&action=edit&redlink=1
https://wiki.hk.wjbk.site/baike-%E9%98%BF%E6%B3%A2%E7%BD%9713%E5%8F%B7_(%E7%94%B5%E5%BD%B1)
https://wiki.hk.wjbk.site/baike-%E6%9D%B0%E5%85%8B%C2%B7%E6%96%AF%E5%A8%81%E6%A0%BC%E7%89%B9
https://wiki.hk.wjbk.site/baike-%E7%A5%9E%E7%A7%98%E6%B2%B3%E6%B5%81
/w/index.php?title=%E9%9D%9E%E6%B3%95%E5%88%B6%E8%A3%81&action=edit&redlink=1
...
'''
bingo!