import re
import urllib.request
def getlink(url):
    """Fetch *url* and return the unique link matches found in its body.

    The request is sent with a desktop-browser ``User-Agent`` header. The
    opener carrying that header is installed globally, so subsequent
    ``urllib.request.urlopen`` calls in this process use it as well.

    Args:
        url: Address of the page to scan (any scheme ``urllib`` can open).

    Returns:
        A list of unique ``(link, last_char)`` tuples in order of first
        appearance. ``link`` is the full matched URL; ``last_char`` is the
        last character captured by the pattern's trailing group (``""``
        when nothing follows the final dot of the match).

    Raises:
        urllib.error.URLError: If the page cannot be retrieved.
    """
    # Pretend to be a regular browser.
    headers = ("User-Agent",
               "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    # Install the opener globally (kept for backward compatibility).
    urllib.request.install_opener(opener)

    # Close the response deterministically instead of leaking the socket.
    with urllib.request.urlopen(url) as response:
        raw = response.read()
        charset = response.headers.get_content_charset() or "utf-8"

    # Decode the payload properly: str(bytes) would produce the "b'...'"
    # repr, turning newlines into literal backslash sequences and mangling
    # non-ASCII text, which corrupts the matches.
    try:
        text = raw.decode(charset, errors="replace")
    except LookupError:
        # The server announced a codec Python does not know.
        text = raw.decode("utf-8", errors="replace")

    # Link expression; a raw string avoids invalid-escape warnings.
    pattern = r'(https?://[^\s)";]+\.(\w|/)*)'
    matches = re.findall(pattern, text)

    # Drop duplicates while keeping first-seen order (deterministic output).
    return list(dict.fromkeys(matches))
# Address of the page to crawl.
url = "http://blog.csdn.net/"

# Collect the link matches contained in that page, then print each one:
# the full matched URL followed by the pattern's trailing-group capture.
linklist = getlink(url)
for link in linklist:
    full_match, trailing_capture = link
    print(full_match, '\t', trailing_capture)
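# 链接爬虫实战 (Link crawler in practice)
# 最新推荐文章于 2024-08-15 17:38:46 发布 (site footer: "latest recommended article published 2024-08-15 17:38:46")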