#!/usr/bin/env python3
# -*- coding: UTF-8 -*-
from bs4 import BeautifulSoup
import requests
import requests.exceptions
from urllib.parse import urlsplit
from collections import deque
import re
# a queue of URLs still to be crawled
new_urls = deque(['https://www.baidu.com/'])
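# deque gives O(1) appends on the right and pops on the left, so the crawl
# proceeds in breadth-first (FIFO) order.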
# a set of URLs we have already crawled
processed_urls = set()
# the set of e-mail addresses harvested so far
emails = set()
# process URLs one by one until the queue is exhausted
while new_urls:
    # move the next URL from the queue into the set of processed URLs
    url = new_urls.popleft()
    processed_urls.add(url)
    # extract the base URL so relative links can be resolved
    parts = urlsplit(url)
    base_url = "{0.scheme}://{0.netloc}".format(parts)
    path = url[:url.rfind('/') + 1] if '/' in parts.path else url
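    # For a hypothetical URL like "https://example.com/a/b.html", path becomes
    # "https://example.com/a/", so a bare link such as "c.html" found on that
    # page resolves to "https://example.com/a/c.html" further below.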
    # fetch the URL's content
    print("Processing %s" % url)
    try:
        response = requests.get(url)
    except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError):
        # ignore pages with errors and move on to the next URL
        continue
    # extract all e-mail addresses and add them to the result set
    new_emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", response.text, re.I))
    emails.update(new_emails)
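    # The pattern above is deliberately loose: one or more characters from
    # letters/digits/._-+ on each side of the @, then a dot and an alphabetic
    # TLD; re.I makes it case-insensitive. It catches most addresses in page
    # text but is not a strict RFC 5322 validator.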
    # create a Beautiful Soup for the HTML document (an explicit parser
    # avoids bs4's "no parser was explicitly specified" warning)
    soup = BeautifulSoup(response.text, "html.parser")
    # find and process all anchors in the document
    for anchor in soup.find_all("a"):
        # extract the link URL from the anchor
        link = anchor.attrs["href"] if "href" in anchor.attrs else ''
        # resolve relative links
        if link.startswith('/'):
            link = base_url + link
        elif not link.startswith('http'):
            link = path + link
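        # Note: urllib.parse.urljoin(url, link) would handle these cases
        # (including edge cases like "../" paths) in a single call; the
        # manual resolution above mirrors the original script.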
        # add the new URL to the queue if it is neither queued nor already processed
        if link not in new_urls and link not in processed_urls:
            new_urls.append(link)
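# A possible wrap-up, not part of the original script: the loop above only
# ends once the queue empties, so in practice you interrupt it with Ctrl+C
# and then dump the harvested addresses, e.g.:
#     for email in sorted(emails):
#         print(email)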