import re
import urllib.request
def grab(rule, url, encoding='gbk'):
    """Collect the distinct matches of ``rule`` in the resource at ``url``.

    The response is read line by line; each line is decoded and passed to
    ``rule.findall``, and all results are merged into one set.

    :param rule: compiled regular expression (anything with ``findall``)
    :param url: address of the resource to fetch
    :param encoding: codec used to decode each line; defaults to ``'gbk'``
    :return: set of everything ``rule.findall`` returned across all lines
    :raises urllib.error.URLError: propagated from ``urlopen`` when the
        resource cannot be opened
    """
    matches = set()
    # The context manager closes the response even if matching raises,
    # so the underlying connection is never leaked.
    with urllib.request.urlopen(url) as response:
        for raw_line in response:
            # errors='replace' stops one undecodable byte from aborting the
            # whole scrape; the replacement character cannot be part of a
            # token unless the rule explicitly allows it.
            text = raw_line.decode(encoding, errors='replace')
            matches.update(rule.findall(text))
    return matches
# Case-insensitive pattern that captures e-mail-like tokens:
# local part, '@', domain, then a 2-4 letter suffix.
mail_rule = re.compile(
    r'([A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4})',
    flags=re.IGNORECASE,
)

# Page that will be fetched and scanned.
target_url = 'http://zhidao.baidu.com/question/754696990784600804.html'

# Fetch the page, gather the distinct matches, and show them.
target_set = grab(mail_rule, target_url)
print(target_set)
# Sample output: {'cnoom@126.com', '543793688@qq.com', 'www.634827374@qq.com'}