"""网络爬虫爬邮箱"""
from bs4 import BeautifulSoup
import requests
import requests.exceptions
from urllib.parse import urlsplit
from collections import deque
import re
import os
import csv


class EmailCrawler:
    """Email crawler."""
    # Regular expression used to match email addresses
    __email_addr_pattern = r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+"
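
    # Illustrative check of the pattern above (not part of the crawler):
    # re.findall() with re.I returns the full matches, e.g.
    #   re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+",
    #              "contact: a.b+c@mail.example.org", re.I)
    #   -> ['a.b+c@mail.example.org']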

    def crawl(self, urls, output: str = ''):
        """Crawl for email addresses.

        Args: urls - a deque/list of URLs, a file path (.txt/.csv), or a
        comma-separated string; output - export filename.
        """
        new_urls = deque()      # queue of URLs to visit
        processed_urls = set()  # URLs already crawled
        emails = set()          # email addresses found
        if isinstance(urls, deque):
            new_urls = urls
        elif isinstance(urls, list):
            new_urls = deque(urls)
        elif isinstance(urls, str):
            if os.path.exists(urls):
                data = self.__readCSVData(urls)
            else:
                data = urls.split(',')
            new_urls = deque(data)
        else:
            print("Unsupported argument!")
            return emails
        # Start crawling
        # Iterate over URLs until the queue is empty
        while new_urls:
            # Pop a URL from the head of the queue
            url = new_urls.popleft()
            processed_urls.add(url)
            # Extract the base URL and path to resolve relative links
            parts = urlsplit(url)
            base_url = "{0.scheme}://{0.netloc}".format(parts)
            path = url[:url.rfind('/') + 1] if '/' in parts.path else url
            # Fetch the page content
            print("Processing %s" % url)
            try:
                # timeout keeps a slow host from hanging the whole crawl
                response = requests.get(url, timeout=10)
            except requests.exceptions.RequestException:
                # Ignore pages with errors (missing scheme, connection
                # failure, timeout, ...)
                continue
            # Extract all emails on the page and add them to the result set
            new_emails = set(re.findall(self.__email_addr_pattern, response.text, re.I))
            if new_emails:
                emails.update(new_emails)
                print(new_emails)
            # Build a BeautifulSoup document for the page
            soup = BeautifulSoup(response.text, features="lxml")
            # Find and process all anchors in the document
            for anchor in soup.find_all('a'):
                # Extract the link from the anchor
                link = anchor.attrs['href'] if 'href' in anchor.attrs else ''
                # Resolve internal (relative) links
                if link.startswith('/'):
                    link = base_url + link
                elif not link.startswith('http'):
                    link = path + link
                # Enqueue links that have not been seen yet
                if link not in new_urls and link not in processed_urls:
                    new_urls.append(link)
        if output:
            self.__writeCSVData(emails, output)
        return emails

    def __readCSVData(self, filename):
        """Read URLs from a file, one per row (first column)."""
        data = []
        with open(filename, 'r') as f:
            f_csv = csv.reader(f)
            for row in f_csv:
                if row:  # skip blank lines
                    data.append(row[0])
        return data

    def __writeCSVData(self, data, filename):
        """Write the collected addresses to a CSV file."""
        with open(filename, 'w', newline='') as f:
            f_csv = csv.writer(f)
            # Wrap each address in a list: writerows() treats each element as
            # a row, so a bare string would be split into single characters
            f_csv.writerows([email] for email in data)


if __name__ == '__main__':
    # urls = 'http://www.themoscowtimes.com'
    # urls = ['http://www.themoscowtimes.com']
    urls = 'urls.txt'
    output = 'email.csv'
    emailCrawl = EmailCrawler()
    emailCrawl.crawl(urls, output)
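
# Usage sketch (illustrative, not part of the original script): crawl() also
# accepts an in-memory list or deque of seed URLs, per its docstring;
# 'https://example.com' below is a placeholder domain.
#
#   crawler = EmailCrawler()
#   found = crawler.crawl(['https://example.com'], output='found_emails.csv')
#   print(sorted(found))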