import urlparse
import urllib2
from bs4 import BeautifulSoup
import re
import sys
import signal
from collections import deque

# Simple same-site crawler that collects e-mail addresses.
#
# Usage: script [start_url] [threshold]
#   start_url  page to start from (default: DEFAULT_URL); "http://" is
#              prepended when the value does not already start with "http".
#   threshold  stop once at least this many addresses were found
#              (0, the default, means "no limit").
#
# NOTE: the networking code relies on the Python 2 modules `urllib2` and
# `urlparse` and on `BeautifulSoup`, all imported at the top of the file.

DEFAULT_URL = 'www.xxxx.com'
OUTPUT_PATH = "out.txt"

# A link is followed only if it contains ".htm" or ".asp" (which also covers
# ".html" and ".aspx").  The dot is escaped so that paths such as "/xhtml/"
# or "/grasp/" are no longer mistaken for page links.
PAGE_LINK_RE = re.compile(r'\.(?:htm|asp)')

# Matches a text node that consists solely of one e-mail address.
EMAIL_RE = re.compile(r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$')


def parse_args(argv):
    """Return ``(start_url, threshold)`` parsed from an argv-style list.

    ``argv[1]`` (optional) is the start URL and ``argv[2]`` (optional) the
    address threshold.  Blank arguments are treated as missing.

    Raises:
        ValueError: if the threshold argument is not an integer.
    """
    start_url = DEFAULT_URL
    if len(argv) > 1 and argv[1].strip():
        start_url = argv[1].strip()
    if not start_url.startswith('http'):
        start_url = 'http://' + start_url

    threshold = 0
    if len(argv) > 2 and argv[2].strip():
        threshold = int(argv[2])
    return start_url, threshold


def is_page_link(href):
    """Return True if *href* looks like a link to an .htm/.asp style page."""
    return PAGE_LINK_RE.search(href) is not None


def _in_scope_links(soup, page_url, start_url):
    """Yield absolute page links found in *soup* that contain *start_url*."""
    for tag in soup.findAll('a', href=True):
        href = tag['href']
        if not is_page_link(href):
            continue
        # Relative links must be resolved against the page they appear on,
        # not against the crawl's start URL.
        link = urlparse.urljoin(page_url, href)
        if start_url in link:
            yield link


def crawl(start_url):
    """Crawl breadth-first from *start_url*, yielding new addresses per page.

    For every page that could be fetched and parsed, yields a list with the
    e-mail addresses found on that page that were not seen on earlier pages
    (the list may be empty).  Pages that cannot be fetched or parsed are
    reported on stderr and skipped.
    """
    pending = deque([start_url])
    visited = {start_url}
    seen_addresses = set()

    while pending:
        page_url = pending.popleft()
        try:
            html = urllib2.urlopen(page_url).read()
            soup = BeautifulSoup(html)
        except Exception as err:
            # Best effort: one broken page must not stop the crawl.  Catching
            # Exception (not everything) keeps CTRL + C working, and skipping
            # the page avoids re-processing a stale `soup` from the previous
            # iteration.
            sys.stderr.write('Skipping %s: %s\n' % (page_url, err))
            continue

        for link in _in_scope_links(soup, page_url, start_url):
            if link not in visited:
                visited.add(link)
                pending.append(link)

        new_addresses = []
        body = soup.body  # None when the document has no <body> element
        if body is not None:
            for node in body.findAll(text=EMAIL_RE):
                # `$` also matches just before a trailing newline, so
                # normalise the text before de-duplicating.
                address = node.strip()
                if address not in seen_addresses:
                    seen_addresses.add(address)
                    new_addresses.append(address)
        yield new_addresses


def main():
    """Run the crawler using ``sys.argv`` and write results to OUTPUT_PATH."""
    start_url, threshold = parse_args(sys.argv)

    print('Searching from ' + start_url + ' ... ')
    print('ATTENTION: This would take LONG time to scan EVERYTHING!')
    print(' Hit CTRL + C to stop me!')

    total = 0
    with open(OUTPUT_PATH, "w") as out:
        try:
            for addresses in crawl(start_url):
                for address in addresses:
                    print(address)
                    out.write(address + '\n')
                total += len(addresses)
                # The threshold is checked once per page, so the final count
                # may exceed it by the remaining addresses of the last page.
                if threshold != 0 and total >= threshold:
                    break
        except KeyboardInterrupt:
            # CTRL + C is the advertised way to stop: fall through so the
            # output file is closed and the summary is still printed.
            pass

    # Two spaces after the colon keep the original summary format.
    print('Total mail addresses :  %d' % total)


if __name__ == "__main__":
    main()
# A simple Python web crawler program.
# (Source page note: latest recommended article published 2024-02-18 13:16:32.)