# coding:utf8
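# A small breadth-first domain crawler: starting from the seed URLs it
# downloads each page, extracts the root domains of external links, appends
# every newly seen domain to a timestamped output file, and then crawls
# those domains in turn.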
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import datetime
import sys
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',}
timeout = 5
# Suppress the InsecureRequestWarning triggered by requests made with verify=False.
requests.packages.urllib3.disable_warnings()
class UrlManager:
    # Tracks URLs still to crawl (new_urls) and URLs already crawled (old_urls).
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def add_new_url(self, urls):
        # Expects an iterable of URLs; every previously unseen URL is queued
        # and appended to the output file ('datefile' is the module-level
        # path created in the __main__ block).
        if urls is None:
            return None
        for u in urls:
            if u not in self.new_urls and u not in self.old_urls:
                with open(datefile, 'a') as f:
                    f.write(u + '\n')
                self.new_urls.add(u)

    def add_new_urls(self, urls):
        if not urls:
            return None
        self.add_new_url(urls)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        # Pop an unvisited URL, mark it as visited, and prepend a scheme if missing.
        new_url = self.new_urls.pop()
        http_url = new_url if '://' in new_url else 'http://' + new_url
        self.old_urls.add(new_url)
        return http_url
class HtmlDownLoader:
    def download(self, url):
        if url is None:
            return None
        r = requests.get(url=url, headers=headers, timeout=timeout,
                         allow_redirects=True, stream=True, verify=False)
        if r.status_code != 200:
            return None
        return r.text
class HtmlParser:
    def __init__(self):
        self.foreign_urls = set()

    def _get_root_domain(self, url):
        # Return the netloc (host[:port]) portion of a URL, or None if there is none.
        if url is None:
            return None
        url_info = urlparse(url)
        root_domain = url_info.netloc
        if root_domain:
            return root_domain

    def _get_new_urls(self, soup, current_url):
        # Collect the root domains of all links pointing outside the current domain.
        new_urls = set()
        links = soup.find_all("a")
        for link in links:
            new_url = link.get('href')
            if new_url is not None:
                new_url = new_url.strip()
                new_url_root_domain = self._get_root_domain(new_url)
                if new_url_root_domain != self._get_root_domain(current_url):
                    if new_url_root_domain is not None:
                        new_urls.add(new_url_root_domain)
        return new_urls

    def parse(self, html_content, current_url):
        if html_content is None:
            return
        soup = BeautifulSoup(html_content, "html.parser")
        new_urls = self._get_new_urls(soup, current_url)
        return new_urls

    def get_foreign_urls(self):
        return self.foreign_urls
class SpiderMain:
    def __init__(self):
        self.urls = UrlManager()
        self.html_downloader = HtmlDownLoader()
        self.parser = HtmlParser()

    def craw(self, root_url):
        self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            new_url = self.urls.get_new_url()
            try:
                html_content = self.html_downloader.download(new_url)
                new_urls = self.parser.parse(html_content, new_url)
                self.urls.add_new_urls(new_urls)
                # TODO
                for i in new_urls:
                    print("-- %s" % i)
            except KeyboardInterrupt:
                print('[-] User exit!')
                sys.exit(1)
            except Exception:
                # A failed download leaves new_urls as None, so the loop above
                # raises and the URL is reported here as unreachable.
                print("++ %s" % new_url)
if __name__ == "__main__":
    # Create the timestamped output file that collected domains are appended to.
    datefile = datetime.datetime.now().strftime('%Y%m%d_%H-%M-%S.txt')
    with open(datefile, 'w'):
        pass
    root_url = set()
    root_url.add("www.hao123.com")
    obj_spider = SpiderMain()
    obj_spider.craw(root_url)