Implementing a crawler with gevent, compared with plain multithreading
1. Purpose and approach of the crawler;
2. implementing the crawler with gevent;
3. implementing the crawler with multiple threads;
4. comparison.
Crawler purpose and design
1. Crawl related links starting from a single home page;
2. for example, the Maizi Education main site;
3. http://www.maiziedu.com;
4. crawl pages from this site and extract valid, de-duplicated URLs (excluding resource addresses such as images and videos);
5. to keep the crawl bounded, set a maximum crawl depth and a maximum total count, and cap the concurrency as appropriate (a minimal sketch of this control flow follows the list).
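To make these termination rules concrete, here is a minimal single-threaded sketch of the control flow. fetch_links is a hypothetical stand-in for "download a page and extract its links" (the full code later uses HtmlAnalyzer for that step); the hash set is what keeps URLs unique, and max_depth/max_count bound the crawl.

def fetch_links(url):
    # Hypothetical stand-in: download url and return the links found on it.
    return []

def crawl(root_url, max_depth=5, max_count=100):
    seen = set([hash(root_url)])        # de-duplication by URL hash
    pending = [(root_url, 0)]           # (url, depth) still to be handled
    handled = 0
    while pending and handled < max_count:   # total-count limit
        url, depth = pending.pop(0)
        handled += 1
        if depth + 1 > max_depth:             # depth limit
            continue
        for link in fetch_links(url):
            if hash(link) not in seen:
                seen.add(hash(link))
                pending.append((link, depth + 1))

crawl("http://www.maiziedu.com")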
Implementing the crawler with gevent
Each concrete crawl operation is executed in a greenlet, and a gevent pool is used to control the concurrency.
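Before the full listings, a minimal sketch of that pattern, again assuming a hypothetical fetch_links helper (the real code below uses requests plus HtmlAnalyzer for this step): the pool caps how many greenlets run at once, and each spawned greenlet crawls one URL.

import gevent
from gevent import monkey, pool, queue
monkey.patch_all()                 # make socket/requests cooperative

import requests

def fetch_links(url):
    # Hypothetical helper: download the page; link extraction omitted here.
    resp = requests.get(url, timeout=10)
    return []

tasks = queue.Queue()
tasks.put("http://www.maiziedu.com")
workers = pool.Pool(5)             # at most 5 greenlets crawl concurrently

def handle(url):
    for link in fetch_links(url):
        tasks.put(link)

while not tasks.empty():
    workers.spawn(handle, tasks.get())
workers.join()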
The full code follows. The helper module below is saved as utils.py (both spiders import HtmlAnalyzer from it) and handles charset detection, link extraction and URL filtering:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import urllib
import urlparse
from pyquery import PyQuery


class HtmlAnalyzer(object):

    @staticmethod
    def detectCharSet(html):
        pq = PyQuery(html)
        metas = pq('head')('meta')
        for meta in metas:
            for key in meta.keys():
                if key == "charset":
                    charset = meta.get('charset')
                    return charset
                elif key == "content":
                    try:
                        content = meta.get('content')
                        p = re.match(r".+charset=(.*)\W*", content)
                        charset = p.group(1)
                        return charset
                    except:
                        continue

    @staticmethod
    def extractLinks(html, baseurl, charset):

        def _extract(url, attr):
            link = url.attrib[attr]
            if link is None:
                raise ValueError("empty link attribute")
            link = link.strip("/ ").strip('\\"')
            link = urlparse.urljoin(baseurl, link)
            link = urlparse.urldefrag(link)[0]
            try:
                link = urllib.quote(link, ':?=+&#/@')
            except (UnicodeDecodeError, KeyError):
                try:
                    link = urllib.quote(link.encode(charset), ':?=+&#/@')
                except:
                    pass
            return link

        def _isValidLink(url):
            try:
                return all([UrlFilter.checkScheme(url),
                            UrlFilter.checkInvalidChar(url),
                            UrlFilter.checkInvalidExtention(url)])
            except:
                return False

        pq = PyQuery(html)
        allLinks = []
        for url in pq('a'):
            try:
                link = _extract(url, 'href')
            except:
                continue
            if _isValidLink(link):
                allLinks.append(link)
        for url in pq('form'):
            try:
                link = _extract(url, 'action')
            except:
                continue
            if _isValidLink(link):
                allLinks.append(link)
        return allLinks


class UrlFilter(object):
    invalid_chars = {'\'': None,
                     '\"': None,
                     '\\': None,
                     ' ': None,
                     '\n': None,
                     '\r': None,
                     '+': None
                     }
    invalid_extention = {
        'jpg': None,
        'gif': None,
        'bmp': None,
        'jpeg': None,
        'png': None,
        'swf': None,
        'mp3': None,
        'wma': None,
        'wmv': None,
        'wav': None,
        'mid': None,
        'ape': None,
        'mpg': None,
        'mpeg': None,
        'rm': None,
        'rmvb': None,
        'avi': None,
        'mkv': None,
        'zip': None,
        'rar': None,
        'gz': None,
        'iso': None,
        'jar': None,
        'doc': None,
        'docx': None,
        'ppt': None,
        'pptx': None,
        'chm': None,
        'pdf': None,
        'exe': None,
        'msi': None,
    }

    @staticmethod
    def checkScheme(url):
        scheme, netloc, path, pm, q, f = urlparse.urlparse(url)
        return scheme in ('http', 'https')

    @classmethod
    def checkInvalidChar(cls, url):
        exist_invalid_char = False
        for c in url:
            if c in cls.invalid_chars:
                exist_invalid_char = True
                break
        return (not exist_invalid_char)

    @classmethod
    def checkInvalidExtention(cls, url):
        dotpos = url.rfind('.') + 1
        typestr = url[dotpos:].lower()
        return (typestr not in cls.invalid_extention)
Geventspider.py
import gevent
from gevent import (
    monkey,
    queue,
    pool
)
import requests
from threading import Timer
from utils import HtmlAnalyzer


class Strategy(object):
    def __init__(self, max_depth, max_count, concurrency=5):
        self.max_depth = max_depth
        self.max_count = max_count
        self.concurrency = concurrency
        self.timeout = 60
        self.time = 12 * 60


class UrlObject(object):
    def __init__(self, url, depth):
        self.url = url.strip('/')
        self.depth = depth


class GeventSpider(object):
    def __init__(self, max_depth, max_count, root_url):
        monkey.patch_all()
        self.strategy = Strategy(max_depth=max_depth, max_count=max_count)
        self.queue = queue.Queue()
        self.pool = pool.Pool(self.strategy.concurrency)
        self.url_set = set()
        obj = UrlObject(root_url, 0)
        self.put(obj)
        self.handle_num = 0

    def put(self, obj):
        hash_val = hash(obj.url)
        if hash_val not in self.url_set:
            self.url_set.add(hash_val)
            self.queue.put(obj)

    def stop(self):
        self.timer.cancel()
        self.pool.join()

    def _run_loop(self):
        while self.timer.isAlive():
            for greenlet in list(self.pool):
                if greenlet.dead:
                    self.pool.discard(greenlet)
            try:
                urlobj = self.queue.get(timeout=1)  # time out so the timer is re-checked
            except queue.Empty:
                continue
            handler = Handler(urlobj, self)
            self.pool.start(handler)
            self.handle_num = self.handle_num + 1
            if self.strategy.max_count <= self.handle_num:
                print 'handle_num %d is full so stop' % self.handle_num
                self.stop()

    def run(self):
        self.timer = Timer(self.strategy.time, self.stop)
        self.timer.start()
        self._run_loop()


class Handler(gevent.Greenlet):
    def __init__(self, urlobj, spider):
        print 'begin greenlet with url ', urlobj.url
        gevent.Greenlet.__init__(self)
        self.urlobj = urlobj
        self.spider = spider
        self.charset = "utf-8"

    def open(self, url):
        strategy = self.spider.strategy
        try:
            resp = requests.get(url, timeout=strategy.timeout)
        except:
            return
        if resp.status_code != requests.codes.ok:
            return
        charset = HtmlAnalyzer.detectCharSet(resp.text)
        if charset is not None:
            self.charset = charset
            resp.encoding = charset
        return resp.text

    def _run(self):
        try:
            html = self.open(self.urlobj.url)
        except Exception:
            return
        if not html:
            return
        depth = self.urlobj.depth + 1
        if depth > self.spider.strategy.max_depth:
            return
        for link in self.feed(html):
            if hash(link) in self.spider.url_set:
                continue
            url = UrlObject(link, depth)
            self.spider.put(url)

    def feed(self, html):
        return HtmlAnalyzer.extractLinks(html, self.urlobj.url, self.charset)


class MySpider(object):
    def __init__(self, max_depth, max_count, root_url):
        self.spider = GeventSpider(max_depth=max_depth,
                                   max_count=max_count,
                                   root_url=root_url)

    def run(self):
        self.spider.run()


test = MySpider(max_depth=5, max_count=100, root_url="http://www.maiziedu.com")
test.run()
Threadspider.py
import requests
from threading import Timer, Lock, Thread
from utils import HtmlAnalyzer
from Queue import Queue
import time


class Strategy(object):
    def __init__(self, max_depth, max_count, concurrency=5):
        self.max_depth = max_depth
        self.max_count = max_count
        self.concurrency = concurrency
        self.timeout = 60
        self.time = 12 * 60


class UrlObject(object):
    def __init__(self, url, depth):
        self.url = url.strip('/')
        self.depth = depth


class ThreadSpider(object):
    def __init__(self, max_depth, max_count, root_url):
        self.strategy = Strategy(max_depth, max_count)
        self.queue = Queue()
        self.url_set = set()
        self.handler_num = 0
        self.lock = Lock()
        self.thread_lock = Lock()
        self.thread_pool = {}
        self.thread_id = 0
        self.is_stop = False
        self.thread_num = 0
        self.currency_limit = False
        self.last_data = None
        obj = UrlObject(root_url, 0)
        self.put(obj)

    def put(self, obj):
        hash_val = hash(obj.url)
        self.lock.acquire()
        if hash_val in self.url_set:
            self.lock.release()
            return
        self.url_set.add(hash_val)
        self.lock.release()
        self.queue.put(obj)

    def _run_loop(self):
        while True:
            if self.is_stop:
                break
            if self.currency_limit:
                time.sleep(1)
                self.thread_lock.acquire()
                self.thread_num = len(self.thread_pool)
                if self.thread_num == self.strategy.concurrency:
                    self.thread_lock.release()
                    continue
                else:
                    self.currency_limit = False
                    self.thread_lock.release()
            else:
                try:
                    url = self.queue.get(timeout=1)
                except:
                    continue
                self.thread_id = self.thread_id + 1
                thd = Handler(url, self, self.thread_id)
                self.thread_lock.acquire()
                self.thread_pool[self.thread_id] = thd
                if len(self.thread_pool) == self.strategy.concurrency:
                    self.currency_limit = True
                self.thread_lock.release()
                self.thread_num = self.thread_num + 1
                print "add thread ", self.thread_id
                thd.start()
                self.handler_num = self.handler_num + 1
                if self.strategy.max_count <= self.handler_num:
                    print "handler num %d is full so stop " % self.handler_num
                    self.is_stop = True

    def remove_thread(self, thd_id):
        self.thread_lock.acquire()
        if thd_id in self.thread_pool:
            del self.thread_pool[thd_id]
            print "del threadid ", thd_id
        self.thread_lock.release()

    def run(self):
        self._run_loop()


class Handler(Thread):
    def __init__(self, urlobj, spider, thd_id):
        Thread.__init__(self)
        print "begin thread %d with url %s" % (thd_id, urlobj.url)
        self.urlobj = urlobj
        self.spider = spider
        self.thread_id = thd_id
        self.charset = "utf-8"

    def run(self):
        try:
            html = self.open(self.urlobj.url)
            if not html:
                return
            depth = self.urlobj.depth + 1
            if depth > self.spider.strategy.max_depth:
                return
            for link in self.feed(html):
                if hash(link) in self.spider.url_set:
                    continue
                url = UrlObject(link, depth)
                self.spider.put(url)
        except Exception:
            pass
        finally:
            # always free the slot in thread_pool, otherwise the concurrency
            # limit would never be lifted after a failed download
            self.spider.remove_thread(self.thread_id)

    def open(self, url):
        strategy = self.spider.strategy
        try:
            resp = requests.get(url, timeout=strategy.timeout)
        except:
            return
        if resp.status_code != requests.codes.ok:
            return
        charset = HtmlAnalyzer.detectCharSet(resp.text)
        if charset is not None:
            self.charset = charset
            resp.encoding = charset
        return resp.text

    def feed(self, html):
        return HtmlAnalyzer.extractLinks(html, self.urlobj.url, self.charset)


class MySpider(object):
    def __init__(self, max_depth, max_count, root_url):
        self.spider = ThreadSpider(max_depth=max_depth,
                                   max_count=max_count,
                                   root_url=root_url)

    def run(self):
        self.spider.run()


test = MySpider(max_depth=5, max_count=100, root_url="http://www.maiziedu.com")
test.run()
Implementing the crawler with threads
1. Each crawl action runs inside its own thread;
2. take care of thread safety when operating on shared global data (see the sketch below).
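The key point is item 2: the set of already-seen URLs is shared by all worker threads, so the check-and-add must be atomic. A minimal sketch of that idea with a Lock (try_claim is a name introduced here for illustration, not part of the code above):

from threading import Lock, Thread

seen = set()
seen_lock = Lock()

def try_claim(url):
    # Atomic check-and-add: two threads can never claim the same URL.
    with seen_lock:
        if hash(url) in seen:
            return False
        seen.add(hash(url))
        return True

threads = [Thread(target=try_claim, args=("http://www.maiziedu.com",))
           for _ in range(5)]
for t in threads:
    t.start()
for t in threads:
    t.join()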
Comparison
1. Working with threads is complicated: thread-safety issues are tedious, error-prone and hard to master, and deadlocks can appear that are very difficult to debug;
2. the gevent approach is comparatively simple, and both controlling concurrency and debugging are easier.
Original article: http://www.maiziedu.com/wiki/frame/creeper/