Crawlers with Gevent and with Threads, Explained

Implementing a crawler with gevent and comparing it with plain multithreading:

1. Purpose and approach of the crawler;

2. Implementing the crawler with gevent;

3. Implementing the crawler with threads;

4. Comparison.

Crawler purpose and design

1. Starting from a home page, crawl the links it references; as the example site we use the Maizi Education home page, http://www.maiziedu.com;

2. From that site, fetch pages and extract valid, de-duplicated URLs, excluding resource addresses such as images and videos;

3. To make sure the crawl terminates, limit the crawl depth and the total number of pages handled, and cap the concurrency (a rough sketch of this policy follows the list).
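Before looking at the gevent and thread versions, here is a rough single-threaded sketch of the crawl policy just described. It is illustrative only and not the article's code; fetch_links is a placeholder for "download the page and extract its links".

from collections import deque

def crawl(root_url, max_depth, max_count, fetch_links):
    # breadth-first crawl bounded by depth and by total pages handled
    seen = set([root_url])
    pending = deque([(root_url, 0)])
    handled = 0
    while pending and handled < max_count:
        url, depth = pending.popleft()
        handled = handled + 1
        if depth >= max_depth:
            continue
        for link in fetch_links(url):
            if link not in seen:        # keep URLs de-duplicated
                seen.add(link)
                pending.append((link, depth + 1))
    return seen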

Implementing the crawler with gevent

The crawl of each page runs inside a greenlet, and a gevent pool caps the concurrency.
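A minimal sketch of that pattern (an assumed example, not the article's code; the fetch function and the URL list are placeholders):

import gevent
from gevent import monkey, pool
monkey.patch_all()           # make blocking network I/O cooperative

import requests

def fetch(url):
    # each call runs inside its own greenlet
    resp = requests.get(url, timeout=10)
    print url, resp.status_code

p = pool.Pool(3)             # at most 3 fetches run at the same time
urls = ["http://www.maiziedu.com"] * 5
jobs = [p.spawn(fetch, u) for u in urls]
gevent.joinall(jobs)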

Code example. First the shared helpers (utils.py), which both spider scripts below import:

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re
import urllib
import urlparse

from pyquery import PyQuery


class HtmlAnalyzer(object):
    # Helpers for parsing a fetched page: charset detection and link extraction.

    @staticmethod
    def detectCharSet(html):
        pq = PyQuery(html)
        metas = pq('head')('meta')
        for meta in metas:
            for key in meta.keys():
                if key == "charset":
                    charset = meta.get('charset')
                    return charset
                elif key == "content":
                    try:
                        content = meta.get('content')
                        p = re.match(r".+charset=(.*)\W*", content)
                        charset = p.group(1)
                        return charset
                    except:
                        continue

    @staticmethod
    def extractLinks(html, baseurl, charset):

        def _extract(url, attr):
            link = url.attrib[attr]
            link = link.strip("/ ").strip('\\"')
            if link is None:
                raise
            link = urlparse.urljoin(baseurl, link)
            link = urlparse.urldefrag(link)[0]
            try:
                link = urllib.quote(link, ':?=+&#/@')
            except (UnicodeDecodeError, KeyError):
                try:
                    link = urllib.quote(link.encode(charset), ':?=+&#/@')
                except:
                    pass
            return link

        def _isValidLink(url):
            try:
                return all([UrlFilter.checkScheme(url),
                            UrlFilter.checkInvalidChar(url),
                            UrlFilter.checkInvalidExtention(url)])
            except:
                return False

        pq = PyQuery(html)
        allLinks = []
        for url in pq('a'):
            try:
                link = _extract(url, 'href')
            except:
                continue
            if _isValidLink(link):
                allLinks.append(link)
        for url in pq('form'):
            try:
                link = _extract(url, 'action')
            except:
                continue
            if _isValidLink(link):
                allLinks.append(link)
        return allLinks


class UrlFilter(object):
    # Keeps only http/https page URLs that contain no odd characters and do
    # not point at static resources (images, media, archives, documents, ...).

    invalid_chars = {'\'': None, '\"': None, '\\': None, ' ': None,
                     '\n': None, '\r': None, '+': None}

    invalid_extention = {
        'jpg': None, 'gif': None, 'bmp': None, 'jpeg': None, 'png': None,
        'swf': None, 'mp3': None, 'wma': None, 'wmv': None, 'wav': None,
        'mid': None, 'ape': None, 'mpg': None, 'mpeg': None, 'rm': None,
        'rmvb': None, 'avi': None, 'mkv': None, 'zip': None, 'rar': None,
        'gz': None, 'iso': None, 'jar': None, 'doc': None, 'docx': None,
        'ppt': None, 'pptx': None, 'chm': None, 'pdf': None, 'exe': None,
        'msi': None,
    }

    @staticmethod
    def checkScheme(url):
        scheme, netloc, path, pm, q, f = urlparse.urlparse(url)
        return scheme in ('http', 'https')

    @classmethod
    def checkInvalidChar(cls, url):
        exist_invalid_char = False
        for c in url:
            if c in cls.invalid_chars:
                exist_invalid_char = True
                break
        return (not exist_invalid_char)

    @classmethod
    def checkInvalidExtention(cls, url):
        dotpos = url.rfind('.') + 1
        typestr = url[dotpos:].lower()
        return (typestr not in cls.invalid_extention)
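A quick way to see what these helpers do (an assumed usage example, not from the article): feed extractLinks a small HTML snippet and note that the image link is dropped by UrlFilter.

# Assumed usage example for HtmlAnalyzer / UrlFilter.
# Requires pyquery, just like utils.py itself.
from utils import HtmlAnalyzer

html = ('<html><head><meta charset="utf-8"></head>'
        '<body><a href="/course/">courses</a>'
        '<a href="/static/logo.png">logo</a></body></html>')

print HtmlAnalyzer.detectCharSet(html)
# expected: utf-8
print HtmlAnalyzer.extractLinks(html, "http://www.maiziedu.com", "utf-8")
# expected: only the /course link survives; the .png link is rejected
# by UrlFilter.checkInvalidExtention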

Geventspider.py (the gevent version):

import gevent
from gevent import (
    monkey,
    queue,
    pool
)
import requests
from threading import Timer

from utils import HtmlAnalyzer


class Strategy(object):
    # Tunables for one crawl: depth/count limits, concurrency, timeouts.

    def __init__(self, max_depth, max_count, concurrency=5):
        self.max_depth = max_depth
        self.max_count = max_count
        self.concurrency = concurrency
        self.timeout = 60        # per-request timeout (seconds)
        self.time = 12 * 60      # overall crawl time limit (seconds)


class UrlObject(object):

    def __init__(self, url, depth):
        self.url = url.strip('/')
        self.depth = depth


class GeventSpider(object):

    def __init__(self, max_depth, max_count, root_url):
        monkey.patch_all()
        self.strategy = Strategy(max_depth=max_depth, max_count=max_count)
        self.queue = queue.Queue()
        self.pool = pool.Pool(self.strategy.concurrency)
        self.url_set = set()
        obj = UrlObject(root_url, 0)
        self.put(obj)
        self.handle_num = 0

    def put(self, obj):
        hash_val = hash(obj.url)
        if hash_val not in self.url_set:
            self.url_set.add(hash_val)
            self.queue.put(obj)

    def stop(self):
        self.timer.cancel()
        self.pool.join()

    def _run_loop(self):
        while self.timer.isAlive():
            # drop finished greenlets so the pool frees their slots
            for greenlet in list(self.pool):
                if greenlet.dead:
                    self.pool.discard(greenlet)
            try:
                url = self.queue.get(timeout=1)
            except queue.Empty:
                continue
            greenlet = Handler(url, self)
            self.pool.start(greenlet)
            self.handle_num = self.handle_num + 1
            if self.strategy.max_count <= self.handle_num:
                print 'handle_num %d is full so stop' % self.handle_num
                self.stop()

    def run(self):
        self.timer = Timer(self.strategy.time, self.stop)
        self.timer.start()
        self._run_loop()


class Handler(gevent.Greenlet):

    def __init__(self, urlobj, spider):
        print 'begin greenlet with url', urlobj.url
        gevent.Greenlet.__init__(self)
        self.urlobj = urlobj
        self.spider = spider
        self.charset = "utf-8"

    def open(self, url):
        strategy = self.spider.strategy
        try:
            resp = requests.get(url, timeout=strategy.timeout)
        except:
            return
        if resp.status_code != requests.codes.ok:
            return
        charset = HtmlAnalyzer.detectCharSet(resp.text)
        if charset is not None:
            self.charset = charset
            resp.encoding = charset
        return resp.text

    def _run(self):
        try:
            html = self.open(self.urlobj.url)
        except Exception, why:
            return
        if not html:             # request failed or returned a non-200 status
            return
        depth = self.urlobj.depth + 1
        if depth > self.spider.strategy.max_depth:
            return
        for link in self.feed(html):
            if hash(link) in self.spider.url_set:
                continue
            url = UrlObject(link, depth)
            self.spider.put(url)

    def feed(self, html):
        return HtmlAnalyzer.extractLinks(html, self.urlobj.url, self.charset)


class MySpider(object):

    def __init__(self, max_depth, max_count, root_url):
        self.spider = GeventSpider(max_depth=max_depth,
                                   max_count=max_count,
                                   root_url=root_url)

    def run(self):
        self.spider.run()


test = MySpider(max_depth=5, max_count=100, root_url="http://www.maiziedu.com")
test.run()

Threadspider.py (the plain-thread version):

import requests
from threading import Timer, Lock, Thread
from Queue import Queue, Empty
import time

from utils import HtmlAnalyzer


class Strategy(object):

    def __init__(self, max_depth, max_count, concurrency=5):
        self.max_depth = max_depth
        self.max_count = max_count
        self.concurrency = concurrency
        self.timeout = 60
        self.time = 12 * 60


class UrlObject(object):

    def __init__(self, url, depth):
        self.url = url.strip('/')
        self.depth = depth


class ThreadSpider(object):

    def __init__(self, max_depth, max_count, root_url):
        self.strategy = Strategy(max_depth, max_count)
        self.queue = Queue()
        self.url_set = set()
        self.handler_num = 0
        self.lock = Lock()              # guards url_set
        self.thread_lock = Lock()       # guards thread_pool
        self.thread_pool = {}
        self.thread_id = 0
        self.is_stop = False
        self.thread_num = 0
        self.concurrency_limit = False
        self.last_data = None
        obj = UrlObject(root_url, 0)
        self.put(obj)

    def put(self, obj):
        hash_val = hash(obj.url)
        # check-and-add must happen under the lock, otherwise two threads
        # can both decide the same URL is new
        self.lock.acquire()
        try:
            if hash_val in self.url_set:
                return
            self.url_set.add(hash_val)
        finally:
            self.lock.release()
        self.queue.put(obj)

    def _run_loop(self):
        while True:
            if self.is_stop:
                time.sleep(1)
                continue
            if self.concurrency_limit:
                # wait until a worker thread finishes and frees a slot
                time.sleep(1)
                self.thread_lock.acquire()
                self.thread_num = len(self.thread_pool)
                if self.thread_num == self.strategy.concurrency:
                    self.thread_lock.release()
                    continue
                else:
                    self.concurrency_limit = False
                    self.thread_lock.release()
            else:
                try:
                    url = self.queue.get(timeout=1)
                except Empty:
                    continue
                self.thread_id = self.thread_id + 1
                thd = Handler(url, self, self.thread_id)
                self.thread_lock.acquire()
                self.thread_pool[self.thread_id] = thd
                if len(self.thread_pool) == self.strategy.concurrency:
                    self.concurrency_limit = True
                self.thread_lock.release()
                self.thread_num = self.thread_num + 1
                print "add thread", self.thread_id
                thd.start()
                self.handler_num = self.handler_num + 1
                if self.strategy.max_count <= self.handler_num:
                    print "handler num %d is full so stop" % self.handler_num
                    self.is_stop = True

    def remove_thread(self, thd_id):
        self.thread_lock.acquire()
        if thd_id in self.thread_pool:
            del self.thread_pool[thd_id]
            print "del thread id", thd_id
        self.thread_lock.release()

    def run(self):
        self._run_loop()


class Handler(Thread):

    def __init__(self, urlobj, spider, thd_id):
        Thread.__init__(self)
        print "begin thread %d with url %s" % (thd_id, urlobj.url)
        self.urlobj = urlobj
        self.spider = spider
        self.thread_id = thd_id
        self.charset = "utf-8"

    def run(self):
        try:
            html = self.open(self.urlobj.url)
            if not html:
                return
            depth = self.urlobj.depth + 1
            if depth > self.spider.strategy.max_depth:
                return
            for link in self.feed(html):
                if hash(link) in self.spider.url_set:
                    continue
                url = UrlObject(link, depth)
                self.spider.put(url)
        except Exception, why:
            return
        finally:
            # always release this thread's slot, even on an early return,
            # otherwise the pool stays "full" and the main loop stalls
            self.spider.remove_thread(self.thread_id)

    def open(self, url):
        strategy = self.spider.strategy
        try:
            resp = requests.get(url, timeout=strategy.timeout)
        except:
            return
        if resp.status_code != requests.codes.ok:
            return
        charset = HtmlAnalyzer.detectCharSet(resp.text)
        if charset is not None:
            self.charset = charset
            resp.encoding = charset
        return resp.text

    def feed(self, html):
        return HtmlAnalyzer.extractLinks(html, self.urlobj.url, self.charset)


class MySpider(object):

    def __init__(self, max_depth, max_count, root_url):
        self.spider = ThreadSpider(max_depth=max_depth,
                                   max_count=max_count,
                                   root_url=root_url)

    def run(self):
        self.spider.run()


test = MySpider(max_depth=5, max_count=100, root_url="http://www.maiziedu.com")
test.run()

Implementing the crawler with threads

1. Each crawl action runs inside its own thread;

2. Be careful about thread safety whenever shared (global) data is touched; a minimal sketch of the locking pattern follows this list.
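For point 2, this is the kind of pattern the thread spider has to get right (a simplified sketch with assumed names, not the article's code): the set of already-seen URLs is shared by every worker thread, so the check and the add must happen atomically under one lock.

from threading import Lock

seen_lock = Lock()
seen_urls = set()

def mark_if_new(url):
    # returns True if this URL has not been seen before; the check and
    # the add are done under the lock so two threads cannot both claim
    # the same URL as new
    h = hash(url)
    with seen_lock:
        if h in seen_urls:
            return False
        seen_urls.add(h)
        return True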

Comparison

1. Working with threads is more involved: thread-safety issues are tedious and error prone, hard to get right, can even lead to deadlocks, and are difficult to debug (a tiny deadlock example follows this list);

2. The gevent model is comparatively simple, and both controlling concurrency and debugging are easier.
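As a concrete illustration of the deadlock risk mentioned in point 1 (a toy example, not from the article): two threads each take one lock and then wait for the other's, so neither ever finishes. Note that running this script hangs by design.

import threading
import time

lock_a = threading.Lock()
lock_b = threading.Lock()

def worker(first, second):
    with first:
        time.sleep(0.1)          # give the other thread time to grab its lock
        with second:             # blocks forever: the other thread holds it
            pass

t1 = threading.Thread(target=worker, args=(lock_a, lock_b))
t2 = threading.Thread(target=worker, args=(lock_b, lock_a))
t1.start(); t2.start()
t1.join(); t2.join()             # never returns; the two threads are deadlocked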

 

Original article: http://www.maiziedu.com/wiki/frame/creeper/

 
