多进程 多线程 异步 爬虫(2)

接上节多进程 多线程 异步 爬虫(1)

asyncio文档
asyncio具体特性

Lock(锁)

class AsyncAiohttp_Lock(AsyncAiohttp):
    """Variant of AsyncAiohttp demonstrating asyncio.Lock.

    The lock is acquired up front and only released by a timed event-loop
    callback after LOCK_TIME seconds, so every crawling coroutine blocks
    until the unlock fires (and then runs serialized behind the lock).
    """

    def __init__(self):
        super(AsyncAiohttp_Lock, self).__init__()
        # Delay (seconds) before the loop callback releases the lock.
        self._time_lock = LOCK_TIME

    def unlock(self, lock):
        """Callback used by loop.call_later() to release the shared lock."""
        lock.release()

    async def url_dict_onepage_async(self, *agrs):
        # agrs[0]: page url, agrs[1]: the shared asyncio.Lock.
        # BUG FIX: the original used `with await lock:`, a pre-3.5 form that
        # was deprecated in 3.7 and removed in 3.9; `async with` is correct.
        async with agrs[1]:
            await super(AsyncAiohttp_Lock, self).url_dict_onepage_async(agrs[0])

    async def tasks_lock(self, loop, urls):
        """Crawl `urls`, all gated behind a lock released after self._time_lock s."""
        # Create a lock and hold it immediately.
        lock = asyncio.Lock()
        await lock.acquire()
        # loop.call_later(delay, func, *args): run func after `delay` seconds.
        # loop.call_at(when, func, *args): run at an absolute loop time.
        # loop.call_soon(func, *args): run on the next loop iteration.

        # Schedule the delayed unlock; this slows the event loop deliberately.
        loop.call_later(self._time_lock, functools.partial(self.unlock, lock))
        # Wrap coroutines in Tasks explicitly: passing bare coroutines to
        # asyncio.wait() is deprecated (and removed in Python 3.11).
        tasks = [asyncio.ensure_future(self.url_dict_onepage_async(url, lock))
                 for url in urls]
        await asyncio.wait(tasks)

    def page_ELoop(self, urls):
        """Run tasks_lock() to completion on the event loop, then close it."""
        loop = asyncio.get_event_loop()
        # tasks = [self.url_dict_onepage_async(url) for url in urls]
        loop.run_until_complete(self.tasks_lock(loop, urls))
        loop.close()

Event(事件)

'''
Event(事件)
Class implementing event objects. An event manages a flag that can be set to true with the set() method
and reset to false with the clear() method. The wait() method blocks until the flag is true.
The flag is initially false.

clear()
Reset the internal flag to false. Subsequently, coroutines calling wait() will block
until set() is called to set the internal flag to true again.

is_set()
Return True if and only if the internal flag is true.

set()
Set the internal flag to true. All coroutines waiting for it to become true are awakened.
Coroutines that call wait() once the flag is true will not block at all.

coroutine wait()
Block until the internal flag is true.

If the internal flag is true on entry, return True immediately. Otherwise,
block until another coroutine calls set() to set the flag to true, then return True.

This method is a coroutine.

'''
class AsyncAiohttp_Event(AsyncAiohttp):
    """Variant of AsyncAiohttp gated on an asyncio.Event.

    All workers wait on one shared Event; a loop callback sets it after
    EVENT_TIME seconds, releasing every waiting coroutine at once.
    """

    def __init__(self):
        super(AsyncAiohttp_Event, self).__init__()
        # Delay (seconds) before the event is set by the loop callback.
        self._time_event = EVENT_TIME

    async def url_dict_onepage_async(self, *agrs):
        # agrs[0]: page url, agrs[1]: the shared asyncio.Event.
        print('{} waiting for event'.format(agrs[0]))
        await agrs[1].wait()
        await super(AsyncAiohttp_Event, self).url_dict_onepage_async(agrs[0])
        print('{} triggered'.format(agrs[0]))

    def set_event(self, event):
        """Callback used by loop.call_later() to set the shared event."""
        print('setting event in callback')
        event.set()

    async def tasks_event(self, loop, urls):
        """Crawl `urls`, all blocked until the event fires after self._time_event s."""
        event = asyncio.Event()
        print('event start state:{}'.format((event.is_set())))

        # Delayed set(), simulating a timed gate.
        # BUG FIX: this call was dedented to module level in the original,
        # which is an IndentationError inside the method body.
        loop.call_later(self._time_event, functools.partial(self.set_event, event))
        # Wrap coroutines in Tasks explicitly: passing bare coroutines to
        # asyncio.wait() is deprecated (and removed in Python 3.11).
        test_event = [asyncio.ensure_future(self.url_dict_onepage_async(url, event))
                      for url in urls]
        await asyncio.wait(test_event)
        print('event end state: {}'.format(event.is_set()))

    def page_ELoop(self, urls):
        """Run tasks_event() to completion on the event loop, then close it."""
        loop = asyncio.get_event_loop()
        # tasks = [self.url_dict_onepage_async(url) for url in urls]
        # print(tasks)
        loop.run_until_complete(self.tasks_event(loop, urls))
        loop.close()

Semaphore(信号量)

'''
限制一次性请求数量
控制并发量
'''

class AsyncAiohttp_Semaphore(AsyncAiohttp):
    """Variant of AsyncAiohttp capping concurrent page requests.

    A single shared asyncio.Semaphore allows at most SEMA_NUM requests
    to be in flight at once.
    """

    def __init__(self):
        # BUG FIX: the original called super(AsyncAiohttp, self).__init__(),
        # which skips AsyncAiohttp's own __init__ entirely.
        super(AsyncAiohttp_Semaphore, self).__init__()
        self._sema = SEMA_NUM
        # Created lazily inside the running loop and shared by every call.
        # BUG FIX: the original built a brand-new Semaphore on each call,
        # which therefore never limited concurrency at all.
        self._semaphore = None

    # Override of the parent coroutine, wrapping it in the shared semaphore.
    # (`with await sem:` was deprecated and removed in Python 3.9; use
    # `async with`.)
    async def url_dict_onepage_async(self, agrs):
        if self._semaphore is None:
            self._semaphore = asyncio.Semaphore(self._sema)
        async with self._semaphore:
            await super(AsyncAiohttp_Semaphore, self).url_dict_onepage_async(agrs)
'''
为了不修改之前代码可用,可以采用装饰器
带参数装饰器,修饰 url_dict_onepage_async() 函数,为其添加信号量限流
'''
def asyncio_Semaphore(sema):
    """Decorator factory: cap concurrent executions of an async function.

    Parameters:
        sema (int): maximum number of simultaneous executions allowed.

    Returns a decorator whose wrapper shares ONE Semaphore across all calls.
    BUG FIX: the original created a fresh asyncio.Semaphore on every call
    (so it never limited anything) and also discarded the wrapped
    coroutine's return value.
    """
    def decorator(func):
        # Shared mutable cell; the Semaphore itself is created lazily on the
        # first call so it is bound to the loop that is actually running.
        state = {'semaphore': None}

        @functools.wraps(func)
        async def wrapper(*args, **kwargs):
            if state['semaphore'] is None:
                state['semaphore'] = asyncio.Semaphore(sema)
            async with state['semaphore']:
                return await func(*args, **kwargs)
        return wrapper
    return decorator

Queue 队列


'''
前几个方案都是用两个loop 一个负责获取多个页面图片url并存入数据库,从数据库获取图片url 开启另一个loop 去下载图片
采用 Queue 
urls -> queue -> download
'''
class AsyncAiohttp_Queue(AsyncAiohttp):
    """Producer/consumer crawler built on a single asyncio.Queue.

    Earlier variants used two separate loops (one collecting image urls
    into a database, one downloading from it); here the producer puts
    per-page url dicts straight on a queue that the consumer drains:
    urls -> queue -> download.
    """
    def __init__(self):
        super(AsyncAiohttp_Queue,self).__init__()

    #consume: forever pull page-dicts off the queue and download their urls.
    async def file_download(self,queue):
        while True:
            print('[consume]wait for an pagedict from the producer')
            # Blocks until the producer has put an item on the queue.
            pagedict = await queue.get()
            print('[consume] get pagedict from queue {} '.format(pagedict))
            urls = pagedict.get('urls')
            # Download every image url collected for this page.
            for url in urls:
                # print('[consume] download {}'.format(url))
                await self.file_download_async(url)
            print('[consume] Notify the queue that the pagedict has been processed')
            # Pairs with queue.join() in tasks_queue().
            queue.task_done()

    #produce: fetch each listing page, scrape its image urls, enqueue them.
    async def url_dict_onepage_async(self,*agrs):
        # agrs[0]: the shared asyncio.Queue; agrs[1]: iterable of
        # (page_number, page_url) pairs — presumably from self.page_list();
        # TODO confirm against the base class.
        queue, urls = agrs[0],agrs[1]
        for url in urls:
            pagedict = {}
            print('[produce]getting img_urls from pageurl {}'.format(url))
            num = url[0]
            page = url[1]
            async with aiohttp.request('GET', page) as r:
                data = await r.read()
            data = data.decode()
            html = etree.HTML(data)
            # Static images use @src; lazily-loaded ones carry @org_src.
            pic_lis = html.xpath('//ol//li//img/@src')
            git_lis = html.xpath('//ol//li//img/@org_src')
            git_lis.extend(pic_lis)
            pagedict['page'] = num
            pagedict['urls'] = git_lis
            # put the item in the queue
            print('[produce]put the pagedict in the queue {}'.format(url))
            await queue.put(pagedict)

    async def tasks_queue(self, loop, urls):
        """Wire producer and consumer together around one queue."""
        queue = asyncio.Queue()
        '''
        类似Task
        '''
        # schedule the consumer  create Task
        file_download =  asyncio.ensure_future(self.file_download(queue))
        # run the producer and wait for completion
        await self.url_dict_onepage_async(queue,urls)
        # wait until the consumer has processed all items
        await queue.join()
        # the consumer is still awaiting for an item, cancel it
        file_download.cancel()

    def queue_ELoop(self, urls):
        # Get the event loop.
        loop = asyncio.get_event_loop()
        # tasks = [self.url_dict_onepage_async(url) for url in urls]
        # print(tasks)
        # Run the producer/consumer coroutine to completion.
        loop.run_until_complete(self.tasks_queue(loop,urls))
        loop.close()

    def Execute(self):
        """Entry point: collect the listing pages and run the crawl."""
        page_list = self.page_list()
        # print(page_list)
        self.queue_ELoop(page_list)
if __name__ == '__main__':
    start = time.time()
    # Alternative runners from parts (1)/(2) of the series; uncomment
    # exactly one to compare their wall-clock timings.
    # AA = ProcessPE()

    # AA = ThreadPE()

    # AA = Async_only()

    # AA = AsyncAiohttp()
    # AA = AsyncAiohttp_Lock()
    # AA = AsyncAiohttp_Semaphore()

    AA = AsyncAiohttp_Queue()

    AA.Execute()
    # Total elapsed time for the whole crawl.
    print('[endtime][{}]'.format(time.time() - start))
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值