Redis、asyncio、aiohttp 和 lxml 爬虫实例

最新推荐文章于 2024-05-01 03:20:40 发布

迷心兔

最新推荐文章于 2024-05-01 03:20:40 发布

阅读量554

点赞数

迷心兔

本文链接：https://blog.csdn.net/mixintu/article/details/102556399

版权

Python Redis 专栏收录该内容

10 篇文章 0 订阅

订阅专栏

新的事件循环启动方式（使用 contextlib.closing 自动关闭事件循环）

# coding=utf-8
import time
import redis
import asyncio
import logging
from lxml import etree
from contextlib import closing
from aiohttp import ClientSession

# Root logger: INFO and above, each record prefixed with a timestamp.
logging.basicConfig(format="%(asctime)s [*] %(message)s", level=logging.INFO)

# Listing-page URL template; the placeholder receives the page number.
URL = "https://www.fabiaoqing.com/biaoqing/lists/page/{}.html"


# Extract every image's title and link from one listing page.
def parse(source_, red_):
    """Parse a listing page and store each image's title -> link in Redis.

    Every ``<a>`` directly under ``<div class="tagbqppdiv">`` is one entry:
    the anchor's ``title`` attribute becomes the hash field and the nested
    ``<img>``'s ``data-original`` attribute becomes the value, written to
    the hash ``"mixintu"`` via ``red_.hset``.

    Parsing is best-effort and never raises to the caller. An entry that
    lacks either attribute is skipped with a warning instead of discarding
    the rest of the page; any other failure is logged with its traceback
    and ends processing of this page.

    Args:
        source_: Raw HTML of the page, passed to ``etree.HTML``.
        red_: Object exposing ``hset(name, key, value)``.
    """
    try:
        document = etree.HTML(source_)
        anchors = document.xpath('//div[@class="tagbqppdiv"]/a')
        for index, anchor in enumerate(anchors, start=1):
            titles = anchor.xpath('@title')
            links = anchor.xpath('img/@data-original')
            if not titles or not links:
                # One malformed entry must not cost us the remaining ones.
                logging.warning('第 %d 张缺少标题或链接，已跳过', index)
                continue
            title, link = titles[0], links[0]
            red_.hset("mixintu", title, link)
            logging.info('第 %d 张  标题：%r  链接：%r', index, title, link)
    except Exception:
        # Keep the crawl going, but record the traceback so the cause of the
        # failure is not lost.
        logging.exception('解析详情页出错！')


# Download a page and hand its body to the parser.
async def get_source(url_, red_):
    """Fetch ``url_`` and pass the response body to ``parse``.

    A ``ClientSession`` is opened for this single request and closed again
    on exit. Nothing is caught here: cancellation and any other exception
    propagate to whoever awaits this coroutine.

    Args:
        url_: Address of the page to download.
        red_: Forwarded unchanged to ``parse`` as its second argument.
    """
    async with ClientSession() as http, http.get(url_) as reply:
        parse(await reply.read(), red_)


async def main():
    """Crawl listing pages 1-10 concurrently and log the total elapsed time.

    One ``get_source`` call is scheduled per page; all of them share a
    single Redis client (``localhost:6379``, db 1). A page whose download
    fails is logged and does not stop the other pages. The Redis connection
    pool is released before the coroutine returns.
    """
    started = time.perf_counter()
    pool = redis.ConnectionPool(host='localhost', port=6379, db=1)
    red = redis.Redis(connection_pool=pool)
    urls = [URL.format(page) for page in range(1, 11)]
    try:
        # return_exceptions=True hands each failure back as a result, so it
        # can be reported here instead of staying unretrieved in its task.
        outcomes = await asyncio.gather(
            *(get_source(url, red) for url in urls),
            return_exceptions=True,
        )
    finally:
        # Release the pooled connections even if the crawl was cancelled.
        pool.disconnect()

    for url, outcome in zip(urls, outcomes):
        if isinstance(outcome, BaseException):
            logging.error('抓取失败 %s: %r', url, outcome)

    logging.info("总用时:%s", time.perf_counter() - started)


if __name__ == "__main__":
    # Changed here: ``closing`` calls ``loop.close()`` when the block exits,
    # whether ``main()`` finished normally or raised.
    # The loop is created explicitly because ``asyncio.get_event_loop()`` is
    # deprecated when no loop is running and, on recent Python versions, no
    # longer creates one implicitly.
    with closing(asyncio.new_event_loop()) as loop:
        asyncio.set_event_loop(loop)
        loop.run_until_complete(main())

旧的事件循环启动方式（手动创建并关闭事件循环）

# coding=utf-8
import asyncio
import logging
import time

import redis
from aiohttp import ClientSession
from lxml import etree

# Root logger: INFO and above, each record prefixed with a timestamp.
logging.basicConfig(format="%(asctime)s [*] %(message)s", level=logging.INFO)

# Listing-page URL template; the placeholder receives the page number.
URL = "https://www.fabiaoqing.com/biaoqing/lists/page/{}.html"


# Extract every image's title and link from one listing page.
def parse(source_, red_):
    """Parse a listing page and store each image's title -> link in Redis.

    Every ``<a>`` directly under ``<div class="tagbqppdiv">`` is one entry:
    the anchor's ``title`` attribute becomes the hash field and the nested
    ``<img>``'s ``data-original`` attribute becomes the value, written to
    the hash ``"mixintu"`` via ``red_.hset``.

    Parsing is best-effort and never raises to the caller. An entry that
    lacks either attribute is skipped with a warning instead of discarding
    the rest of the page; any other failure is logged with its traceback
    and ends processing of this page.

    Args:
        source_: Raw HTML of the page, passed to ``etree.HTML``.
        red_: Object exposing ``hset(name, key, value)``.
    """
    try:
        document = etree.HTML(source_)
        anchors = document.xpath('//div[@class="tagbqppdiv"]/a')
        for index, anchor in enumerate(anchors, start=1):
            titles = anchor.xpath('@title')
            links = anchor.xpath('img/@data-original')
            if not titles or not links:
                # One malformed entry must not cost us the remaining ones.
                logging.warning('第 %d 张缺少标题或链接，已跳过', index)
                continue
            title, link = titles[0], links[0]
            red_.hset("mixintu", title, link)
            logging.info('第 %d 张  标题：%r  链接：%r', index, title, link)
    except Exception:
        # Keep the crawl going, but record the traceback so the cause of the
        # failure is not lost.
        logging.exception('解析详情页出错！')


# Download a page and hand its body to the parser.
async def get_source(url_, red_):
    """Fetch ``url_`` and pass the response body to ``parse``.

    A ``ClientSession`` is opened for this single request and closed again
    on exit. Nothing is caught here: cancellation and any other exception
    propagate to whoever awaits this coroutine.

    Args:
        url_: Address of the page to download.
        red_: Forwarded unchanged to ``parse`` as its second argument.
    """
    async with ClientSession() as http, http.get(url_) as reply:
        parse(await reply.read(), red_)


async def main():
    """Crawl listing pages 1-10 concurrently and log the total elapsed time.

    One ``get_source`` call is scheduled per page; all of them share a
    single Redis client (``localhost:6379``, db 1). A page whose download
    fails is logged and does not stop the other pages. The Redis connection
    pool is released before the coroutine returns.
    """
    started = time.perf_counter()
    pool = redis.ConnectionPool(host='localhost', port=6379, db=1)
    red = redis.Redis(connection_pool=pool)
    urls = [URL.format(page) for page in range(1, 11)]
    try:
        # return_exceptions=True hands each failure back as a result, so it
        # can be reported here instead of staying unretrieved in its task.
        outcomes = await asyncio.gather(
            *(get_source(url, red) for url in urls),
            return_exceptions=True,
        )
    finally:
        # Release the pooled connections even if the crawl was cancelled.
        pool.disconnect()

    for url, outcome in zip(urls, outcomes):
        if isinstance(outcome, BaseException):
            logging.error('抓取失败 %s: %r', url, outcome)

    logging.info("总用时:%s", time.perf_counter() - started)


if __name__ == "__main__":
    # Create the loop explicitly: ``asyncio.get_event_loop()`` is deprecated
    # when no loop is running and, on recent Python versions, no longer
    # creates one implicitly.
    event_loop = asyncio.new_event_loop()
    asyncio.set_event_loop(event_loop)
    try:
        # Start the loop with this coroutine; the loop stops once the
        # coroutine returns.
        event_loop.run_until_complete(main())
    except KeyboardInterrupt:
        # ``asyncio.Task.all_tasks()`` was removed in Python 3.9; the
        # module-level ``asyncio.all_tasks(loop)`` is its replacement.
        pending = asyncio.all_tasks(event_loop)
        for task in pending:
            print(task.cancel())
        if pending:
            # Run the loop once more so every cancelled task can process its
            # CancelledError and clean up before the loop is closed. The loop
            # is already stopped at this point, so no ``stop()`` call is
            # needed (a stale stop request would abort this drain).
            event_loop.run_until_complete(
                asyncio.gather(*pending, return_exceptions=True)
            )
    finally:
        event_loop.close()

在这里插入图片描述

迷心兔

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
打赏
0
评论
Redis ，asyncio ，aiohttp和lxml爬虫实例

# coding=utf-8 import asyncio import logging import time import redis from aiohttp import ClientSession from lxml import etree logging.basicConfig(level = logging.INFO, format = "%(asctime)...
复制链接

扫一扫