# asyncio does not provide an async version of requests; it knows nothing about the HTTP protocol itself.
# There is no need to implement HTTP by hand -- use aiohttp directly.
# aiohttp also ships a high-concurrency web server.
# sanic claims performance on par with Go and likewise implements a high-concurrency web server.
# Here the aiohttp client side is used to build the crawler.
# asyncio crawler with URL de-duplication and database writes (async inserts via the aiomysql driver).
# Crawl target: www.jobbole.com
# Crawl strategy: collect every URL on a page and check whether it is an article detail page.
import aiohttp
import asyncio
import re
from pyquery import PyQuery
import aiomysql
# http://www.lfd.uci.edu/~gohlke/pythonlibs/  # useful when installing various Python libraries fails
start_url = "http://www.jobbole.com"
waiting_urls = []  # a plain list works; an asyncio.Queue would also do (see the sketch after consumer below)
seen_urls = set()  # URLs already crawled; with hundreds of millions of entries a set is no longer suitable
stopping = False   # flag that controls the consumer / event loop
# allow at most 3 concurrent requests
sem = asyncio.Semaphore(3)
# Fetch a page from the server and return its HTML
async def fetch(url, session):
    # Concurrency is high, so don't open a new connection for every request;
    # reuse a single ClientSession that is passed in as a parameter instead.
    # async with aiohttp.ClientSession() as session:
    async with sem:
        await asyncio.sleep(1)
        try:
            async with session.get(url) as resp:
                # check the status code before reading the body
                print("url status:{}".format(resp.status))
                if resp.status in [200, 201]:
                    data = await resp.text()
                    return data
        except Exception as e:
            print(e)
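# Standalone usage sketch for fetch (hypothetical, not part of the crawler flow):
#
#   async def demo():
#       async with aiohttp.ClientSession() as session:
#           html = await fetch("http://www.jobbole.com", session)
#           print(html[:100] if html else "no data")
#
#   asyncio.get_event_loop().run_until_complete(demo())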
# Implement the crawl strategy: parse a page and collect every crawlable URL on it
def extract_urls(html):
    urls = []
    if not html:
        return urls
    pq = PyQuery(html)
    for link in pq.items("a"):
        url = link.attr("href")
        if url and url.startswith("http") and url not in seen_urls:
            urls.append(url)
            waiting_urls.append(url)
    return urls
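# extract_urls above keeps only absolute links that already start with "http".
# An optional variant (hypothetical helper, not wired into the crawler) could also
# resolve relative hrefs against the page URL with urljoin:
from urllib.parse import urljoin

def extract_urls_absolute(html, base_url):
    urls = []
    if not html:
        return urls
    pq = PyQuery(html)
    for link in pq.items("a"):
        href = link.attr("href")
        if not href:
            continue
        url = urljoin(base_url, href)  # e.g. "/123/" -> "http://www.jobbole.com/123/"
        if url.startswith("http") and url not in seen_urls:
            urls.append(url)
    return urls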
# Fetch a listing page and collect the URLs found on it
async def init_urls(url, session):
    html = await fetch(url, session)
    seen_urls.add(url)
    # no need to use the return value: extract_urls already appends to waiting_urls
    extract_urls(html)
async def article_handler(url, session, pool):
    # fetch an article detail page, parse it, and insert the result into the database
    html = await fetch(url, session)
    if not html:
        return
    extract_urls(html)
    pq = PyQuery(html)
    title = pq("title").text()
    # pool.acquire() takes one connection out of the pool
    async with pool.acquire() as conn:
        async with conn.cursor() as cur:
            await cur.execute("SELECT 42;")
            # the database and table can be created beforehand with a tool such as Navicat;
            # use a parameterized query so quotes in the title cannot break the SQL
            insert_sql = "insert into article_test(title) values(%s)"
            await cur.execute(insert_sql, (title,))
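# The insert above assumes an `article_test` table already exists (created e.g. with
# Navicat, as noted above). A minimal schema that would satisfy it might look like
# this (hypothetical sketch; adjust column sizes and charset as needed):
#
#   CREATE TABLE article_test (
#       id INT AUTO_INCREMENT PRIMARY KEY,
#       title VARCHAR(255)
#   ) DEFAULT CHARSET=utf8;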
# Consumer: keep taking URLs out of waiting_urls and hand each one off to a coroutine
async def consumer(pool, session):
    while not stopping:
        # when the list is empty, wait a moment, otherwise pop() would raise
        if len(waiting_urls) == 0:
            await asyncio.sleep(0.5)
            continue  # also avoids hammering the site with requests
        url = waiting_urls.pop()
        print("start get url:{}".format(url))
        # if the URL looks like an article detail page and has not been seen yet,
        # parse and store the article; otherwise treat it as a listing page
        if re.match(r'http://.*?jobbole.com/\d+/', url):
            if url not in seen_urls:
                seen_urls.add(url)
                asyncio.ensure_future(article_handler(url, session, pool))
                await asyncio.sleep(30)  # throttle: avoid sending too many requests at once
        else:
            if url not in seen_urls:
                asyncio.ensure_future(init_urls(url, session))
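# As noted next to waiting_urls, an asyncio.Queue could be used instead of a plain list.
# A minimal sketch of that variant (hypothetical, not wired into the rest of this script;
# only the article-detail path is shown): awaiting queue.get() replaces the sleep(0.5) polling.
url_queue = asyncio.Queue()

async def queue_consumer(pool, session):
    while not stopping:
        url = await url_queue.get()  # suspends until a URL becomes available
        if url in seen_urls:
            continue
        seen_urls.add(url)
        asyncio.ensure_future(article_handler(url, session, pool))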
async def main(loop):
    # wait for the MySQL pool to be created; charset is required to insert Chinese text
    # and autocommit must be enabled or the inserts will never be committed
    pool = await aiomysql.create_pool(host='127.0.0.1', port=3306,
                                      user='root', password='', db='aiomysql_test',
                                      loop=loop, charset="utf8", autocommit=True)
    # create the ClientSession once up front and reuse it everywhere; async with closes it
    # automatically, so the consumer must finish while the block is still open
    async with aiohttp.ClientSession() as session:
        html = await fetch(start_url, session)
        seen_urls.add(start_url)
        # no need to use the return value: extract_urls already appends to waiting_urls
        extract_urls(html)
        # asyncio.ensure_future(init_urls(start_url, session))
        await consumer(pool, session)
if __name__ == "__main__":
    loop = asyncio.get_event_loop()
    asyncio.ensure_future(main(loop))
    loop.run_forever()
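# `stopping` is defined at the top as the loop-control flag, but nothing here ever sets it
# to True, so the consumer runs until the process is killed. One possible way to wire it up
# (hypothetical sketch) is to catch Ctrl+C around run_forever:
#
#   try:
#       loop.run_forever()
#   except KeyboardInterrupt:
#       stopping = True
#       loop.stop()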