asyncio aiohttp 完成爬虫

最新推荐文章于 2023-10-13 09:48:13 发布

ccczhi

最新推荐文章于 2023-10-13 09:48:13 发布

阅读量280

点赞数

分类专栏： python

本文链接：https://blog.csdn.net/ccczhi/article/details/106845737

版权

python 专栏收录该内容

20 篇文章 0 订阅

订阅专栏

代码:

# -*- coding:utf-8-*-
# asyncio爬虫, 去重, 入库:
import asyncio
import re
import aiohttp
from pyquery import PyQuery
import aiomysql
from urllib.parse import urljoin, urlparse
import re
import requests
# Create the event loop explicitly. Calling asyncio.get_event_loop() while no
# loop is current is deprecated in recent Python releases and raises
# RuntimeError on Python 3.14+. Registering the new loop as the current one
# means later asyncio.get_event_loop() calls return this same loop, and on
# Python < 3.10 (where asyncio primitives bind to a loop at construction time)
# the semaphore below is attached to it as well.
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
start_url = "https://news.cnblogs.com/"  # seed page the crawl starts from
waitting_urls = []  # URLs queued for download (used as a LIFO stack via pop())
seen_urls = set()  # URLs that have already been handled (de-duplication)
stopping = False  # polled by the consumer loop; True ends the loop
sem = asyncio.Semaphore(1)  # caps the number of concurrent requests

async def fetch(url, session):
    """Download ``url`` with ``session`` and return the body as text.

    The request runs while holding the module-level ``sem`` semaphore and is
    issued with ``ssl=False``. The response status is printed for every
    request that gets a response.

    Returns:
        The decoded response text when the status is 200 or 201; ``None`` for
        any other status or when an exception was raised (the exception is
        printed, not propagated).
    """
    async with sem:
        try:
            async with session.get(url, ssl=False) as response:
                status = response.status
                print(f"url start:{status}")
                if status not in (200, 201):
                    return None
                return await response.text()
        except Exception as err:
            # Best effort: report the failure and fall through to None.
            print(err)
    return None


# async def init_url(session):
#     html = await fetch(start_url, session)
#     extract_url(html)


async def new_handle(url, session):
    """Download one article page and print its title.

    The URL is added to ``seen_urls`` before the download starts, so it counts
    as handled while the request is in flight and also when the request fails.

    Args:
        url: Address of the article page.
        session: Client session handed through to ``fetch``.

    Returns:
        None. The page title, with the "新闻_博客园" site suffix removed, is
        printed; nothing is printed when the page could not be downloaded.
    """
    seen_urls.add(url)
    html = await fetch(url, session)
    if not html or not html.strip():
        # fetch() yields None for failed requests and non-200/201 responses;
        # PyQuery cannot parse None or an empty document, so stop here
        # instead of raising inside a fire-and-forget task.
        return
    pq = PyQuery(html)
    title = pq("title").text().replace("新闻_博客园", "")
    print(title)


def extract_url(html):
    """Queue every article link found in ``html`` onto ``waitting_urls``.

    Each ``<a href>`` is resolved against ``start_url``. A link is queued only
    when it looks like an article page (``https://news.cnblogs.com/n/<digits>/``)
    and is neither in ``seen_urls`` nor already waiting in the queue.

    Args:
        html: Page markup as text. ``None`` or an empty string (for example a
            failed download) is ignored and nothing is queued or printed.

    Returns:
        None. The queue and the seen set are printed for debugging.
    """
    if not html:
        return
    pq = PyQuery(html)
    for link in pq.items('a'):
        href = link.attr("href")
        if not href:
            # Anchors without an href cannot point at an article.
            continue
        url = urljoin(start_url, href)
        # Raw string with escaped dots: "\d" in a plain string literal is an
        # invalid escape sequence, and a bare "." would match any character.
        if (
            re.match(r"https://news\.cnblogs\.com/n/\d+/", url)
            and url not in seen_urls
            and url not in waitting_urls
        ):
            waitting_urls.append(url)
    print(waitting_urls)
    print("seen_urls", seen_urls)
    print("********")

async def consumer():
    """Drain ``waitting_urls`` and start a ``new_handle`` task per article URL.

    Runs until the module-level ``stopping`` flag becomes true. While the
    queue is empty the coroutine sleeps briefly so other tasks can run. URLs
    are taken from the end of the list; a URL is dispatched only when it
    matches the article pattern and is not in ``seen_urls``. It is added to
    ``seen_urls`` at dispatch time so a URL queued twice is downloaded once.

    All downloads share one ``aiohttp.ClientSession``. Before the session is
    closed, tasks that are still running are awaited.
    """
    # Strong references to running tasks; asyncio itself only keeps weak ones.
    in_flight = set()
    async with aiohttp.ClientSession() as session:
        while not stopping:
            if not waitting_urls:
                # Idle: actually yield to the event loop, then poll again.
                await asyncio.sleep(0.2)
                continue
            url = waitting_urls.pop()
            print("start get url:{}".format(url))
            if url in seen_urls:
                continue
            if not re.match(r"https://news\.cnblogs\.com/n/\d+/", url):
                continue
            seen_urls.add(url)
            task = asyncio.ensure_future(new_handle(url, session))
            in_flight.add(task)
            task.add_done_callback(in_flight.discard)
        if in_flight:
            # Let running downloads finish while the session is still open.
            await asyncio.gather(*in_flight, return_exceptions=True)

async def main(loop):
    """Seed the crawl from ``start_url`` and launch the consumer.

    Downloads the start page with a short-lived session, records the start
    URL in ``seen_urls``, queues the links found on the page, and then
    schedules ``consumer()`` as a background task without awaiting it.

    Args:
        loop: Event loop; only referenced by the disabled MySQL pool setup
            kept below.
    """
    # Disabled placeholder: wait for the MySQL connection pool to be ready.
    # pool = await aiomysql.create_pool(
    #     host='127.0.0.1', port=3306,
    #     user='root', password='', db='mysql',
    #     loop=loop, charset="utf8", autocommit=True
    # )
    async with aiohttp.ClientSession() as seed_session:
        front_page = await fetch(start_url, seed_session)
        seen_urls.add(start_url)
        extract_url(front_page)

    # The consumer opens its own session, so it is started only after the
    # seeding session above has been closed.
    asyncio.ensure_future(consumer())


if __name__ == '__main__':
    # Script entry point: schedule main() on the event loop, then run the
    # loop until it is stopped or the process is interrupted.
    loop = asyncio.get_event_loop()
    asyncio.ensure_future(main(loop), loop=loop)
    loop.run_forever()

ccczhi

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
asyncio aiohttp 完成爬虫

代码:# -*- coding:utf-8-*-# asycio爬虫, 去重, 入库:import asyncioimport reimport aiohttpfrom pyquery import PyQueryimport aiomysqlfrom urllib.parse import urljoin, urlparseimport reimport requestsloop = asyncio.get_event_loop()start_url = "https://news
复制链接

扫一扫

专栏目录