Building a web crawler with asyncio and aiohttp

Code:

# -*- coding: utf-8 -*-
# asyncio crawler: fetch pages, deduplicate URLs, store titles
import asyncio
import re
from urllib.parse import urljoin

import aiohttp
from pyquery import PyQuery

start_url = "https://news.cnblogs.com/"
waiting_urls = []           # URLs queued for crawling
seen_urls = set()           # URLs already scheduled, to avoid re-crawling
stopping = False            # set to True to let consumer() exit its loop
sem = asyncio.Semaphore(1)  # limit concurrency; raise for more parallel fetches

async def fetch(url, session):
    """Fetch a page and return its HTML text, or None on error."""
    async with sem:
        try:
            async with session.get(url, ssl=False) as resp:
                print("fetch status: {}".format(resp.status))
                if resp.status in (200, 201):
                    return await resp.text()
        except Exception as e:
            print(e)


async def new_handle(url, session):
    """Download an article page and print its title."""
    html = await fetch(url, session)
    if html is None:
        return
    pq = PyQuery(html)
    title = pq("title").text().replace("新闻_博客园", "")
    print(title)


def extract_url(html):
    """Collect article links from a listing page into waiting_urls."""
    pq = PyQuery(html)
    for link in pq.items("a"):
        url = urljoin(start_url, link.attr("href") or "")
        if url not in seen_urls and re.match(r"https://news\.cnblogs\.com/n/\d+/", url):
            waiting_urls.append(url)
    print("queued:", waiting_urls)

async def consumer():
    """Pop queued URLs and schedule a handler task for each new one."""
    async with aiohttp.ClientSession() as session:
        while not stopping:
            if not waiting_urls:
                await asyncio.sleep(0.2)  # queue empty: yield to pending fetch tasks
                continue
            url = waiting_urls.pop()
            print("start get url: {}".format(url))
            if url not in seen_urls:
                seen_urls.add(url)  # mark before scheduling so each URL is handled once
                asyncio.ensure_future(new_handle(url, session))

async def main(loop):
    # Wait for the MySQL connection pool first (see the aiomysql sketch below):
    # pool = await aiomysql.create_pool(
    #     host='127.0.0.1', port=3306,
    #     user='root', password='', db='mysql',
    #     loop=loop, charset="utf8", autocommit=True
    # )
    async with aiohttp.ClientSession() as session:
        html = await fetch(start_url, session)
        seen_urls.add(start_url)
        if html:
            extract_url(html)

    asyncio.ensure_future(consumer())

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    asyncio.ensure_future(main(loop))
    loop.run_forever()
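
The "store to DB" step above exists only in comments. Below is a minimal sketch of how the extracted titles could be written to MySQL with aiomysql; the table name news_title and its single title column are assumptions for illustration, not part of the original code, and the pool would still need to be passed down through consumer() to this variant of new_handle.

import aiomysql

async def save_title(pool, title):
    # assumed schema (not in the original): CREATE TABLE news_title (title VARCHAR(255))
    async with pool.acquire() as conn:
        async with conn.cursor() as cur:
            await cur.execute(
                "INSERT INTO news_title (title) VALUES (%s)", (title,)
            )

async def new_handle(url, session, pool):
    # same as above, but persists the title instead of printing it
    html = await fetch(url, session)
    if html is None:
        return
    pq = PyQuery(html)
    title = pq("title").text().replace("新闻_博客园", "")
    await save_title(pool, title)

Because the commented-out pool is created with autocommit=True, each INSERT is committed immediately and no explicit conn.commit() is needed.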
