使用ruia爬取 存入mongodb

本文介绍如何使用Ruia爬虫框架抓取阮一峰(Ruan Yifeng)博客的数据,并通过Motor库将数据存入MongoDB数据库,实现数据持久化。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

使用ruia 来爬取数据 存入mongodb数据库

在入库之前先启动mongodb

service mongodb start
mongo
exit

然后开始进入操作

cd ~/Code/monkey/monkey
mkdir config && cd config
vim config.py

将代码引入mongodb的配置

import os

class Config:
    """Project-wide configuration (paths and MongoDB connection settings)."""

    # Repository root: two directory levels above this config module.
    BASE_DIR = os.path.dirname(os.path.dirname(__file__))

    # MongoDB connection parameters; each value can be overridden through
    # an environment variable of the same name. Empty host/username mean
    # "use the defaults" downstream (localhost, no authentication).
    MONGODB = {
        'MONGO_HOST': os.getenv('MONGO_HOST', ""),
        'MONGO_PORT': int(os.getenv('MONGO_PORT', 27017)),
        'MONGO_USERNAME': os.getenv('MONGO_USERNAME', ""),
        'MONGO_PASSWORD': os.getenv('MONGO_PASSWORD', ""),
        'DATABASE': 'monkey',
    }

为了方便被其他模块引入,创建文件
monkey/config/__init__.py,输入:

from .config import Config

操作mongodb的第三方包选用的motor,先编写一个类建立对mongodb的连接

cd ~/Code/monkey/monkey
# 安装motor(实验环境中默认已经安装)
pipenv install motor
mkdir database && cd database
vim motor_base.py

进入这个py文件

import asyncio

from motor.motor_asyncio import AsyncIOMotorClient

from monkey.config import Config
from monkey.utils.tools import singleton


@singleton
class MotorBase:
    """
    Cached async MongoDB access built on Motor.

    About motor's doc: https://github.com/mongodb/motor
    """
    # Class-level caches shared by the singleton instance:
    # db name -> motor db handle, (db name + collection name) -> collection.
    _db = {}
    _collection = {}
    MONGODB = Config.MONGODB

    def __init__(self, loop=None):
        # Reuse the caller's event loop when given, otherwise the current one.
        self.motor_uri = ''
        self.loop = loop or asyncio.get_event_loop()

    def client(self, db):
        """Build the MongoDB URI for *db* and return a new Motor client."""
        cfg = self.MONGODB
        # Credentials are optional: include them only when a username is set.
        account = f"{cfg['MONGO_USERNAME']}:{cfg['MONGO_PASSWORD']}@" if cfg['MONGO_USERNAME'] else ''
        host = cfg['MONGO_HOST'] or 'localhost'
        port = cfg['MONGO_PORT'] or 27017
        self.motor_uri = f'mongodb://{account}{host}:{port}/{db}'
        return AsyncIOMotorClient(self.motor_uri, io_loop=self.loop)

    def get_db(self, db=MONGODB['DATABASE']):
        """
        Get a db instance
        :param db: database name
        :return: the motor db instance
        """
        try:
            return self._db[db]
        except KeyError:
            # First request for this database: create and cache the handle.
            self._db[db] = self.client(db)[db]
            return self._db[db]

    def get_collection(self, db_name, collection):
        """
        Get a collection instance
        :param db_name: database name
        :param collection: collection name
        :return: the motor collection instance
        """
        cache_key = db_name + collection
        try:
            return self._collection[cache_key]
        except KeyError:
            self._collection[cache_key] = self.get_db(db_name)[collection]
            return self._collection[cache_key]

程序中from monkey.utils.tools import singleton这一行的目的是启用单例模式,防止重复初始化MongoDB连接导致占用过多资源:

cd ~/Code/monkey/monkey
mkdir utils && cd utils
vim tools.py

进入py文件

from functools import wraps


def singleton(cls):
    """
    A singleton created by using decorator
    :param cls: cls
    :return: instance
    """
    _cache = {}

    @wraps(cls)
    def get_instance(*args, **kwargs):
        # Build the single instance on first call, then always return it;
        # later constructor arguments are deliberately ignored.
        try:
            return _cache[cls]
        except KeyError:
            _cache[cls] = cls(*args, **kwargs)
            return _cache[cls]

    return get_instance

顺便加入日志文件

cd ~/Code/monkey/monkey/utils
vim log.py

输入代码

import logging

# Log line layout: [timestamp] pid-LEVEL module::function():l<line>: message
logging_format = (
    "[%(asctime)s] %(process)d-%(levelname)s "
    "%(module)s::%(funcName)s():l%(lineno)d: "
    "%(message)s"
)

# Configure the root logger once for the whole project at DEBUG verbosity.
logging.basicConfig(format=logging_format, level=logging.DEBUG)

# Shared root logger handle for other modules to import.
logger = logging.getLogger()
现在,万事俱备,只要在上面一节的爬虫代码里面引用并操作数据库进行持久化,我们的第一个爬虫就正式完成了,先在终端创建爬虫文件
cd ~/Code/monkey/monkey
mkdir -p spider/sources && cd spider/sources
vim ruanyifeng_spider.py

写入py文件

import random
import sys

from ruia import AttrField, Item, Request, Spider, TextField
from ruia_ua import middleware

sys.path.append('/root/Code/monkey/')  #这个要给绝对路径 不然会报找不到路径

from monkey.database.motor_base import MotorBase


class ArchivesItem(Item):
    """
    Item for one entry of the blog archives index page.
    eg: http://www.ruanyifeng.com/blog/archives.html

    The spider iterates the items yielded for this page and requests each
    ``href`` as a category page (see BlogSpider.parse).
    """
    # Repeating node: one list entry inside the #beta-inner container.
    target_item = TextField(css_select='div#beta-inner li.module-list-item')
    # Link of the archive entry; used directly as a request URL downstream,
    # so presumably absolute — TODO confirm against the live page markup.
    href = AttrField(css_select='li.module-list-item>a', attr='href')


class ArticleListItem(Item):
    """
    Item for one article link on a category listing page.
    eg: http://www.ruanyifeng.com/blog/essays/

    The spider checks ``href`` against MongoDB and fetches unseen articles
    (see BlogSpider.parse_item).
    """
    # Repeating node: one list entry inside the #alpha-inner container.
    target_item = TextField(css_select='div#alpha-inner li.module-list-item')
    # Article title text, stored alongside the page in MongoDB.
    title = TextField(css_select='li.module-list-item>a')
    # Article link; used directly as a request URL downstream.
    href = AttrField(css_select='li.module-list-item>a', attr='href')


class BlogSpider(Spider):
    """
    Spider for the blog source http://www.ruanyifeng.com/blog/archives.html

    To randomize the User-Agent header it relies on a third-party ruia
    extension:
        - ruia-ua: https://github.com/ruia-plugins/ruia-ua
        - pipenv install ruia-ua
        - the extension adds a random User-Agent to every request
    """
    # Seed URL the crawl starts from.
    start_urls = ['http://www.ruanyifeng.com/blog/archives.html']
    # Default per-request settings: retries / delay (seconds) / timeout.
    request_config = {
        'RETRIES': 3,
        'DELAY': 0,
        'TIMEOUT': 20
    }
    # Maximum number of concurrent requests.
    concurrency = 10
    blog_nums = 0

    def _delayed_config(self):
        """Return a fresh request_config copy with a random 5-10s DELAY.

        BUGFIX: the original code mutated the shared class-level dict and
        passed that same object to every Request, so all still-queued
        requests observed whatever DELAY was written last instead of their
        own random delay. A per-request copy gives each request its own
        independent delay.
        """
        config = dict(self.request_config)
        config['DELAY'] = random.randint(5, 10)
        return config

    async def parse(self, res):
        """Parse the archives index and schedule one request per category."""
        items = await ArchivesItem.get_items(html=res.html)
        # Open the (singleton-cached) MongoDB handle used by later callbacks.
        self.mongo_db = MotorBase(loop=self.loop).get_db()
        for item in items:
            yield Request(
                item.href,
                callback=self.parse_item,
                request_config=self._delayed_config()
            )

    async def parse_item(self, res):
        """Parse a category page and schedule only not-yet-stored articles."""
        items = await ArticleListItem.get_items(html=res.html)
        for item in items:
            # Links already persisted in MongoDB are not requested again.
            is_exist = await self.mongo_db.source_docs.find_one({'url': item.href})
            if not is_exist:
                yield Request(
                    item.href,
                    callback=self.save,
                    metadata={'title': item.title},
                    request_config=self._delayed_config()
                )

    async def save(self, res):
        """Upsert one article (url, title, html) into `source_docs`.

        NOTE(review): two articles appear to share a URL — the blog lists
        1725 posts but only 1723 documents end up stored.
        """
        data = {
            'url': res.url,
            'title': res.metadata['title'],
            'html': res.html
        }

        try:
            # Upsert keyed on URL so re-runs update instead of duplicating.
            await self.mongo_db.source_docs.update_one(
                {'url': data['url']},
                {'$set': data},
                upsert=True)
        except Exception as e:
            self.logger.exception(e)


def main():
    """Entry point: start the blog spider with the random-UA middleware."""
    BlogSpider.start(middleware=middleware)

if __name__ == '__main__':
    main()

接下来就运行代码

cd ~/Code/monkey
# 运行前请进入项目虚拟环境:pipenv shell
python monkey/spider/sources/ruanyifeng_spider.py
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值