Using ruia to crawl data and store it in MongoDB
Before writing anything to the database, start MongoDB and make sure it is reachable:
service mongodb start
# open the mongo shell to confirm the server is up, then type exit to leave it
mongo
exit
Now begin the setup:
cd ~/Code/monkey/monkey
mkdir config && cd config
vim config.py
Add the MongoDB configuration to this file:
import os


class Config:
    BASE_DIR = os.path.dirname(os.path.dirname(__file__))
    MONGODB = dict(
        MONGO_HOST=os.getenv('MONGO_HOST', ""),
        MONGO_PORT=int(os.getenv('MONGO_PORT', 27017)),
        MONGO_USERNAME=os.getenv('MONGO_USERNAME', ""),
        MONGO_PASSWORD=os.getenv('MONGO_PASSWORD', ""),
        DATABASE='monkey',
    )
To make the configuration easy to import from other modules, create the file monkey/config/__init__.py and enter:
from .config import Config
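With this initializer in place, any module can pull in the settings with a single import. A quick sanity check from the Python REPL, assuming it is started from the project root so the monkey package is importable (the printed values depend on your environment variables):

from monkey.config import Config

print(Config.MONGODB['DATABASE'])    # 'monkey'
print(Config.MONGODB['MONGO_PORT'])  # 27017 unless MONGO_PORT is set in the environment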
For driving MongoDB we use the third-party package motor. First, write a class that manages the connection to MongoDB:
cd ~/Code/monkey/monkey
# Install motor (already installed by default in the lab environment)
pipenv install motor
mkdir database && cd database
vim motor_base.py
Enter the following code in this file:
import asyncio

from motor.motor_asyncio import AsyncIOMotorClient

from monkey.config import Config
from monkey.utils.tools import singleton


@singleton
class MotorBase:
    """
    About motor's doc: https://github.com/mongodb/motor
    """
    _db = {}
    _collection = {}
    MONGODB = Config.MONGODB

    def __init__(self, loop=None):
        self.motor_uri = ''
        self.loop = loop or asyncio.get_event_loop()

    def client(self, db):
        # Build the MongoDB URI; credentials are included only when a username is configured
        self.motor_uri = 'mongodb://{account}{host}:{port}/{database}'.format(
            account='{username}:{password}@'.format(
                username=self.MONGODB['MONGO_USERNAME'],
                password=self.MONGODB['MONGO_PASSWORD']) if self.MONGODB['MONGO_USERNAME'] else '',
            host=self.MONGODB['MONGO_HOST'] if self.MONGODB['MONGO_HOST'] else 'localhost',
            port=self.MONGODB['MONGO_PORT'] if self.MONGODB['MONGO_PORT'] else 27017,
            database=db)
        return AsyncIOMotorClient(self.motor_uri, io_loop=self.loop)

    def get_db(self, db=MONGODB['DATABASE']):
        """
        Get a db instance
        :param db: database name
        :return: the motor db instance
        """
        if db not in self._db:
            self._db[db] = self.client(db)[db]
        return self._db[db]

    def get_collection(self, db_name, collection):
        """
        Get a collection instance
        :param db_name: database name
        :param collection: collection name
        :return: the motor collection instance
        """
        collection_key = db_name + collection
        if collection_key not in self._collection:
            self._collection[collection_key] = self.get_db(db_name)[collection]
        return self._collection[collection_key]
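Before wiring MotorBase into the spider, it is worth confirming the connection actually works. Here is a minimal smoke test, assuming MongoDB is running locally; demo_docs is a throwaway collection used only for this check:

import asyncio

from monkey.database.motor_base import MotorBase

loop = asyncio.get_event_loop()

async def smoke_test():
    db = MotorBase(loop=loop).get_db()          # defaults to the 'monkey' database
    await db.demo_docs.insert_one({'ping': 1})  # motor mirrors pymongo's API, but every call is awaited
    print(await db.demo_docs.find_one({'ping': 1}))
    await db.demo_docs.drop()                   # remove the throwaway collection

loop.run_until_complete(smoke_test())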
The line from monkey.utils.tools import singleton applies the singleton pattern, which prevents repeated initialization of the MongoDB connection from wasting resources:
cd ~/Code/monkey/monkey
mkdir utils && cd utils
vim tools.py
Enter the following code:
from functools import wraps


def singleton(cls):
    """
    A singleton created by using decorator
    :param cls: cls
    :return: instance
    """
    _instances = {}

    @wraps(cls)
    def instance(*args, **kw):
        if cls not in _instances:
            _instances[cls] = cls(*args, **kw)
        return _instances[cls]

    return instance
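A quick check of how the decorator behaves; Demo is a hypothetical class used only for illustration:

from monkey.utils.tools import singleton

@singleton
class Demo:
    def __init__(self, name):
        self.name = name

a = Demo('first')
b = Demo('second')  # constructor arguments are ignored after the first call
print(a is b)       # True: every call returns the same cached instance
print(b.name)       # 'first'

This is why MotorBase can be instantiated freely across the project: however many times it is called, only one connection manager is ever created.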
While we are at it, add a logging helper as well:
cd ~/Code/monkey/monkey/utils
vim log.py
Enter the code:
import logging

logging_format = "[%(asctime)s] %(process)d-%(levelname)s "
logging_format += "%(module)s::%(funcName)s():l%(lineno)d: "
logging_format += "%(message)s"

logging.basicConfig(
    format=logging_format,
    level=logging.DEBUG
)
logger = logging.getLogger()
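Any module can now share this root logger; the module name, function, line number, and process id are filled in automatically. The output line below is illustrative, not actual captured output:

from monkey.utils.log import logger

logger.info('spider started')
# e.g. [2019-01-01 10:00:00,000] 2333-INFO demo::main():l3: spider started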
Now everything is in place: once the spider code from the previous section imports this database layer and persists its results, our first spider is officially complete. First, create the spider file in the terminal:
cd ~/Code/monkey/monkey
mkdir -p spider/sources && cd spider/sources
vim ruanyifeng_spider.py
Write the following code:
import random
import sys

from ruia import AttrField, Item, Request, Spider, TextField
from ruia_ua import middleware

sys.path.append('/root/Code/monkey/')  # must be an absolute path, otherwise the monkey package cannot be found

from monkey.database.motor_base import MotorBase


class ArchivesItem(Item):
    """
    eg: http://www.ruanyifeng.com/blog/archives.html
    """
    target_item = TextField(css_select='div#beta-inner li.module-list-item')
    href = AttrField(css_select='li.module-list-item>a', attr='href')


class ArticleListItem(Item):
    """
    eg: http://www.ruanyifeng.com/blog/essays/
    """
    target_item = TextField(css_select='div#alpha-inner li.module-list-item')
    title = TextField(css_select='li.module-list-item>a')
    href = AttrField(css_select='li.module-list-item>a', attr='href')


class BlogSpider(Spider):
    """
    Spider for the blog source http://www.ruanyifeng.com/blog/archives.html
    To fake the User-Agent, a third-party ruia extension is used:
    - ruia-ua: https://github.com/ruia-plugins/ruia-ua
    - pipenv install ruia-ua
    - The extension automatically attaches a random User-Agent to every request
    """
    # Start URL
    start_urls = ['http://www.ruanyifeng.com/blog/archives.html']
    # Request configuration for the spider
    request_config = {
        'RETRIES': 3,
        'DELAY': 0,
        'TIMEOUT': 20
    }
    # Request semaphore: at most 10 concurrent requests
    concurrency = 10
    blog_nums = 0

    async def parse(self, res):
        items = await ArchivesItem.get_items(html=res.html)
        self.mongo_db = MotorBase(loop=self.loop).get_db()
        for item in items:
            # Sleep for a random interval to avoid hammering the site
            self.request_config['DELAY'] = random.randint(5, 10)
            yield Request(
                item.href,
                callback=self.parse_item,
                request_config=self.request_config
            )

    async def parse_item(self, res):
        items = await ArticleListItem.get_items(html=res.html)
        for item in items:
            # Do not request links that have already been crawled
            is_exist = await self.mongo_db.source_docs.find_one({'url': item.href})
            if not is_exist:
                # Sleep for a random interval to avoid hammering the site
                self.request_config['DELAY'] = random.randint(5, 10)
                yield Request(
                    item.href,
                    callback=self.save,
                    metadata={'title': item.title},
                    request_config=self.request_config
                )

    async def save(self, res):
        # Two URLs appear to be duplicates: the blog lists 1725 posts in total,
        # but only 1723 documents end up in the database
        data = {
            'url': res.url,
            'title': res.metadata['title'],
            'html': res.html
        }
        try:
            # upsert=True inserts the document when no record with this url exists yet
            await self.mongo_db.source_docs.update_one(
                {'url': data['url']},
                {'$set': data},
                upsert=True)
        except Exception as e:
            self.logger.exception(e)


def main():
    BlogSpider.start(middleware=middleware)


if __name__ == '__main__':
    main()
Finally, run the code:
cd ~/Code/monkey
# Before running, activate the project's virtual environment: pipenv shell
python monkey/spider/sources/ruanyifeng_spider.py
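Once the spider finishes, a short script can confirm that the articles were persisted. A sketch using the same MotorBase, to be run from the project root inside the pipenv shell (count_documents is a standard motor collection method; the count depends on how many posts were actually crawled):

import asyncio

from monkey.database.motor_base import MotorBase

loop = asyncio.get_event_loop()

async def check():
    db = MotorBase(loop=loop).get_db()
    total = await db.source_docs.count_documents({})
    print(f'{total} articles stored in source_docs')

loop.run_until_complete(check())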