eggjs定时任务实现网站监控爬虫_eggjs 每一分钟采集数据-CSDN博客

本文链接：https://blog.csdn.net/crazynzg/article/details/119981810

本文介绍了使用Egg.js框架创建定时任务的三种方式：通过 Subscription 类、直接导出对象以及匿名函数。示例中展示了如何设置任务间隔、抓取网页内容、使用Cheerio解析HTML并进行网站状态检查。此外，还展示了如何将抓取的数据传递给Controller并在EJS模板中展示。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

app/schedule

1. 使用 ES6 class 继承 Subscription（写法相对繁琐）

const Subscription = require('egg').Subscription;

/**
 * Scheduled task that downloads JSON from a remote endpoint and stores it on
 * `app.cache`.
 */
class UpdateCache extends Subscription {
  /**
   * Scheduling options, exposed through the static `schedule` property.
   * @returns {{interval: string, type: string}} interval and worker targeting.
   */
  static get schedule() {
    const options = {
      interval: '1m', // run once per minute
      type: 'all', // every worker runs this task
    };
    return options;
  }

  /**
   * Body of the scheduled task: requests the cache endpoint as JSON and
   * stores the response payload on the application object.
   * @returns {Promise<void>}
   */
  async subscribe() {
    const { ctx } = this;
    const requestOptions = { dataType: 'json' };
    const { data } = await ctx.curl('http://www.api.com/cache', requestOptions);
    ctx.app.cache = data;
  }
}

module.exports = UpdateCache;

2.直接导出对象

module.exports = {
  schedule: {
    interval: '1m', // 1 分钟间隔
    type: 'all', // 指定所有的 worker 都需要执行
  },
  async task(ctx) {
    const res = await ctx.curl('http://www.api.com/cache', {
      dataType: 'json',
    });
    ctx.app.cache = res.data;
  },
};

3. 直接导出匿名函数（推荐）

module.exports=app=>{
  return{
    schedule: {
        interval: '5s', // 1 分钟间隔
        type: 'all', // 指定所有的 worker 都需要执行
      },
      async task(ctx) {
        let result=await ctx.service.news.getNewsList();
        console.log(result);
      }
}
}

使用 curl 读取网络上的数据，使用 cheerio 抓取网站上的内容

service读取网络上的数据

'use strict';

const Service = require('egg').Service;

/**
 * Service that performs outbound HTTP requests through `ctx.curl`.
 */
class BaiduService extends Service {
    /**
     * Requests the given URL.
     * @param {string} url - address to request.
     * @returns {Promise<*>} whatever `ctx.curl` resolves with.
     */
    async requestUrl(url){
        const { ctx } = this;
        const response = await ctx.curl(url);
        return response;
    }
}
module.exports=BaiduService;

定时任务

const cheerio=require('cheerio');
module.exports=app=>{
    return{
        schedule: {
            interval: '30m', // 1 分钟间隔
            type: 'all'
        },
        async task(ctx){
              //1、抓取网站内容
              const url = "https://news.baidu.com/";
              let data=await ctx.service.baidu.requestUrl(url);
              let htmlData=data.data.toString();
               //2、解析数据 { decodeEntities: false } 防止中文乱码
               const $=cheerio.load(htmlData,{ decodeEntities: false });
                //通过标题判断是否一致判断是否挂掉
               if(($('title').html())!='百度新闻——海量中文资讯平台'){
                   console.log('网站挂掉或被篡改');
               }else{
                   console.log('正常');
               }
                //获取到了hotnews下面所有的a标签的内容
            let arr=[]
            // cheerio 语法和jQuery一致
            $('.hotnews li').each(function(){
                // console.log($(this).html());
                arr.push($(this).html());
            })
            // 把数据传给app对象中的spider 另一边通过this.app.spider 获取
            app.spider=arr;
        }
    }
}

controller 读取定时任务保存在 app.spider 上的数据，并渲染到 ejs 模板中

  async spider(){
    let data=this.app.spider;
    console.log(data);
    await this.ctx.render('spider',{data})
  }

ejs

 <%# Lists every entry of `data` as an ordered-list item. Entries are written with the unescaped output tag, so they are inserted as raw HTML. NOTE(review): confirm the scraped markup is trusted before rendering it this way. %>
 <ol>
        <%for(let x=0;x<data.length;x++){%>
            <li><%-data[x]%></li>
        <%}%>
    </ol>

eggjs定时任务实现网站监控 爬虫

eggjs定时任务实现网站监控爬虫