node获取网页指定内容（标题、摘要、图片）

最新推荐文章于 2023-08-04 10:47:53 发布

darling HQ

最新推荐文章于 2023-08-04 10:47:53 发布

阅读量845

点赞数

分类专栏：问题分享文章标签： node.js 后端

本文链接：https://blog.csdn.net/qq_55761697/article/details/129042192

版权

问题分享专栏收录该内容

1 篇文章 0 订阅

订阅专栏

首先需要安装 Node.js，然后引入以下模块（这些是必备的）：

fs模块写入文件
path模块定义文件路径
request模块定义发送请求（requests可能要好点，各位如果需要改的自行百度）
cheerio模块定义内容加载成DOM（个人理解）
（如果不需要把内容保存成文件，可以不引入 fs、path 模块）

这里我是需要网页中meta标签中的值

代码中有指定的名称，根据名称获取指定标签值

let fs = require('fs') // 读写文件
let path = require('path') // 定义文件路径
var request = require('request');
var cheerio = require('cheerio');
// Request settings: the article URL to fetch, sent with a desktop-browser
// User-Agent header.
const options = {
    url: 'https://mp.weixin.qq.com/s/P8q3CjZdH-GCtB2VHVy_qg',
    headers: {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    },
};

// Fetch the page described by `options` and pass the response body to getdom.
request(options, function(error, response, body) {
    // Rethrow the original error as-is: wrapping it in `new Error(error)`
    // stringified it and discarded its stack trace and any extra properties.
    if (error) throw error;
    getdom(body);
});

/**
 * Extract the description, title and cover image from a page's meta tags,
 * log them, and save the raw HTML to data.html next to this script.
 *
 * @param {string} html - Raw HTML markup of the fetched page.
 * @returns {{desc: (string|undefined), title: (string|undefined), img: (string|undefined)}}
 *     The `content` attribute of each matched tag; a field is undefined when
 *     no matching tag (or no `content` attribute) is found.
 */
function getdom(html) {
    var $ = cheerio.load(html);
    // Attribute values are quoted: "og:title" contains a colon, which is not
    // valid in an unquoted CSS identifier and is rejected by stricter parsers.
    var obj = {
        'desc': $('[name="description"]').attr('content'),
        'title': $('[property="og:title"]').attr('content'),
        'img': $('[property="og:image"]').attr('content')
    };
    console.log(obj);
    // Keep a copy of the raw page (the HTML itself, not the extracted fields).
    fs.writeFile(path.resolve(__dirname, 'data.html'), html, function(err) {
        if (err) {
            // Report the failure instead of announcing success regardless.
            console.error(err);
            return;
        }
        console.log("保存成功");
    });
    return obj;
}