NodeJS爬虫初探

最新推荐文章于 2024-07-26 08:57:17 发布

pengyuan.01

最新推荐文章于 2024-07-26 08:57:17 发布

阅读量552

点赞数

文章标签： nodejs爬虫 cheerio nodejs文件读写

本文链接：https://blog.csdn.net/cvchihzhza/article/details/80554142

版权

思路：

获取HTML字符串，将其转化为DOM，提取相应的文本信息

用到了以下模块和库：
var https = require('https');  //这里使用的是https协议,可视具体情况换为http协议
var fs = require("fs")
var cheerio = require('cheerio')

1.	获取HTML
/**
 * Issues an HTTPS request for `URL` and hands the full response body to
 * `callback` once the response stream has ended.
 *
 * The body is delivered regardless of the HTTP status code; the status is only
 * logged.
 *
 * @param {string|URL|object} URL - Target passed straight to `https.request`.
 * @param {function(string): void} callback - Receives the accumulated body.
 * @param {function(Error): void} [onError] - Optional. Called once when the
 *   request or the response stream reports an error. When omitted, failures
 *   are only logged and `callback` is never invoked.
 */
function getHTML(URL, callback, onError) {
  const chunks = [];
  let failureReported = false;

  // Request and response can both surface the same underlying socket failure,
  // so report only the first one.
  const reportFailure = (e) => {
    if (failureReported) {
      return;
    }
    failureReported = true;
    console.error(`1 request's problem: ${e.message}`);
    if (typeof onError === 'function') {
      onError(e);
    }
  };

  const req = https.request(URL, (res) => {
    console.log(`STATUS: ${res.statusCode}`);
    // Decode to strings so multi-byte characters split across chunks stay intact.
    res.setEncoding('utf8');

    res.on('data', (chunk) => {
      chunks.push(chunk);
    });
    res.on('error', reportFailure);
    res.on('end', () => {
      console.log('No more data in response.');
      callback(chunks.join(''));
    });
  });

  req.on('error', reportFailure);
  // No request body is written; end() just sends the request.
  req.end();
}
2.	也可先将原始的HTML存到本地再用cheerio解析
读文件和写文件各有两种方式：异步和同步
/**
 * Reads a file and passes its contents to `callback`.
 *
 * No encoding is requested from `fs`, so the callback receives the raw data
 * (a Buffer).
 *
 * @param {string} path - Location of the file to read.
 * @param {function(Buffer): void} callback - Receives the file contents.
 * @param {string} [type=""] - `""` reads asynchronously; `"sync"` reads
 *   synchronously, so the callback runs before this function returns. Any
 *   other value only logs a usage hint and the callback is never invoked.
 */
function readFile(path, callback, type = "") {
  if (type === "sync") {
    callback(fs.readFileSync(path));
    return;
  }

  if (type !== "") {
    console.log("请指定type类型.异步为'',同步为'sync'");
    return;
  }

  fs.readFile(path, (err, data) => {
    // The callback has no error channel, so a failed read is rethrown as-is.
    if (err) throw err;
    callback(data);
  });
}

/**
 * Writes `data` to `fileName`, either asynchronously or synchronously.
 *
 * @param {string} fileName - Destination file.
 * @param {string|Buffer} data - Content handed to `fs` unchanged.
 * @param {string} [type=""] - `""` (the default) writes asynchronously;
 *   `"sync"` writes synchronously. Any other value only logs a usage hint and
 *   nothing is written.
 */
function writeFile(fileName, data, type = "") {
  if (type === "sync") {
    fs.writeFileSync(fileName, data);
    return;
  }

  if (type !== "") {
    console.log("请指定type类型.异步为'',同步为'sync'");
    return;
  }

  fs.writeFile(fileName, data, (err) => {
    // There is no completion callback to report to, so failures are rethrown.
    if (err) throw err;
  });
}
3.	使用cheerio.js（Node.js中的jQuery）解析HTML
/**
 * Parses an HTML string with cheerio and visits every `<a>` element.
 *
 * This is a skeleton: each anchor is wrapped in a cheerio object and the
 * per-link work is left as a placeholder. Nothing is returned.
 *
 * @param {string} HTML - Markup to load into cheerio.
 */
function HTML2DOM(HTML) {
  const $ = cheerio.load(HTML);

  // A regular function is required here: cheerio binds `this` to the current
  // element for each iteration.
  $("a").each(function () {
    const link = $(this);
    // do something here
  });
}