node爬虫爬取网站图片

最新推荐文章于 2023-12-03 23:25:06 发布

最帅扫地僧

最新推荐文章于 2023-12-03 23:25:06 发布

阅读量438

点赞数 1

分类专栏： nodeJS 文章标签： nodejs

本文链接：https://blog.csdn.net/weixin_42259266/article/details/104119222

版权

nodeJS 专栏收录该内容

10 篇文章 0 订阅

订阅专栏

// 爬虫
const fs = require('fs')
const http = require('http')
const https = require('https')
const url = require('url')

const cheerio = require('cheerio')

// Page whose <img> elements are crawled.
const site = 'https://www.xxx.com/'
// Folder the downloaded images are written to.
const saveFoldersPath = './images'

// Fetch the page at `site`, collect every <img src>, resolve each one against
// `site`, and hand the resulting absolute URL to downLoad().
https.get(site, res => {
  let err = null
  if (res.statusCode !== 200) {
    err = new Error('请求失败')
  } else if (!/^text\/html/.test(res.headers['content-type'])) {
    // The response is not an HTML page, so there is nothing to parse.
    err = new Error('请求格式不是网页')
  }

  if (err) {
    // Report the failure instead of returning silently.
    console.error(err.message)
    res.resume() // drain the response so the socket can be released
    return
  }

  // Decode as UTF-8 at the stream level so a multi-byte character that is
  // split across two chunks is not corrupted by string concatenation.
  res.setEncoding('utf8')
  let str = ''
  // The body arrives in chunks.
  res.on('data', chunk => {
    str += chunk
  })
  // The whole body has been received.
  res.on('end', () => {
    // Create the target folder first and start downloading only once it is
    // known to exist; EEXIST just means it was created by an earlier run.
    fs.mkdir(saveFoldersPath, mkdirErr => {
      if (mkdirErr && mkdirErr.code !== 'EEXIST') {
        console.error(mkdirErr)
        return
      }

      const $ = cheerio.load(str)
      $('img').each((index, el) => {
        const rawSrc = $(el).attr('src')
        // <img> without a src (e.g. lazy-loaded via data-src) has nothing to fetch.
        if (!rawSrc) {
          return
        }

        // Resolve protocol-relative ("//host/a.png"), root-relative ("/a.png")
        // and relative ("a.png") sources against the page URL.
        let imgUrl
        try {
          imgUrl = new url.URL(rawSrc, site)
        } catch (e) {
          console.error(e)
          return
        }

        // Skip data:, blob:, javascript: and similar non-HTTP sources.
        if (imgUrl.protocol !== 'http:' && imgUrl.protocol !== 'https:') {
          return
        }

        // One bad image must not abort the remaining downloads.
        try {
          downLoad(imgUrl.href)
        } catch (e) {
          console.error(e)
        }
      })
    })
  })
}).on('error', (e) => {
  console.error(e);
});

/**
 * Download one image and store it in `saveFoldersPath` under the file name
 * taken from the URL. Only .jpg, .png and .gif files (any letter case) are
 * fetched; everything else is skipped before a request is made.
 *
 * Images that share a file name overwrite each other.
 *
 * @param {string} imgUrl - Absolute http:// or https:// URL of the image.
 */
function downLoad(imgUrl) {
  const allowedExtensions = ['jpg', 'png', 'gif']

  // Decide from the file name alone, so unwanted files are never downloaded.
  // The extension is the LAST dot-separated part ("a.min.jpg" -> "jpg").
  const files = parseFile(imgUrl)
  const extension = files.length > 1 ? files[files.length - 1].toLowerCase() : ''
  if (!allowedExtensions.includes(extension)) {
    return
  }

  // https.get() rejects http:// URLs, so pick the module matching the scheme.
  const client = imgUrl.startsWith('http://') ? http : https

  client.get(imgUrl, (res) => {
    if (res.statusCode !== 200) {
      // Do not save error pages or redirect bodies as image files.
      console.log('dowload fail', new Error(`unexpected status ${res.statusCode} for ${imgUrl}`))
      res.resume() // drain the response so the socket can be released
      return
    }

    // Collect raw Buffers; no text decoding is applied, so the bytes written
    // to disk are exactly the bytes received.
    const chunks = []
    res.on('data', (chunk) => {
      chunks.push(chunk)
    })
    res.on('end', () => {
      fs.writeFile(`${saveFoldersPath}/${files.join('.')}`, Buffer.concat(chunks), (err) => {
        if (err) {
          console.log('dowload fail', err)
        } else {
          console.log('download success')
        }
      })
    })
  }).on('error', (err) => {
    // Without this listener a DNS or connection failure crashes the process.
    console.log('dowload fail', err)
  })
}

/**
 * Split the file name at the end of a URL into its dot-separated parts.
 *
 * The query string and fragment are removed first, so
 * "https://host/a/pic.jpg?v=1" yields ["pic", "jpg"].
 *
 * @param {string} imgUrl - URL (or path) of the image.
 * @returns {string[]} The parts of the last path segment, split on ".".
 */
function parseFile(imgUrl) {
  // Cut at the first "?" or "#": a "/" or "." inside the query string must
  // not be mistaken for part of the path or the extension.
  const pathOnly = imgUrl.split(/[?#]/)[0]
  const segments = pathOnly.split('/')
  const fileName = segments[segments.length - 1]
  return fileName.split('.')
}

最帅扫地僧

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
node爬虫爬取网站图片

// 爬虫const https = require('https')const fs = require('fs')const cheerio = require('cheerio')const url = require('url')const site = 'https://www.xxx.com/'const saveFoldersPath = './images'htt...
复制链接

扫一扫