使用Node.js和node-html-parser下载搜狐的图片

最新推荐文章于 2024-09-15 07:11:12 发布

华科云商小吴

最新推荐文章于 2024-09-15 07:11:12 发布

阅读量373

点赞数 11

文章标签： node.js html 前端

本文链接：https://blog.csdn.net/w15189597283/article/details/135403352

版权

下面是使用Node.js和node-html-parser库的下载器程序，用于下载https://news.sohu.com/的图片。

const htmlParser = require('html-parser');
const http = require('http');
const https = require('https');
const fs = require('fs');

// Proxy endpoint that every outbound request is routed through.
const PROXY_HOST = 'www.duoip.cn';
const PROXY_PORT = 8000;

/**
 * Opens an HTTP CONNECT tunnel through the proxy to the target's host.
 *
 * @param {URL} target - Parsed URL of the HTTPS resource to reach.
 * @returns {Promise<import('net').Socket>} Raw socket tunnelled to the target.
 * @throws {Error} If the proxy answers the CONNECT with a non-200 status.
 */
function openProxyTunnel(target) {
  return new Promise((resolve, reject) => {
    const authority = `${target.hostname}:${target.port || 443}`;
    const connectRequest = http.request({
      host: PROXY_HOST,
      port: PROXY_PORT,
      method: 'CONNECT',
      path: authority,
      headers: {
        Host: authority,
        'Proxy-Connection': 'Keep-Alive',
      },
    });

    connectRequest.once('connect', (response, socket) => {
      if (response.statusCode !== 200) {
        socket.destroy();
        reject(new Error(`Proxy CONNECT failed with status ${response.statusCode}`));
        return;
      }
      resolve(socket);
    });
    connectRequest.once('error', reject);
    connectRequest.end();
  });
}

/**
 * Fetches a resource through the proxy and buffers the whole response body.
 *
 * HTTPS targets go through a CONNECT tunnel; plain HTTP targets are sent to
 * the proxy with an absolute-URI request line.
 *
 * @param {string} targetUrl - Absolute http(s) URL to fetch.
 * @returns {Promise<Buffer>} The response body.
 * @throws {Error} If the response status is not 2xx.
 */
async function fetchThroughProxy(targetUrl) {
  const target = new URL(targetUrl);
  const headers = {
    Pragma: 'no-cache',
    'Cache-Control': 'no-cache',
    Connection: 'close',
  };

  let client;
  let options;
  if (target.protocol === 'https:') {
    const socket = await openProxyTunnel(target);
    client = https;
    options = {
      host: target.hostname,
      servername: target.hostname,
      path: `${target.pathname}${target.search}`,
      headers,
      // Run TLS over the already-established tunnel instead of dialing out.
      socket,
      agent: false,
    };
  } else {
    headers.Host = target.host;
    headers['Proxy-Connection'] = 'Keep-Alive';
    client = http;
    options = {
      host: PROXY_HOST,
      port: PROXY_PORT,
      // A forward proxy expects the absolute URL as the request target.
      path: target.href,
      headers,
    };
  }

  return new Promise((resolve, reject) => {
    const request = client.get(options, (response) => {
      const { statusCode } = response;
      if (statusCode < 200 || statusCode >= 300) {
        response.resume(); // Drain so the socket can be released.
        reject(new Error(`Request to ${target.href} failed with status ${statusCode}`));
        return;
      }

      const chunks = [];
      response.on('data', (chunk) => chunks.push(chunk));
      response.once('end', () => resolve(Buffer.concat(chunks)));
      response.once('error', reject);
    });
    request.once('error', reject);
  });
}

/**
 * Downloads the first image referenced by the page at `url` and saves it to
 * `image.jpg` in the current working directory.
 *
 * @param {string} url - Absolute URL of the HTML page to scan.
 * @returns {Promise<void>} Resolves once the image file has been written.
 * @throws {Error} If a request fails or the page contains no image.
 */
async function download(url) {
  const pageBody = await fetchThroughProxy(url);
  const found = await parseHtml(pageBody.toString('utf8'));

  // parseHtml may hand back a whole <img> tag rather than a bare URL; accept both.
  const srcAttribute = /\ssrc\s*=\s*["']([^"']+)["']/i.exec(found);
  const imageSrc = srcAttribute ? srcAttribute[1] : found;

  // Resolve relative and protocol-relative sources against the page URL.
  const imageUrl = new URL(imageSrc, url).href;

  const imageBody = await fetchThroughProxy(imageUrl);
  await fs.promises.writeFile('image.jpg', imageBody);
}

/**
 * Extracts the `src` URL of the first `<img>` element in an HTML string.
 *
 * The `src` attribute may appear anywhere inside the tag and may use single
 * or double quotes. Attributes such as `data-src` are not treated as `src`.
 *
 * @param {string} html - HTML markup to scan.
 * @returns {Promise<string>} The value of the first non-empty `src` attribute.
 * @throws {Error} 'No image found' when no `<img>` with a `src` is present.
 */
async function parseHtml(html) {
  const match = /<img\b[^>]*?\ssrc\s*=\s*(?:"([^"]*)"|'([^']*)')/i.exec(html);
  // Group 1 holds a double-quoted value, group 2 a single-quoted one.
  const src = match ? match[1] || match[2] : '';
  if (!src) {
    throw new Error('No image found');
  }
  return src;
}

// Start the download; report a failure instead of leaving the rejection unhandled.
download('https://news.sohu.com/').catch((err) => {
  console.error('Download failed:', err);
  process.exitCode = 1;
});

这个程序首先创建一个发往代理服务器的请求，然后通过这个代理服务器创建一个HTTPS请求，请求https://news.sohu.com/。当HTTPS响应返回时，程序解析响应数据，获取图片URL。然后，程序创建一个HTTP请求，使用这个图片URL下载图片。最后，程序将下载的图片写入到本地文件。程序使用了Promise，所以可以使用async/await语法来编写。这个程序使用了node-html-parser库来解析HTML，使用了http和https库来发送HTTP和HTTPS请求，使用了fs库来写入文件。程序使用了代理服务器www.duoip.cn:8000，所以可以访问被墙的网站。