下载微信公众号文章里所有的图片（支持多线程）

最新推荐文章于 2024-09-15 23:39:19 发布

坐看云起时_雨宣

最新推荐文章于 2024-09-15 23:39:19 发布

阅读量910

点赞数 1

分类专栏：爬虫系列　文章标签：Jsoup、下载微信公众号图片

本文链接：https://blog.csdn.net/qq_24434671/article/details/103962128

版权

爬虫系列专栏收录该内容

2 篇文章 0 订阅

订阅专栏

一般我们保存微信公众号文章里的图片时，都是一张一张地“另存为”。如果图片少，这种方式还可以；如果有十几张、二十多张，这种方式就不合适了。为此特地写了一段代码，下面贴出来：

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.Locale;
import java.util.Objects;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Downloads every image referenced inside a WeChat official-account article.
 *
 * <p>The article page is fetched and parsed with jsoup, all {@code <img>} tags under the
 * element with id {@code page-content} are collected, and each image is saved into the
 * download directory as {@code 000.jpg}, {@code 001.png}, ... in document order. Downloads
 * can run sequentially ({@link #spiderImgsDownLoad()}) or on a fixed-size thread pool
 * ({@link #spiderImgsByMultipleThreadsDownLoad()}).
 *
 * @author yuxuan
 */
public class WxArticle implements Runnable {

	// URL of the WeChat official-account article
	private String baseUrl;
	// Default download directory: a folder named after the current OS user, relative to the
	// working directory. "user.name" is a JVM system property, not an environment variable,
	// so it must be read with System.getProperty (System.getenv normally yields null here).
	private String downloadDir = System.getProperty("user.name");
	// Fixed-size thread pool, sized by the number of available processors
	private ExecutorService service =  Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());

	/**
	 * Shuts the thread pool down. Tasks that were already submitted still run to completion;
	 * new submissions are rejected afterwards.
	 */
	public void shutDown() {
		service.shutdown();
	}
	
	/**
	 * Creates a downloader that saves into the default download directory.
	 */
	public WxArticle() {
		mkdir(downloadDir);
	}
	
	/**
	 * Creates a downloader that saves into the given directory instead of the default one.
	 *
	 * @param downloadDir directory the images are written to; must not be {@code null}
	 * @throws NullPointerException if {@code downloadDir} is {@code null}
	 */
	public WxArticle(String downloadDir) {
		// Remember the override; without this assignment downloads would still go to the default.
		this.downloadDir = Objects.requireNonNull(downloadDir, "downloadDir");
		mkdir(downloadDir);
	}
	
	/**
	 * Creates the directory (including missing parents) if it does not exist yet and prints
	 * its absolute path.
	 *
	 * @param downloadDir directory to create
	 */
	private void mkdir(String downloadDir) {
		File file = new File(downloadDir);
		// Only create the folder when it is missing; report a failed creation instead of ignoring it
		if (!file.exists() && !file.mkdirs() && !file.isDirectory()) {
			System.err.println("could not create dir : " + file.getAbsolutePath());
		}
		System.out.println("work dir : " + file.getAbsolutePath());
	}
	
	/**
	 * Sets the URL of the article whose images should be downloaded.
	 *
	 * @param baseUrl article URL
	 */
	public void setBaseUrl(String baseUrl) {
		this.baseUrl = baseUrl;
	}

	/**
	 * Fetches and parses the article page with a 30 second timeout.
	 * 
	 * @return the parsed document, or {@code null} if the URL is malformed or the fetch failed
	 */
	private Document execute() {
		try {
			return Jsoup.parse(new URL(baseUrl), 30000);
		} catch (MalformedURLException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		}
		return null;
	}

	/**
	 * Downloads all images of the article one after another on the calling thread.
	 */
	public void spiderImgsDownLoad() {
		downloadImages(false);
	}

	/**
	 * Downloads all images of the article concurrently on the internal thread pool.
	 * Call {@link #shutDown()} afterwards so the pool threads can terminate.
	 */
	public void spiderImgsByMultipleThreadsDownLoad() {
		downloadImages(true);
	}

	/**
	 * Shared implementation of both download modes: parses the page, walks the images and
	 * either downloads each one directly or submits it to the thread pool.
	 *
	 * @param parallel {@code true} to submit downloads to the pool, {@code false} to download inline
	 */
	private void downloadImages(boolean parallel) {
		Document root = execute();
		if (root == null) {
			System.out.println("root is null");
			return;
		}

		Element content = root.getElementById("page-content");
		if (content == null) {
			// getElementById returns null when the id is absent; bail out instead of throwing an NPE
			System.out.println("element with id page-content not found");
			return;
		}

		int count = 0;
		Elements imgs = content.getElementsByTag("img");
		for (Element img : imgs) {
			String src = resolveImageSrc(img);
			if (src.isEmpty()) {
				continue;
			}
			String fileName = getFileNameBySuffix(src, count);
			count++;
			if (parallel) {
				service.submit(new DownLoader(src, downloadDir, fileName));
			} else {
				downLoadFromUrl(src, fileName, downloadDir);
			}
		}
	}

	/**
	 * Picks the image URL from an {@code <img>} element.
	 *
	 * <p>{@code src} is preferred; when it is missing, blank or an inline {@code data:}
	 * placeholder, {@code data-src} is used instead.
	 *
	 * @param img the image element
	 * @return the image URL with a scheme added to protocol-relative URLs, or an empty string
	 *         if the element carries no usable URL
	 */
	private String resolveImageSrc(Element img) {
		String src = img.attr("src").trim();
		if (src.isEmpty() || src.startsWith("data:")) {
			src = img.attr("data-src").trim();
		}
		return urlAddHttpPrefix(src);
	}

	/**
	 * Adds the {@code http:} scheme to protocol-relative URLs (those starting with {@code //}).
	 *
	 * @param src image URL as found in the page
	 * @return the URL with a scheme, or {@code src} unchanged if it was not protocol-relative
	 */
	private String urlAddHttpPrefix(String src) {
		if (src.startsWith("//")) {
			src = "http:" + src;
		}
		return src;
	}
	
	/**
	 * Builds the target file name: the zero-padded running index plus an extension guessed
	 * from the URL text ({@code gif}, {@code jpg}/{@code jpeg}, {@code png}; {@code .jpg} otherwise).
	 *
	 * @param src   image URL used to guess the extension
	 * @param count zero-based index of the image within the article
	 * @return a file name such as {@code 007.png}
	 */
	private String getFileNameBySuffix(String src,int count) {
		// %03d zero-pads (000, 001, ...); %3d would pad with spaces and yield names like "  0.jpg"
		String index = String.format("%03d", count);
		// Locale.ROOT keeps the lower-casing independent of the default locale (e.g. Turkish dotless i)
		String lower = src.toLowerCase(Locale.ROOT);
		String suffix;
		if (lower.contains("gif")) {
			suffix = ".gif";
		} else if (lower.contains("jpg") || lower.contains("jpeg")) {
			suffix = ".jpg";
		} else if (lower.contains("png")) {
			suffix = ".png";
		} else {
			suffix = ".jpg";
		}
		return index.concat(suffix);
	}
	
	/**
	 * Downloads one image and writes it to {@code savePath/fileName}.
	 *
	 * <p>The response is buffered in memory first and only written once it was read
	 * completely, so a failed transfer does not leave a truncated file behind. Any failure is
	 * reported on standard error and otherwise ignored so that one bad image does not stop
	 * the remaining downloads.
	 *
	 * @param urlStr   absolute image URL
	 * @param fileName name of the file to create
	 * @param savePath directory to save into; created if missing
	 */
	public static void downLoadFromUrl(String urlStr, String fileName, String savePath) {
		HttpURLConnection conn = null;
		try {
			System.out.println(urlStr);
			URL url = new URL(urlStr);
			conn = (HttpURLConnection) url.openConnection();
			// Connect timeout of 3 seconds
			conn.setConnectTimeout(3 * 1000);
			// Read timeout so a stalled server cannot block a worker thread forever
			conn.setReadTimeout(30 * 1000);
			// Browser-like User-Agent to avoid 403 responses for non-browser clients
			conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");

			// Read the whole response into a byte array; try-with-resources closes the
			// streams even when reading fails half-way
			byte[] data;
			try (InputStream inputStream = conn.getInputStream();
					ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
				byte[] buffer = new byte[8192];
				int len;
				while ((len = inputStream.read(buffer)) != -1) {
					bos.write(buffer, 0, len);
				}
				data = bos.toByteArray();
			}

			// Make sure the target directory exists; the isDirectory re-check covers another
			// thread creating it at the same time
			File saveDir = new File(savePath);
			if (!saveDir.exists() && !saveDir.mkdirs() && !saveDir.isDirectory()) {
				throw new IOException("could not create directory " + saveDir.getAbsolutePath());
			}
			File file = new File(saveDir, fileName);
			try (FileOutputStream fos = new FileOutputStream(file)) {
				fos.write(data);
			}
			System.out.println(fileName+ ":" + url + " download success");
		} catch (Exception e) {
			e.printStackTrace();
			System.err.println(urlStr);
		} finally {
			if (conn != null) {
				conn.disconnect();
			}
		}
	}


	/**
	 * Runs the single-threaded download; lets an instance be used as a {@link Runnable}.
	 */
	@Override
	public void run() {
		spiderImgsDownLoad();
	}

	/**
	 * Starts the single-threaded download on a new background thread.
	 */
	public void startThread() {
		Thread thread = new Thread(this);
		thread.start();
	}

	/**
	 * Pool task that downloads a single image.
	 *
	 * @author yuxuan
	 */
	class DownLoader implements Runnable {

		private final String urlStr;
		private final String savePath;
		private final String fileName;

		/**
		 * @param urlStr   absolute image URL
		 * @param savePath directory to save into
		 * @param fileName name of the file to create
		 */
		public DownLoader(String urlStr, String savePath, String fileName) {
			this.urlStr = urlStr;
			this.savePath = savePath;
			this.fileName = fileName;
		}

		@Override
		public void run() {
			// Delegate to the static download method of the enclosing class
			WxArticle.downLoadFromUrl(urlStr, fileName, savePath);
		}
	}

	/**
	 * Example entry point: downloads the images of a sample article with the thread pool.
	 * Use {@code wxArt.spiderImgsDownLoad()} instead for the single-threaded variant.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		WxArticle wxArt = new WxArticle();
		wxArt.setBaseUrl("https://mp.weixin.qq.com/s/EZNLwWcm6kzD5pLwSlbpXw");
		// Multi-threaded download
		wxArt.spiderImgsByMultipleThreadsDownLoad();
		// Stop accepting tasks; already submitted downloads still finish
		wxArt.shutDown(); 
	}

}

以上代码用到了Jsoup包，下面是maven的坐标：

<dependency>
	<groupId>org.jsoup</groupId>
	<artifactId>jsoup</artifactId>
	<version>1.12.1</version>
</dependency>

有问题可以在评论区留言，技术问题可以私信我。

坐看云起时_雨宣

关注

1
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫

专栏目录