下载微信公众号文章里所有的图片(支持多线程)

一般我们保存微信公众号文章里的图片时,都是一张一张地"另存为"。如果图片少,这种方式还可以;如果有十几张、二十多张,这种方式就不合适了。为此我特地写了一段代码,下面贴出来:

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Downloads every image found in a WeChat official-account article.
 *
 * <p>The article page is fetched and parsed with jsoup, every {@code <img>} inside the
 * element with id {@code page-content} is collected, and each image is saved into the
 * download directory as {@code 000.jpg}, {@code 001.png}, ... Images can be fetched
 * sequentially ({@link #spiderImgsDownLoad()}) or on a fixed-size thread pool
 * ({@link #spiderImgsByMultipleThreadsDownLoad()}).
 *
 * <p>Call {@link #shutDown()} once all work has been submitted, otherwise the pool's
 * worker threads keep the JVM alive.
 *
 * @author yuxuan
 */
public class WxArticle implements Runnable {

	/** Timeout for fetching and parsing the article page, in milliseconds. */
	private static final int PAGE_TIMEOUT_MILLIS = 30000;
	/** Timeout for establishing an image connection, in milliseconds. */
	private static final int CONNECT_TIMEOUT_MILLIS = 3 * 1000;
	/** Timeout for a single blocking read on an image stream, in milliseconds. */
	private static final int READ_TIMEOUT_MILLIS = 30 * 1000;
	/** Size of the copy buffer used while downloading an image. */
	private static final int BUFFER_SIZE = 8192;

	// URL of the WeChat article to scrape
	private String baseUrl;
	// Directory the images are written to
	private final String downloadDir;
	// Fixed-size pool with one worker per available processor
	private final ExecutorService service = Executors.newFixedThreadPool(Runtime.getRuntime().availableProcessors());

	/**
	 * Stops accepting new download tasks; tasks already submitted still run to completion.
	 */
	public void shutDown() {
		service.shutdown();
	}

	/**
	 * Creates a downloader that saves into a directory named after the current user,
	 * relative to the working directory.
	 */
	public WxArticle() {
		// "user.name" is a JVM system property, not an environment variable:
		// System.getenv("user.name") returns null and new File(null) would throw.
		this(System.getProperty("user.name"));
	}

	/**
	 * Creates a downloader that saves into the given directory instead of the default one.
	 *
	 * @param downloadDir directory to save images into; created if it does not exist
	 * @throws IllegalArgumentException if {@code downloadDir} is {@code null}
	 */
	public WxArticle(String downloadDir) {
		if (downloadDir == null) {
			throw new IllegalArgumentException("downloadDir must not be null");
		}
		// Remember the override; without this assignment downloads would not use it.
		this.downloadDir = downloadDir;
		mkdir(downloadDir);
	}

	/**
	 * Creates the directory (including missing parents) if it does not exist yet and
	 * prints its absolute path.
	 *
	 * @param downloadDir directory to create
	 */
	private void mkdir(String downloadDir) {
		File file = new File(downloadDir);
		// mkdirs() returns false both on failure and when the directory already exists,
		// so re-check isDirectory() before reporting a problem.
		if (!file.exists() && !file.mkdirs() && !file.isDirectory()) {
			System.err.println("could not create dir : " + file.getAbsolutePath());
		}
		System.out.println("work dir : " + file.getAbsolutePath());
	}

	/**
	 * Sets the URL of the article whose images should be downloaded.
	 *
	 * @param baseUrl article URL
	 */
	public void setBaseUrl(String baseUrl) {
		this.baseUrl = baseUrl;
	}

	/**
	 * Fetches the article page and parses it.
	 *
	 * @return the parsed document, or {@code null} if the URL is malformed or the page
	 *         could not be fetched
	 */
	private Document execute() {
		try {
			return Jsoup.parse(new URL(baseUrl), PAGE_TIMEOUT_MILLIS);
		} catch (IOException e) {
			// MalformedURLException is an IOException, so one handler covers both cases.
			e.printStackTrace();
		}
		return null;
	}

	/**
	 * Downloads all images of the article one after another on the calling thread.
	 */
	public void spiderImgsDownLoad() {
		spiderImgs(false);
	}

	/**
	 * Downloads all images of the article concurrently on the internal thread pool.
	 * Returns as soon as every download has been submitted.
	 */
	public void spiderImgsByMultipleThreadsDownLoad() {
		spiderImgs(true);
	}

	/**
	 * Shared implementation of both download modes: finds every image in the article
	 * body and either downloads it inline or submits it to the thread pool.
	 *
	 * @param multiThreaded {@code true} to submit each download to the pool,
	 *                      {@code false} to download on the calling thread
	 */
	private void spiderImgs(boolean multiThreaded) {
		Document root = execute();
		if (root == null) {
			System.out.println("root is null");
			return;
		}

		Element content = root.getElementById("page-content");
		if (content == null) {
			// The page layout differs from what is expected (or an error page was served).
			System.out.println("page-content is null");
			return;
		}

		int count = 0;
		Elements imgs = content.getElementsByTag("img");
		for (Element img : imgs) {
			// Lazy-loaded images keep the real address in data-src; fall back to it
			// whenever src is missing or empty.
			String src = img.attr("src");
			if (src.length() == 0) {
				src = img.attr("data-src");
			}
			if (src.length() == 0) {
				continue;
			}

			String fileName = getFileNameBySuffix(src, count);
			src = urlAddHttpPrefix(src);
			count++;
			if (multiThreaded) {
				service.submit(new DownLoader(src, downloadDir, fileName));
			} else {
				downLoadFromUrl(src, fileName, downloadDir);
			}
		}
	}

	/**
	 * Turns a protocol-relative URL ({@code //host/path}) into an absolute HTTP URL.
	 *
	 * @param src image address as found in the page
	 * @return {@code src} prefixed with {@code http:} if it started with {@code //},
	 *         otherwise {@code src} unchanged
	 */
	private String urlAddHttpPrefix(String src) {
		if (src.startsWith("//")) {
			src = "http:" + src;
		}
		return src;
	}

	/**
	 * Builds the local file name for an image: a zero-padded sequence number plus an
	 * extension guessed from the URL text.
	 *
	 * @param src   image address; searched case-insensitively for gif / jpg / jpeg / png
	 * @param count sequence number of the image within the article
	 * @return file name such as {@code 007.png}; {@code .jpg} is used when no known
	 *         format appears in the URL
	 */
	private String getFileNameBySuffix(String src, int count) {
		// %03d pads with zeros; the previous %3d padded with spaces and produced
		// file names with leading blanks.
		String fileName = String.format("%03d", count);
		// Locale.ROOT keeps the lower-casing independent of the default locale
		// (e.g. Turkish maps 'I' to a dotless 'ı', which would break the "gif" match).
		String lower = src.toLowerCase(java.util.Locale.ROOT);
		String suffix;
		if (lower.contains("gif")) {
			suffix = ".gif";
		} else if (lower.contains("jpg") || lower.contains("jpeg")) {
			suffix = ".jpg";
		} else if (lower.contains("png")) {
			suffix = ".png";
		} else {
			suffix = ".jpg";
		}
		return fileName.concat(suffix);
	}

	/**
	 * Downloads one image and writes it to {@code savePath/fileName}.
	 *
	 * <p>The image is buffered in memory first, so a failed or interrupted transfer does
	 * not leave a truncated file behind. Failures are reported on standard error and
	 * are not propagated, so one broken image does not abort the remaining downloads.
	 *
	 * @param urlStr   absolute HTTP(S) address of the image
	 * @param fileName name of the file to create
	 * @param savePath directory to write into; created if it does not exist
	 */
	public static void downLoadFromUrl(String urlStr, String fileName, String savePath) {
		HttpURLConnection conn = null;
		try {
			System.out.println(urlStr);
			URL url = new URL(urlStr);
			conn = (HttpURLConnection) url.openConnection();
			conn.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
			// Without a read timeout a stalled server would block the worker forever.
			conn.setReadTimeout(READ_TIMEOUT_MILLIS);
			// Browser-like User-Agent so the server does not reject the crawler with 403
			conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");

			// Read the whole response body into memory
			ByteArrayOutputStream bos = new ByteArrayOutputStream();
			try (InputStream inputStream = conn.getInputStream()) {
				byte[] buffer = new byte[BUFFER_SIZE];
				int len;
				while ((len = inputStream.read(buffer)) != -1) {
					bos.write(buffer, 0, len);
				}
			}

			// Make sure the target directory exists. Several pool threads may race here,
			// so a false return from mkdirs() is only an error if the directory is
			// still missing afterwards.
			File saveDir = new File(savePath);
			if (!saveDir.exists() && !saveDir.mkdirs() && !saveDir.isDirectory()) {
				throw new IOException("could not create dir : " + saveDir.getAbsolutePath());
			}

			File file = new File(saveDir, fileName);
			try (FileOutputStream fos = new FileOutputStream(file)) {
				bos.writeTo(fos);
			}
			System.out.println(fileName + ":" + url + " download success");
		} catch (Exception e) {
			// Task boundary: report and continue. An exception escaping a pooled task
			// would otherwise vanish silently inside the unused Future.
			e.printStackTrace();
			System.err.println(urlStr);
		} finally {
			if (conn != null) {
				conn.disconnect();
			}
		}
	}

	/**
	 * Runs the sequential download, so an instance can be handed to a {@link Thread}.
	 */
	@Override
	public void run() {
		spiderImgsDownLoad();
	}

	/**
	 * Starts a new thread that performs the sequential download in the background.
	 */
	public void startThread() {
		Thread thread = new Thread(this);
		thread.start();
	}

	/**
	 * Pool task that downloads a single image.
	 *
	 * @author yuxuan
	 */
	class DownLoader implements Runnable {

		private final String urlStr;
		private final String savePath;
		private final String fileName;

		/**
		 * @param urlStr   absolute address of the image
		 * @param savePath directory to write into
		 * @param fileName name of the file to create
		 */
		public DownLoader(String urlStr, String savePath, String fileName) {
			this.urlStr = urlStr;
			this.savePath = savePath;
			this.fileName = fileName;
		}

		@Override
		public void run() {
			// Delegate to the shared static download routine
			WxArticle.downLoadFromUrl(urlStr, fileName, savePath);
		}
	}

	/**
	 * Example: downloads all images of one article using the thread pool.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		WxArticle wxArt = new WxArticle();
		wxArt.setBaseUrl("https://mp.weixin.qq.com/s/EZNLwWcm6kzD5pLwSlbpXw");
		// Multi-threaded download; call wxArt.spiderImgsDownLoad() instead for the
		// single-threaded variant.
		wxArt.spiderImgsByMultipleThreadsDownLoad();
		// Already-submitted downloads still finish; afterwards the pool threads exit
		wxArt.shutDown();
	}

}

以上代码用到了Jsoup包,下面是maven的坐标:

<dependency>
	<groupId>org.jsoup</groupId>
	<artifactId>jsoup</artifactId>
	<version>1.12.1</version>
</dependency>

有问题可以在评论区留言,技术问题可以私信我。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值