多线程线程池的基本使用(基于Jsoup的爬虫)

最新推荐文章于 2022-05-20 00:51:52 发布

zhaoyy0513

最新推荐文章于 2022-05-20 00:51:52 发布

阅读量690

点赞数

本文链接：https://blog.csdn.net/zhaoyy0513/article/details/95960358

版权

首先界面为

（1）controller 层代码：获取前台文本框中用户输入的待爬取网址（webPath）和图片保存路径（outPath），先取得总页数，再把每一页的爬取任务提交给线程池执行

/**
 * Handles {@code /mzitu}: reads the target site and the output directory from
 * the request, determines the number of pages, and submits one
 * {@link MyCrawlRunable} per page to a fixed pool of 8 threads.
 *
 * <p>The method returns once every task has been submitted; it does not wait
 * for the submitted tasks to finish.
 *
 * @param request  supplies the {@code outPath} (local image directory) and
 *                 {@code webPath} (site to crawl) parameters
 * @param response unused by this method
 * @return a summary message containing the page count, or {@code "访问失效"}
 *         if submitting the tasks failed or the thread was interrupted
 * @throws Exception declared for compatibility with existing callers
 */
@RequestMapping(value = "/mzitu")
	@ResponseBody
	public String crawMzitu(HttpServletRequest request, HttpServletResponse response) throws Exception {
		String outPath = request.getParameter("outPath"); // local directory where the images are stored
		String webPath = request.getParameter("webPath"); // site to crawl
		int pageCount = getPageCount(webPath); // total number of pages to visit
		ExecutorService fixEd = Executors.newFixedThreadPool(8);
		try {
			for (int j = 1; j <= pageCount; j++) {
				fixEd.execute(new MyCrawlRunable(webPath + "/" + j, outPath));
				// Pause between submissions (presumably to avoid hammering the site).
				Thread.sleep(200);
			}
			if (pageCount > 0) {
				System.out.println("下载完成！共计" + pageCount + "张图片");
			}
		} catch (InterruptedException e) {
			// Restore the interrupt flag so the container can observe the interruption.
			Thread.currentThread().interrupt();
			e.printStackTrace();
			return "访问失效";
		} catch (Exception e) {
			e.printStackTrace();
			return "访问失效";
		} finally {
			// A pool is created per request; without shutdown its worker threads
			// are never released. shutdown() still lets already-submitted tasks run.
			fixEd.shutdown();
		}
		return "存放完成，本次共存取" + pageCount + "张图片";
	}

	/**
	 * Fetches {@code webPath} and reads the page count from its pagination bar.
	 *
	 * <p>For every {@code div.pagenavi} element the text of the second-to-last
	 * {@code span} is parsed as the page count; if several such elements exist
	 * the last one that parses successfully wins.
	 *
	 * @param webPath URL of the page to inspect
	 * @return the parsed page count, or {@code 0} if the page could not be
	 *         fetched or no usable pagination element was found
	 */
	private static int getPageCount(String webPath) {
		int pageCount = 0;
		try {
			Document document = Jsoup.connect(webPath).get();
			Elements pages = document.select("div.pagenavi");
			for (Element element : pages) {
				Elements spans = element.getElementsByTag("span");
				if (spans.size() < 2) {
					// No second-to-last span to read; indexing would throw.
					continue;
				}
				String countText = spans.get(spans.size() - 2).text().trim();
				try {
					pageCount = Integer.parseInt(countText);
				} catch (NumberFormatException e) {
					// Non-numeric span text: keep the previous value instead of failing the request.
					e.printStackTrace();
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		return pageCount;
	}

（2）提交给线程池执行的爬取任务 MyCrawlRunable 的代码如下：

/**
 * Task that fetches one web page and downloads images referenced by its
 * {@code img[src]} elements into a local directory.
 */
public class MyCrawlRunable implements Runnable {
	// URL of the page whose images are downloaded (also sent as the Referer)
	String obtionWebPath;
	// directory the downloaded images are written to
	String outPath;

	/**
	 * @param obtionWebPath URL of the page to scan for images
	 * @param outPath       directory in which downloaded images are stored
	 */
	public MyCrawlRunable(String obtionWebPath, String outPath) {
		this.obtionWebPath = obtionWebPath;
		this.outPath = outPath;
	}

	/**
	 * Fetches the page, walks its {@code img[src]} elements in document order
	 * and downloads each accepted image. Failures are printed, never thrown.
	 */
	@Override
	public void run() {
		try {
			Document document = Jsoup.connect(obtionWebPath).get();
			Elements select = document.select("img[src]");
			for (Element element : select) {
				String picLink = element.attr("src");
				String[] picArr = picLink.split("/");
				if (picArr.length == 0) {
					// A src made only of slashes yields no file name; stop here
					// rather than index out of bounds.
					break;
				}
				String picName = picArr[picArr.length - 1];
				if (picName.indexOf("png") > -1 || picName.length() > 9) {
					// Reject names containing "png" and names longer than 9 characters.
					// NOTE(review): `break` ends the scan at the first rejected image, so
					// later images on the page are never examined; if the intent is only
					// to skip this image, it should be `continue` — confirm.
					break;
				}
				downloadImage(picLink, picName);
			}
		} catch (Exception e) {
			// Previously swallowed silently; report it so failed pages are visible.
			e.printStackTrace();
		}
	}

	/** Downloads one image to {@code outPath/picName}; a failure is printed and does not stop the caller. */
	private void downloadImage(String picLink, String picName) {
		try {
			URL imgUrl = new URL(picLink);
			HttpURLConnection connection = (HttpURLConnection) imgUrl.openConnection();
			applyRequestHeaders(connection);
			// Streams are scoped to this image and closed automatically, in reverse order.
			try (InputStream inputStream = connection.getInputStream();
					BufferedInputStream bis = new BufferedInputStream(inputStream);
					FileOutputStream outputStream = new FileOutputStream(outPath + "/" + picName)) {
				byte[] bs = new byte[1024];
				int size;
				while ((size = bis.read(bs)) != -1) {
					outputStream.write(bs, 0, size);
				}
				outputStream.flush();
				System.out.println(picName + "下载完成，下载路径为:" + outPath + "/" + picName);
			}
		} catch (Exception e) {
			e.printStackTrace();
		}
	}

	/** Copies the request headers produced by {@code resHeaderUtil.getMap} onto the connection. */
	private void applyRequestHeaders(HttpURLConnection connection) {
		Config config = new Config();
		config.setHost("i.meizitu.net");
		config.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0");
		config.setAccept("image/webp,*/*");
		config.setAcceptLanguage("zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2");
		config.setReferer(obtionWebPath);
		config.setConnection("keep-alive");
		Map<String, Object> map = resHeaderUtil.getMap(config);
		for (Entry<String, Object> entry : map.entrySet()) {
			connection.setRequestProperty(entry.getKey(), (String) entry.getValue());
		}
	}

}