多线程线程池的基本使用(基于Jsoup的爬虫)

首先界面为

(1) controller层代码:获取前台文本框中用户输入的待爬取页面地址和图片保存路径,然后把每一页的爬取任务提交给线程池执行

/**
 * Submits one crawl task per gallery page to a fixed pool of 8 threads.
 *
 * <p>Reads the request parameters {@code outPath} (local directory the images are
 * written to) and {@code webPath} (base URL of the gallery), asks
 * {@link #getPageCount(String)} how many pages there are, and queues a
 * {@link MyCrawlRunable} for {@code webPath + "/" + j} for every page {@code j}.
 *
 * <p>The method returns once all tasks have been <em>submitted</em>; it does not wait
 * for the downloads to finish, so images may still be arriving after the response
 * has been sent.
 *
 * <p>NOTE(review): both parameters come straight from the client. {@code webPath}
 * makes the server fetch an arbitrary URL and {@code outPath} makes it write to an
 * arbitrary directory; restrict both before exposing this endpoint.
 *
 * @param request  carries the {@code outPath} and {@code webPath} parameters
 * @param response unused
 * @return a summary message, or {@code "访问失效"} when a parameter is missing or
 *         task submission fails
 * @throws Exception declared for interface compatibility
 */
@RequestMapping(value = "/mzitu")
	@ResponseBody
	public String crawMzitu(HttpServletRequest request, HttpServletResponse response) throws Exception {
		String outPath = request.getParameter("outPath"); // local directory for the images
		String webPath = request.getParameter("webPath"); // site to crawl
		// Reject missing input up front instead of letting a null reach Jsoup.
		if (outPath == null || outPath.trim().isEmpty() || webPath == null || webPath.trim().isEmpty()) {
			return "访问失效";
		}
		int pageCount = getPageCount(webPath); // total number of pages in the gallery
		ExecutorService fixEd = Executors.newFixedThreadPool(8);
		try {
			for (int j = 1; j <= pageCount; j++) {
				fixEd.execute(new MyCrawlRunable(webPath + "/" + j, outPath));
				// Space the submissions 200 ms apart.
				Thread.sleep(200);
			}
			if (pageCount > 0) {
				System.out.println("下载完成!共计" + pageCount + "张图片");
			}
		} catch (InterruptedException e) {
			// Restore the interrupt flag so the container can see the interruption.
			Thread.currentThread().interrupt();
			e.printStackTrace();
			return "访问失效";
		} catch (Exception e) {
			e.printStackTrace();
			return "访问失效";
		} finally {
			// A pool is created per request; without shutdown() its 8 threads would
			// never terminate. Already-queued tasks still run to completion.
			fixEd.shutdown();
		}
		return "存放完成,本次共存取" + pageCount + "张图片";
	}

	/**
	 * Reads the total page count from the pager of the given page.
	 *
	 * <p>Fetches {@code webPath}, selects every {@code div.pagenavi} element and parses
	 * the text of its second-to-last {@code <span>} as an integer. If several pagers
	 * are present, the last one that parses successfully wins.
	 *
	 * @param webPath URL of the page to inspect
	 * @return the parsed page count, or {@code 0} when the page cannot be fetched, has
	 *         no pager, or the pager text is not a number
	 */
	private static int getPageCount(String webPath) {
		int pageCount = 0;
		try {
			Document document = Jsoup.connect(webPath).get();
			for (Element pager : document.select("div.pagenavi")) {
				Elements spans = pager.getElementsByTag("span");
				// The count lives in the second-to-last span; skip pagers that are too short.
				if (spans.size() < 2) {
					continue;
				}
				String countText = spans.get(spans.size() - 2).text().trim();
				try {
					pageCount = Integer.parseInt(countText);
				} catch (NumberFormatException e) {
					// Unexpected pager layout: keep the previous value rather than crash the request.
					e.printStackTrace();
				}
			}
		} catch (IOException e) {
			e.printStackTrace();
		}
		return pageCount;
	}

(2) 对应的MyCrawlRunable代码如下:

/**
 * Task that downloads the images referenced by a single web page.
 *
 * <p>The page at {@code obtionWebPath} is fetched with Jsoup, every {@code img[src]}
 * element is inspected, and each accepted image is saved as
 * {@code outPath + "/" + <last path segment of the image URL>}.
 */
public class MyCrawlRunable implements Runnable {
	/** Size in bytes of the buffer used when copying an image to disk. */
	private static final int BUFFER_SIZE = 1024;

	// URL of the page whose images are downloaded
	String obtionWebPath;
	// Directory the images are saved to
	String outPath;

	/**
	 * @param obtionWebPath URL of the page to scan for images
	 * @param outPath       directory the downloaded images are written to
	 */
	public MyCrawlRunable(String obtionWebPath, String outPath) {
		this.obtionWebPath = obtionWebPath;
		this.outPath = outPath;
	}

	/**
	 * Fetches the page and downloads every accepted image. A failure on one image is
	 * reported and does not stop the remaining images; a failure to fetch or parse
	 * the page itself is reported and ends the task.
	 */
	@Override
	public void run() {
		try {
			Document document = Jsoup.connect(obtionWebPath).get();
			Elements images = document.select("img[src]");
			for (Element image : images) {
				String picLink = image.attr("src");
				String[] picArr = picLink.split("/");
				// A src made only of slashes splits into an empty array.
				if (picArr.length == 0) {
					continue;
				}
				String picName = picArr[picArr.length - 1];
				if (picName.indexOf("png") > -1 || picName.length() > 9) {
					// Skip PNGs and files whose name is longer than 9 characters
					// (5 characters plus a 4-character extension such as ".jpg").
					// Skipping, rather than leaving the loop, keeps later images on the page.
					continue;
				}
				try {
					downloadImage(picLink, picName);
				} catch (Exception e) {
					// Isolate the failure to this image and carry on with the next one.
					e.printStackTrace();
				}
			}
		} catch (Exception e) {
			// The page could not be fetched or parsed; report it instead of failing silently.
			System.err.println("Failed to crawl " + obtionWebPath);
			e.printStackTrace();
		}

	}

	/**
	 * Downloads one image and writes it to {@code outPath + "/" + picName}.
	 *
	 * <p>The request carries the headers produced by {@code resHeaderUtil.getMap} from
	 * a {@code Config} whose referer is the crawled page; presumably this is what the
	 * image host needs to accept the request (confirm against the host's behaviour).
	 *
	 * @param picLink URL of the image
	 * @param picName file name to store the image under
	 * @throws Exception if the URL is invalid, the connection fails, or the file
	 *                   cannot be written
	 */
	private void downloadImage(String picLink, String picName) throws Exception {
		URL imgUrl = new URL(picLink);
		HttpURLConnection connection = (HttpURLConnection) imgUrl.openConnection();
		Config config = new Config();
		config.setHost("i.meizitu.net");
		config.setUserAgent("Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0");
		config.setAccept("image/webp,*/*");
		config.setAcceptLanguage("zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2");
		config.setReferer(obtionWebPath);
		config.setConnection("keep-alive");
		Map<String, Object> headers = resHeaderUtil.getMap(config);
		for (Entry<String, Object> header : headers.entrySet()) {
			connection.setRequestProperty(header.getKey(), (String) header.getValue());
		}

		String target = outPath + "/" + picName;
		// try-with-resources closes both streams even when the copy fails part-way.
		try (InputStream in = new BufferedInputStream(connection.getInputStream());
				FileOutputStream out = new FileOutputStream(target)) {
			byte[] buffer = new byte[BUFFER_SIZE];
			int size;
			while ((size = in.read(buffer)) != -1) {
				out.write(buffer, 0, size);
			}
			out.flush();
		}
		System.out.println(picName + "下载完成,下载路径为:" + target);
	}

}

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值