Java 深度图片爬虫

最新推荐文章于 2022-10-19 19:57:14 发布

chouminxi6079

最新推荐文章于 2022-10-19 19:57:14 发布

阅读量155

点赞数

文章标签：爬虫 java

原文链接：https://my.oschina.net/qfwy/blog/751946

版权

PicsDownload 主类程序：

 /**
  * Command-line entry point of the image crawler.
  *
  * <p>Reads a start URL from standard input, fetches the page, extracts image and
  * {@code .html} links with {@code URLFileDownloadUtils.getSrcMap}, downloads the images and
  * then follows the discovered pages round by round.
  */
 public class PicsDownload {
	/** Base directory new image folders are created under; extended by {@link #mainDownload}. */
	static String path;

	/** Number of completed crawl rounds; also used as the numeric suffix of the image folder. */
	static int count;

	/**
	 * Prompts for a start URL and crawls from it.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		// Storage directory on the root with the most usable space (created by the helper).
		path=URLFileDownloadUtils.makeFileInMaxSpace();
		Set<String> webSet=new HashSet<String>();
		// Deliberately not closed: closing the Scanner would close System.in.
		Scanner input=new Scanner(System.in);
		System.out.println("请输入网址：");
		if(!input.hasNextLine()){
			return;
		}
		String webAddress=input.nextLine().trim();
		if(webAddress.isEmpty()){
			return;
		}
		webSet.add(webAddress);
		patu(webSet);
	}

	/**
	 * Downloads every image URL of one crawl round into its own folder.
	 *
	 * <p>The folder name is {@code path + count}. Every 30th round a further
	 * {@code "meinv"} level is appended to {@code path}, so at most 30 numbered folders
	 * share a parent. Downloads are queued on a single-thread executor, i.e. they run one
	 * after another on one background thread while the caller continues.
	 *
	 * @param imgSet image URLs to download; {@code null} or empty means nothing to download
	 */
	public static void mainDownload(Set<String> imgSet) {
		if(path==null){
			// Allows patu/mainDownload to be used without going through main().
			path=URLFileDownloadUtils.makeFileInMaxSpace();
		}
		String midle="meinv";
		if(count%30==0){
			path+=File.separator+midle;
		}
		String temp=path+count;
		System.out.println(temp);
		if(imgSet==null||imgSet.isEmpty()){
			return;
		}
		File dir=new File(temp);
		if(!dir.exists()){
			dir.mkdirs();
		}
		ExecutorService executor = Executors.newSingleThreadExecutor();
		try {
			for (String imgSrc : imgSet) {
				if(imgSrc==null){
					continue;
				}
				// Random file name, keeping the extension of the source URL.
				String filename=URLFileDownloadUtils.getUUID()+extensionOf(imgSrc);
				String filepath=temp+File.separator+filename;
				executor.execute(new ThreadDownloadTask(imgSrc, filepath));
			}
		} finally {
			// Already queued downloads still run; the worker thread exits afterwards.
			executor.shutdown();
		}
	}

	/**
	 * Crawls the given pages and every page reachable from them.
	 *
	 * <p>Each round fetches all pending pages, merges the image and page links found on
	 * them, downloads the images not seen before and continues with the page links not
	 * visited before. The crawl stops when a round discovers no new page. This replaces
	 * the former unbounded recursion, which never terminated normally and failed with a
	 * {@code NullPointerException} as soon as a page contained no matching link.
	 *
	 * @param webSet start URLs; {@code null} or empty means nothing to crawl
	 */
	public static void patu(Set<String> webSet) {
		if(webSet==null||webSet.isEmpty()){
			return;
		}
		Set<String> visitedPages=new HashSet<String>();
		Set<String> downloadedImages=new HashSet<String>();
		Set<String> pending=new HashSet<String>(webSet);
		while(!pending.isEmpty()){
			visitedPages.addAll(pending);
			Set<String> htmlSet=new HashSet<String>();
			Set<String> imgSet=new HashSet<String>();
			String[] htmls=URLFileDownloadUtils.getHtml(pending);
			for (int i = 0; i < htmls.length; i++) {
				if(htmls[i]==null){
					continue;// this page could not be fetched
				}
				// A fresh map per page, so the merge below does not depend on whether
				// getSrcMap replaces or extends entries that are already present.
				Map<String, Set<String>> fileMap=URLFileDownloadUtils.getSrcMap(htmls[i],new HashMap<String, Set<String>>());
				addAllIfPresent(htmlSet,fileMap.get("html"));
				addAllIfPresent(imgSet,fileMap.get("img"));
			}
			imgSet.removeAll(downloadedImages);
			downloadedImages.addAll(imgSet);
			mainDownload(imgSet);
			count++;
			System.out.println("第"+count+"次完成");
			htmlSet.removeAll(visitedPages);
			pending=htmlSet;
		}
	}

	/** Adds {@code source} to {@code target} unless {@code source} is {@code null}. */
	private static void addAllIfPresent(Set<String> target,Set<String> source) {
		if(source!=null){
			target.addAll(source);
		}
	}

	/**
	 * Returns the extension of {@code imgSrc} including the dot, or an empty string when
	 * the text after the last dot is not a plain extension (no dot, or a {@code '/'} follows it).
	 */
	private static String extensionOf(String imgSrc) {
		int dot=imgSrc.lastIndexOf('.');
		if(dot<0||imgSrc.indexOf('/',dot)>=0){
			return "";
		}
		return imgSrc.substring(dot);
	}
}

URLFileDownloadUtils 工具类：

/**
 * Static helpers of the image crawler: fetching page sources, extracting links,
 * downloading files and choosing the storage directory.
 */
public class URLFileDownloadUtils {
	final static String LINE_SEPARATOR=System.getProperty("line.separator");

	/**
	 * Matches absolute http(s) links that end in one of the supported extensions.
	 * Group 1 is the extension without the dot. Compiled once instead of per call.
	 */
	private static final Pattern SRC_PATTERN=Pattern.compile("http[^\"^>]*\\.(JPG|BMP|PNG|HTML|jpg|bmp|png|html)");

	/**
	 * Fetches the source of every page in the set.
	 *
	 * <p>Each page is fetched by a {@code BackHtmlThread} submitted to a single-thread
	 * executor, and the result is awaited before the next page is submitted, so pages are
	 * fetched one after another.
	 *
	 * @param htmlSet page URLs; {@code null} is treated as empty
	 * @return one entry per URL in iteration order; an entry is the empty string when the
	 *         page could not be fetched, never {@code null}
	 */
	public static String[] getHtml(Set<String> htmlSet){
		if(htmlSet==null){
			return new String[0];
		}
		String[] htmls=new String[htmlSet.size()];
		for (int k = 0; k < htmls.length; k++) {
			htmls[k]="";// default for pages that fail or are skipped after an interrupt
		}
		ExecutorService executor = Executors.newSingleThreadExecutor();
		try {
			int i=0;
			for (String address : htmlSet) {
				Future<String> future=executor.submit(new BackHtmlThread(address));
				try {
					String html=future.get();
					if(html!=null){
						htmls[i]=html;
					}
				} catch (ExecutionException e) {
					e.printStackTrace();
				} catch (InterruptedException e) {
					// Restore the flag so callers can see the interruption, then stop fetching.
					Thread.currentThread().interrupt();
					break;
				}
				i++;
			}
		} finally {
			executor.shutdown();
		}
		return htmls;
	}

	/**
	 * Fetches the source of one page, decoded as UTF-8.
	 *
	 * @param webAddress page URL
	 * @return the page text with every line terminated by the platform line separator;
	 *         whatever was read so far (possibly the empty string) when an I/O error occurs
	 */
	public static String getHtml(String webAddress){
		StringBuilder sb=new StringBuilder();
		// try-with-resources closes the reader (and the underlying stream) on every path.
		try (BufferedReader br=new BufferedReader(new InputStreamReader(
				new URL(webAddress).openStream(),java.nio.charset.StandardCharsets.UTF_8))) {
			String line;
			while((line=br.readLine())!=null){
				sb.append(line);
				sb.append(LINE_SEPARATOR);
			}
		} catch (IOException e) {
			// Also covers MalformedURLException; best effort, the caller gets partial text.
			e.printStackTrace();
		}
		return sb.toString();
	}


	/**
	 * Extracts the links matching the crawler's pattern from {@code html} and adds them to
	 * {@code fileMap}: links ending in {@code html} (any case) under key {@code "html"},
	 * all other matches (jpg/bmp/png) under key {@code "img"}.
	 *
	 * <p>Both keys are always present in the returned map, even when nothing matched, and
	 * links are added to sets already stored under those keys instead of replacing them,
	 * so one map can collect the links of several pages.
	 *
	 * @param html    page source; {@code null} is treated as a page without links
	 * @param fileMap map to add to; {@code null} creates a new map
	 * @return {@code fileMap} (or the newly created map)
	 */
	public  static Map<String, Set<String>> getSrcMap(String html,Map<String, Set<String>> fileMap){
		if(fileMap==null){
			fileMap=new java.util.HashMap<String, Set<String>>();
		}
		Set<String> htmlSet=fileMap.get("html");
		if(htmlSet==null){
			htmlSet=new HashSet<String>();
			fileMap.put("html", htmlSet);
		}
		Set<String> imgSet=fileMap.get("img");
		if(imgSet==null){
			imgSet=new HashSet<String>();
			fileMap.put("img", imgSet);
		}
		if(html!=null){
			Matcher m1=SRC_PATTERN.matcher(html);
			while(m1.find()){
				String s=m1.group();
				System.out.println(s);
				// Compare the captured extension ignoring case so ".HTML" is a page, not an image.
				if("html".equalsIgnoreCase(m1.group(1))){
					htmlSet.add(s);
				}else{
					imgSet.add(s);
				}
			}
		}
		System.out.println(fileMap);
		return fileMap;
	}

	/**
	 * Downloads one remote file to a local path, sending a browser User-Agent header.
	 *
	 * @param imgSrc URL of the file
	 * @param path   local file to write; it is created or overwritten
	 * @throws IOException when the URL is malformed, or connecting, reading or writing fails
	 */
	public static void download(String imgSrc,String path) throws IOException{
		URL u=new URL(imgSrc);
		HttpURLConnection uc=(HttpURLConnection)u.openConnection();
		uc.setRequestProperty("User-Agent","Mozilla/5.0 (Windows NT 10.0; WOW64; rv:47.0) Gecko/20100101 Firefox/47.0");
		// The response stream is opened first, so a failed request creates no local file.
		try (BufferedInputStream bis=new BufferedInputStream(uc.getInputStream());
				BufferedOutputStream bos=new BufferedOutputStream(new FileOutputStream(new File(path)))) {
			byte [] b=new byte[8096];
			int len;
			while((len=bis.read(b))!=-1){
				bos.write(b,0,len);
			}
			// No flush per chunk: closing the buffered stream flushes it once.
		}
	}

	/**
	 * Returns a random UUID without the dashes.
	 *
	 * @return 32 hexadecimal characters
	 */
	public static String getUUID(){
		UUID u=UUID.randomUUID();
		return u.toString().replace("-", "");
	}

	/**
	 * Chooses the file-system root with the most usable space and makes sure a
	 * {@code pics} directory exists directly below it.
	 *
	 * <p>When no root is reported, the current working directory is used instead of
	 * building a path from {@code null}.
	 *
	 * @return absolute path of the {@code pics} directory; a warning is printed to
	 *         standard error when it neither exists nor could be created
	 */
	public static String makeFileInMaxSpace() {
		long max=0;
		File best=null;
		File[] listFiles = File.listRoots();
		if(listFiles!=null){
			for (File root : listFiles) {
				long usable=root.getUsableSpace();
				// "best==null" lets the first root win even when it reports 0 usable bytes.
				if(best==null||max<usable){
					max=usable;
					best=root;
				}
			}
		}
		if(best==null){
			best=new File(System.getProperty("user.dir"));
		}
		System.out.println();
		File file=new File(best,"pics");
		if(!file.exists()){
			if(file.mkdirs()){
				System.out.println("注意了！！！！！！！！！！！！！！！！");
				System.out.println("创建了文件夹："+file.getAbsolutePath());
			}else{
				System.err.println("警告：无法创建文件夹："+file.getAbsolutePath());
			}
		}
		return file.getAbsolutePath();
	}
}

ThreadDownloadTask 线程类：

/**
 * Runnable that downloads a single remote file to a single local path by delegating to
 * {@code URLFileDownloadUtils.download}.
 */
public class ThreadDownloadTask implements Runnable {
	/** URL of the file to download. */
	String imgSrc;
	/** Local file the download is written to. */
	String path;

	ThreadDownloadTask(String imgSrc,String path){
		this.imgSrc=imgSrc;
		this.path=path;
	}

	/**
	 * Prints the name of the executing thread, then performs the download. An
	 * {@link IOException} is reported with a stack trace and otherwise ignored.
	 */
	@Override
	public void run() {
		final String workerName=Thread.currentThread().getName();
		System.out.println("当前下载文件的线程为："+workerName);
		try {
			// Arguments: source URL, destination path.
			URLFileDownloadUtils.download(imgSrc,path);
		} catch (IOException downloadFailure) {
			downloadFailure.printStackTrace();
		}
	}
}

BackHtmlThread 线程类：

/**
 * Callable that fetches the source of one page with {@code URLFileDownloadUtils.getHtml};
 * the result is handed to the submitter through the returned {@code Future}.
 */
public class BackHtmlThread implements Callable<String> {
	/** URL of the page to fetch. */
	String webAddress=null;

	BackHtmlThread(String webAddress){
		this.webAddress=webAddress;
	}

	/**
	 * Prints the name of the executing thread and returns the page source.
	 *
	 * @return the text returned by {@code URLFileDownloadUtils.getHtml(webAddress)}
	 */
	@Override
	public String call() throws Exception {
		Thread worker=Thread.currentThread();
		System.out.println("当前获取html文件的线程为："+worker.getName());
		return URLFileDownloadUtils.getHtml(webAddress);
	}

}

转载于:https://my.oschina.net/qfwy/blog/751946

chouminxi6079

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
java深度图片爬虫

PicsDownload主类程序： public class PicsDownload { static String path;//定义文件存储路径的变量 public static void main(String[] args) { path=URLFile...
复制链接

扫一扫