Java 采集网页图片

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Iterator;
import java.util.Random;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Collects image URLs from the paginated listing pages of a fixed web site
 * and downloads every image into a local directory.
 */
public class Capture {

	/** Directory the downloaded images are written to. */
	private static final String BASE_DIR = "F:\\imgs";

	/** Size in bytes of the buffer used while copying an image to disk. */
	private static final int BUFFER_SIZE = 1024;

	/** Captures the value of every {@code data-original="..."} attribute. */
	private static final Pattern IMG_PATTERN = Pattern.compile("data-original=\\\"(.*?)\\\"");

	/** Captures the page number of every pagination link. */
	private static final Pattern PAGE_PATTERN = Pattern.compile("<a href=\\\"/jiu/all/whole/(\\d+)\\\"");

	/**
	 * Downloads every image found by {@link #getAllImages(String)} into the
	 * output directory and prints the elapsed time in seconds.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		String url = "http://www.jiukuaiyou.com";
		ArrayList<String> list = getAllImages(url);
		if (list.isEmpty()) {
			return;
		}

		// Prepare the target directory once instead of once per image.
		File dir = new File(BASE_DIR);
		if (!dir.isDirectory() && !dir.mkdir()) {
			System.err.println("Cannot create directory " + dir);
			return;
		}

		long starttime = System.currentTimeMillis();
		for (String str : list) {
			// A failed download is reported and skipped so the rest still run.
			try {
				downloadImage(str, dir);
			} catch (MalformedURLException e1) {
				e1.printStackTrace();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
		long endtime = System.currentTimeMillis();
		System.out.println((float) (endtime - starttime) / 1000);
	}

	/**
	 * Downloads one image into {@code dir}; both streams are closed even when
	 * the transfer fails.
	 */
	private static void downloadImage(String address, File dir) throws IOException {
		URL imgUrl = new URL(address);
		URLConnection conn = imgUrl.openConnection();
		conn.setConnectTimeout(1000);

		InputStream input = null;
		FileOutputStream out = null;
		try {
			input = conn.getInputStream();
			out = new FileOutputStream(nextFreeFile(dir));
			byte[] buffer = new byte[BUFFER_SIZE];
			int len;
			while ((len = input.read(buffer)) != -1) {
				out.write(buffer, 0, len);
			}
		} finally {
			closeQuietly(out);
			closeQuietly(input);
		}
	}

	/**
	 * Picks a {@code <millis>.jpg} file in {@code dir} that does not exist yet,
	 * so two downloads within the same millisecond do not overwrite each other.
	 */
	private static File nextFreeFile(File dir) {
		long stamp = System.currentTimeMillis();
		File candidate = new File(dir, stamp + ".jpg");
		while (candidate.exists()) {
			stamp++;
			candidate = new File(dir, stamp + ".jpg");
		}
		return candidate;
	}

	/** Closes {@code resource} if it is non-null, ignoring a failure of the close itself. */
	private static void closeQuietly(Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException ignored) {
			// Nothing useful can be done when closing fails.
		}
	}

	/**
	 * Returns the image URLs reachable from the page at {@code url}.
	 *
	 * <p>The suffix {@code _290x190.jpg} is removed from every URL; presumably
	 * this turns a thumbnail address into the address of the full-size image.
	 *
	 * @param url address of the page whose pagination links are followed
	 * @return the image URLs; empty (never {@code null}) when the page could
	 *         not be read or contains no images
	 */
	public static ArrayList<String> getAllImages(String url) {
		ArrayList<String> list = new ArrayList<String>();
		String content = getContent(url);
		if (content.length() == 0) {
			return list;
		}
		for (String str : getImgHref(content)) {
			list.add(str.replace("_290x190.jpg", ""));
		}
		return list;
	}

	/**
	 * Fetches the document at {@code str} and returns it as one string decoded
	 * as UTF-8, with the line terminators removed.
	 *
	 * @param str address of the document
	 * @return the text read so far; empty when the URL is malformed or nothing
	 *         could be read (the failure is printed to standard error)
	 */
	public static String getContent(String str) {
		StringBuilder sb = new StringBuilder();
		BufferedReader r = null;
		try {
			URL url = new URL(str);
			r = new BufferedReader(new InputStreamReader(url.openStream(), "utf-8"));
			String tmp;
			while ((tmp = r.readLine()) != null) {
				sb.append(tmp);
			}
		} catch (MalformedURLException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			// Release the connection on the failure path as well.
			closeQuietly(r);
		}
		return sb.toString();
	}

	/**
	 * Visits every listing page from 1 up to the highest page number linked
	 * from {@code content} and collects the {@code data-original} attribute
	 * values found on those pages.
	 *
	 * @param content HTML containing the pagination links
	 * @return the attribute values in page order; empty (never {@code null})
	 *         when no pagination link is present
	 */
	public static ArrayList<String> getImgHref(String content) {
		ArrayList<String> list = new ArrayList<String>();
		int maxPage = getMaxPage(content); // highest page number linked from the content
		for (int i = 1; i <= maxPage; i++) {
			String baseUrl = "http://www.jiukuaiyou.com/jiu/all/whole/" + i;
			String str = getContent(baseUrl);
			if (null == str || str.length() == 0) {
				continue;
			}
			// Scan the page that was just fetched, not the start page.
			Matcher m = IMG_PATTERN.matcher(str);
			boolean matched = false;
			while (m.find()) {
				matched = true;
				list.add(m.group(1));
			}
			if (!matched) {
				System.out.println("匹配不上");
			}
		}
		return list;
	}

	/**
	 * Returns the largest page number among the pagination links
	 * ({@code <a href="/jiu/all/whole/N"}) in {@code content}.
	 *
	 * @param content HTML to scan; {@code null} is treated as having no links
	 * @return the largest page number, or 0 when no link is found
	 */
	public static int getMaxPage(String content) {
		int maxPage = 0;
		if (content == null) {
			return maxPage;
		}
		Matcher m = PAGE_PATTERN.matcher(content);
		boolean matched = false;
		// A single find() loop, so the first link is counted too.
		while (m.find()) {
			matched = true;
			try {
				int page = Integer.parseInt(m.group(1));
				if (page > maxPage) {
					maxPage = page;
				}
			} catch (NumberFormatException ignored) {
				// Digit run too long for an int; not a usable page number.
			}
		}
		if (!matched) {
			System.out.println("没有匹配到");
		}
		return maxPage;
	}

	/**
	 * Prints the string form of {@code s} to standard output.
	 *
	 * @param s object to print; must not be {@code null}
	 */
	public static void test(Object s) {
		System.out.println(s.toString());
	}

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值