Java 小 demo：下载 http://learning.sohu.com/s2004/7231/s221868027.shtml 中所有的内容图片

最新推荐文章于 2021-09-08 18:44:44 发布

weixin_33701564

最新推荐文章于 2021-09-08 18:44:44 发布

阅读量82

点赞数

原文链接：https://my.oschina.net/bysu/blog/882959

版权

为什么80%的码农都做不了架构师？>>>

下面直接贴代码

package download;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


// Candidate (currently unused) stricter regex for chapter links, keyed on the character "章":
// http://learning.sohu.com/.*\\.shtml.*章.*</a>
/**
 * Small crawler demo that downloads the content images (GIF files) of the
 * tutorial whose index page is
 * http://learning.sohu.com/s2004/7231/s221868027.shtml.
 *
 * <p>Flow: fetch the index page, collect the chapter URLs found in it, fetch
 * each chapter page, collect the page URLs found in that chapter, and for each
 * of those pages download the first matching GIF into a per-chapter directory
 * below {@code DOWNLOAD_ROOT}.
 *
 * @author 苏宝伢 E-mail:by.su@qq.com
 * @version created 2017-04-14 17:48:11
 */
public class MyGetPicture {

	/** Root directory that receives one sub-directory per chapter. */
	private static final String DOWNLOAD_ROOT = "D:\\bysu\\downpic\\";

	/** Matches chapter/page links such as http://learning.sohu.com/20040903/n221868072.shtml. */
	private static final Pattern PAGE_URL_PATTERN =
			Pattern.compile("\\w{1,4}:/[^\\s=]*\\/n\\d{5,}\\.shtml");

	/** Matches the absolute URL of a content image (Img<digits>.gif). */
	private static final Pattern IMAGE_URL_PATTERN =
			Pattern.compile("[a-z]+://[^\\s]*(Img\\d{5,}\\.gif)");

	/** Captures the text between the first title tag pair. */
	private static final Pattern TITLE_PATTERN = Pattern.compile("<title>(.*?)</title>");

	/** Proxy host used for both http and https (placeholder address from the original demo). */
	private static final String PROXY_HOST = "111.111.111.111";

	/** Proxy port used for both http and https. */
	private static final String PROXY_PORT = "8080";

	/**
	 * Entry point: crawls the index page and downloads one GIF per discovered page.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {

		// The original environment restricts direct internet access, so a proxy is configured first.
		setProperties();
		String firstPageSource = getResponseFromUrl("http://learning.sohu.com/s2004/7231/s221868027.shtml");

		// Step 1: collect the URL of every chapter from the index page.
		List<String> list = getUnitPath(firstPageSource);
		for (String str : list) {
			System.out.println(str);

			// Step 2: fetch the chapter page and collect the URLs of all pages it links to.
			String pageSource = getResponseFromUrl(str);
			Map<Integer, String> allPageUrl = perUnitAllPage(pageSource);

			// The title comes from a remote page, so strip characters that are illegal in a path component.
			String chapterName = toSafeFileName(setFileName(pageSource));
			// mkdirs (not mkdir) so that a missing download root is created as well.
			new File(DOWNLOAD_ROOT + chapterName).mkdirs();
			System.out.println(str + "查看异常url");

			for (Map.Entry<Integer, String> perPageUrl : allPageUrl.entrySet()) {
				// Step 3: fetch the page source and look for the GIF link inside it.
				String perPageSource = getResponseFromUrl(perPageUrl.getValue());
				Matcher m = IMAGE_URL_PATTERN.matcher(perPageSource);
				// Local file name: <root>\<chapter>\<chapter><page index>.gif
				String fileName = DOWNLOAD_ROOT + chapterName + "\\" + chapterName + perPageUrl.getKey() + ".gif";
				if (m.find()) {
					// Only the first image match of each page is downloaded.
					downLoadImage(m.group(), fileName);
				}
				System.out.println(chapterName + "--下载完成");
			}
		}
	}

	/**
	 * Compiles {@code regex} and returns a matcher over {@code content}.
	 *
	 * @param regex   regular expression to compile
	 * @param content text to search
	 * @return a fresh matcher positioned at the start of {@code content}
	 */
	public static Matcher matchRegex(String regex, String content) {
		return Pattern.compile(regex).matcher(content);
	}

	/**
	 * Extracts every chapter URL from the given page source.
	 *
	 * <p>The pattern is loose, so unrelated links of the same shape may be included as well.
	 *
	 * @param driver HTML source of the index page
	 * @return the matched URLs in order of appearance (possibly empty, never null)
	 */
	public static List<String> getUnitPath(String driver) {
		List<String> list = new ArrayList<>();
		Matcher m = PAGE_URL_PATTERN.matcher(driver);
		while (m.find()) {
			list.add(m.group());
		}
		return list;
	}

	/**
	 * Extracts every page URL from a chapter's page source.
	 *
	 * <p>The pattern is loose, so unrelated links of the same shape may be included as well.
	 *
	 * @param pageContent HTML source of a chapter page
	 * @return insertion-ordered map from a 1-based running index to the matched URL
	 */
	public static Map<Integer, String> perUnitAllPage(String pageContent) {
		Map<Integer, String> linkMap = new LinkedHashMap<>();
		Matcher m = PAGE_URL_PATTERN.matcher(pageContent);
		int count = 1;
		while (m.find()) {
			linkMap.put(count++, m.group());
		}
		return linkMap;
	}

	/**
	 * Downloads the resource at {@code urlPath} and writes it to {@code fileName}.
	 *
	 * <p>I/O failures are reported via a stack trace and do not propagate; the
	 * completion message is printed only when the copy succeeded.
	 *
	 * @param urlPath  URL of the image to download
	 * @param fileName path of the local file to write
	 */
	public static void downLoadImage(String urlPath, String fileName) {
		// try-with-resources closes BOTH streams, also when the copy fails half-way.
		try (InputStream in = new URL(urlPath).openStream();
				OutputStream out = new FileOutputStream(fileName)) {
			byte[] buffer = new byte[8192];
			int read;
			while ((read = in.read(buffer)) != -1) {
				out.write(buffer, 0, read);
			}
		} catch (IOException e) {
			e.printStackTrace();
			return;
		}
		System.out.println("下载完成" + urlPath);
	}

	/**
	 * Returns the text of the first title element in {@code pageSource}.
	 *
	 * @param pageSource HTML source of a page
	 * @return the title text, or an empty string when there is no title or it is empty
	 */
	public static String setFileName(String pageSource) {
		Matcher m = TITLE_PATTERN.matcher(pageSource);
		// A capture group avoids indexing into a split() result, which is empty for "<title></title>".
		return m.find() ? m.group(1) : "";
	}

	/**
	 * Fetches {@code urlPath} and returns the response body decoded as gb2312.
	 *
	 * <p>Line terminators are dropped, so the result is the page as a single line.
	 * On an I/O failure a message and stack trace are printed and whatever was
	 * read so far (possibly the empty string) is returned.
	 *
	 * @param urlPath URL to fetch
	 * @return the decoded response body without line terminators
	 */
	public static String getResponseFromUrl(String urlPath) {
		StringBuilder content = new StringBuilder();
		try {
			URLConnection conn = new URL(urlPath).openConnection();
			try (BufferedReader reader =
					new BufferedReader(new InputStreamReader(conn.getInputStream(), "gb2312"))) {
				String line;
				while ((line = reader.readLine()) != null) {
					content.append(line);
				}
			}
		} catch (IOException e) {
			System.out.println("从服务器获取源文件失败");
			e.printStackTrace();
		}
		System.out.println("---------------------------");
		return content.toString();
	}

	/**
	 * Configures the JVM-wide proxy for both http and https connections.
	 */
	public static void setProperties() {
		System.setProperty("proxySet", "true");
		System.setProperty("http.proxyHost", PROXY_HOST);
		System.setProperty("http.proxyPort", PROXY_PORT);
		System.setProperty("https.proxyHost", PROXY_HOST);
		System.setProperty("https.proxyPort", PROXY_PORT);
	}

	/** Replaces characters that are not allowed in a Windows file name and trims surrounding whitespace. */
	private static String toSafeFileName(String title) {
		return title.replaceAll("[\\\\/:*?\"<>|]", "_").trim();
	}
}

运行后结果如下。目前第三章的图片下载不到，原因尚未查明，有时间再排查。