下面直接贴代码
package download;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
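// Candidate (currently unused) regex that would apparently restrict matches to chapter links whose anchor text contains "章": http://learning.sohu.com/.*\\.shtml.*章.*</a>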
/**
 * Crawler that downloads the GIF page images of an online book hosted on learning.sohu.com.
 *
 * <p>Flow: fetch the index page, collect the chapter URLs found in it, fetch every chapter page,
 * collect the page URLs found in it, and for each of those pages save the first
 * {@code Img<digits>.gif} link found in its source under
 * {@code D:\bysu\downpic\<chapter title>\<chapter title><page index>.gif}.
 *
 * @author 苏宝伢 E-mail:by.su@qq.com
 * @version Created: 2017-04-14 17:48:11
 */
public class MyGetPicture {

    /** Root directory (Windows path, with trailing separator) that receives one folder per chapter. */
    private static final String DOWNLOAD_ROOT = "D:\\bysu\\downpic\\";

    /** Index page that links to every chapter. */
    private static final String INDEX_URL = "http://learning.sohu.com/s2004/7231/s221868027.shtml";

    /** Links of the form {@code <scheme>:/.../n<5+ digits>.shtml}; used for chapter and page URLs. */
    private static final Pattern PAGE_LINK =
            Pattern.compile("\\w{1,4}:/[^\\s=]*\\/n\\d{5,}\\.shtml");

    /** Absolute link to a page image named {@code Img<5+ digits>.gif}. */
    private static final Pattern GIF_LINK =
            Pattern.compile("[a-z]+://[^\\s]*(Img\\d{5,}\\.gif)");

    /** Content of the first {@code <title>} element (pages are held as a single line of text). */
    private static final Pattern TITLE = Pattern.compile("<title>(.*?)</title>");

    /** Characters that Windows does not allow in a file or directory name. */
    private static final Pattern ILLEGAL_FILE_CHARS = Pattern.compile("[\\\\/:*?\"<>|]");

    /**
     * Entry point: downloads every page image of every chapter linked from {@link #INDEX_URL}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // The corporate network restricts outbound access, so all requests go through a proxy.
        setProperties();
        String firstPageSource = getResponseFromUrl(INDEX_URL);

        // Step 1: collect the URL of every chapter and print the list once for diagnostics.
        List<String> chapterUrls = getUnitPath(firstPageSource);
        System.out.println(chapterUrls);

        for (String chapterUrl : chapterUrls) {
            // Step 2: open the chapter and collect the URL of each of its pages.
            String pageSource = getResponseFromUrl(chapterUrl);
            Map<Integer, String> allPageUrl = perUnitAllPage(pageSource);

            // The chapter title names both the target folder and the file prefix; compute it once.
            String chapterName = setFileName(pageSource);
            // mkdirs() also creates missing parent folders, which mkdir() would silently fail on.
            new File(DOWNLOAD_ROOT + chapterName).mkdirs();
            System.out.println(chapterUrl + "查看异常url");

            for (Map.Entry<Integer, String> perPageUrl : allPageUrl.entrySet()) {
                // Step 3: fetch the page source and look for the link to its GIF image.
                String perPageSource = getResponseFromUrl(perPageUrl.getValue());
                Matcher gifLink = GIF_LINK.matcher(perPageSource);
                String fileName = DOWNLOAD_ROOT + chapterName + "\\"
                        + chapterName + perPageUrl.getKey() + ".gif";
                if (gifLink.find()) {
                    downLoadImage(gifLink.group(), fileName);
                }
            }
            // Reported once per chapter, after all of its pages have been processed.
            System.out.println(chapterName + "--下载完成");
        }
    }

    /**
     * Compiles {@code regex} and returns a matcher over {@code content}.
     *
     * @param regex   regular expression to compile
     * @param content text to search
     * @return a fresh, not yet advanced {@link Matcher}
     */
    public static Matcher matchRegex(String regex, String content) {
        return Pattern.compile(regex).matcher(content);
    }

    /**
     * Extracts every chapter URL from the index page source.
     *
     * <p>Any link matching {@code <scheme>:/.../n<digits>.shtml} is returned, so links that are
     * not chapters may be included as well.
     *
     * @param driver HTML source of the index page (despite the name, this is plain text)
     * @return matching URLs in order of appearance, duplicates included; empty if none match
     */
    public static List<String> getUnitPath(String driver) {
        List<String> urls = new ArrayList<>();
        Matcher m = PAGE_LINK.matcher(driver);
        while (m.find()) {
            urls.add(m.group());
        }
        return urls;
    }

    /**
     * Extracts the page URLs from a chapter page source and numbers them from 1.
     *
     * <p>Uses the same link pattern as {@link #getUnitPath(String)}, so unrelated links of the
     * same shape may be included.
     *
     * @param pageContent HTML source of a chapter page
     * @return insertion-ordered map from 1-based index to URL; empty if none match
     */
    public static Map<Integer, String> perUnitAllPage(String pageContent) {
        Map<Integer, String> linkMap = new LinkedHashMap<>();
        Matcher m = PAGE_LINK.matcher(pageContent);
        int count = 1;
        while (m.find()) {
            linkMap.put(count++, m.group());
        }
        return linkMap;
    }

    /**
     * Downloads the resource at {@code urlPath} into the local file {@code fileName}.
     *
     * <p>Both streams are always closed. I/O failures are reported with a stack trace and do not
     * propagate; the completion message is printed only when the copy succeeded.
     *
     * @param urlPath  URL of the image to download
     * @param fileName path of the file to (over)write
     */
    public static void downLoadImage(String urlPath, String fileName) {
        try (InputStream in = new URL(urlPath).openStream();
             OutputStream out = new FileOutputStream(fileName)) {
            byte[] buffer = new byte[8192];
            int read;
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
        } catch (IOException e) {
            e.printStackTrace();
            return;
        }
        System.out.println("下载完成" + urlPath);
    }

    /**
     * Derives a file-system-safe name from the {@code <title>} of a page.
     *
     * <p>The title text is trimmed and every character that Windows forbids in file names
     * ({@code \ / : * ? " < > |}) is replaced by {@code _}, because the result is used as a
     * directory name and file-name prefix.
     *
     * @param pageSource HTML source of a page
     * @return the sanitized title, or an empty string when there is no title or it is empty
     */
    public static String setFileName(String pageSource) {
        Matcher m = TITLE.matcher(pageSource);
        if (!m.find()) {
            return "";
        }
        String title = m.group(1).trim();
        return ILLEGAL_FILE_CHARS.matcher(title).replaceAll("_");
    }

    /**
     * Fetches {@code urlPath} and returns the response body decoded as gb2312.
     *
     * <p>Lines are concatenated without line separators, so the result is a single line of text;
     * the patterns used by this class are applied to that form. On an I/O failure a message and
     * stack trace are printed and whatever was read so far (possibly nothing) is returned.
     *
     * @param urlPath address to request
     * @return the response text with line breaks removed; never {@code null}
     */
    public static String getResponseFromUrl(String urlPath) {
        StringBuilder response = new StringBuilder();
        try {
            URLConnection conn = new URL(urlPath).openConnection();
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), "gb2312"))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    response.append(line);
                }
            }
        } catch (IOException e) {
            System.out.println("从服务器获取源文件失败");
            e.printStackTrace();
        }
        System.out.println("---------------------------");
        return response.toString();
    }

    /**
     * Configures the JVM-wide proxy used for both HTTP and HTTPS requests.
     *
     * <p>The two protocols are configured by one method because Java cannot have two
     * {@code setProperties()} methods with the same signature.
     */
    public static void setProperties() {
        System.setProperty("proxySet", "true");
        // Proxy server and port for plain HTTP.
        System.setProperty("http.proxyHost", "111.111.111.111");
        System.setProperty("http.proxyPort", "8080");
        // Same proxy for HTTPS.
        System.setProperty("https.proxyHost", "111.111.111.111");
        System.setProperty("https.proxyPort", "8080");
    }
}
运行后,结果如下。目前第三章的图片下载不到,原因尚未查明,有时间再排查。