使用了 jsoup 这一第三方包,解析 HTML 效果拔群;同时使用缓冲区进行输入输出,提升效率。
下载器类
package just4test2;
import java.io.*;
import java.net.*;
import org.jsoup.*;
import org.jsoup.nodes.*;
import org.jsoup.select.*;
/**
 * Downloads every image referenced by an {@code <img>} tag on one web page
 * into a local directory.
 *
 * <p>The page is fetched and parsed with jsoup; each image is then copied
 * to disk through a small byte buffer.
 */
public class Downloader {
    /** Size in bytes of the buffer used when copying a download to disk. */
    private static final int BUFFER_SIZE = 1024;

    /** URL of the page whose images are collected. */
    private final String base;
    /** Directory the downloaded files are written into. */
    private final String path;

    /**
     * @param abase URL of the web page to scan for images
     * @param apath directory in which the downloaded files are stored
     */
    public Downloader(String abase, String apath) {
        base = abase;
        path = apath;
    }

    /**
     * Downloads the resource at {@code src} into the target directory.
     * The local file name is the part of {@code src} after its last
     * {@code '/'}; an existing file with that name is overwritten.
     *
     * @param src absolute URL of the resource to download
     * @throws Exception if {@code src} is not a valid URL, has no file name
     *         part, or the connection or the file write fails
     */
    public void download(String src) throws Exception {
        File dir = new File(path);
        if (!dir.exists()) {
            // If creation fails, opening the output file below reports the error.
            dir.mkdirs();
        }

        // Validate the URL before deriving anything from the raw string.
        URL url = new URL(src);
        String filename = src.substring(src.lastIndexOf('/') + 1);
        if (filename.isEmpty()) {
            // A URL ending in '/' would resolve to the directory itself.
            throw new IOException("URL has no file name: " + src);
        }
        File target = new File(dir, filename);

        // try-with-resources closes both streams even when the copy fails midway.
        try (InputStream in = url.openStream();
             FileOutputStream out = new FileOutputStream(target)) {
            byte[] buffer = new byte[BUFFER_SIZE];
            int size;
            while ((size = in.read(buffer)) != -1) {
                // Write only the bytes actually read in this round.
                out.write(buffer, 0, size);
            }
        }
    }

    /**
     * Fetches the page at the base URL and downloads the image behind every
     * {@code <img>} tag that has a resolvable {@code src}. Progress is printed
     * to standard output. The first failing download aborts the run.
     *
     * @throws Exception if the page cannot be fetched or a download fails
     */
    public void getPic() throws Exception {
        Document doc = Jsoup.connect(base).get();
        Elements imgs = doc.getElementsByTag("img");
        int j = 1;
        for (Element x : imgs) {
            // The "abs:" prefix makes jsoup resolve the src attribute against
            // the document's base URI, so relative links become absolute URLs.
            // The result is an empty string when the attribute is missing or
            // cannot be resolved.
            String imgSrc = x.attr("abs:src").replaceAll("\\s", "");
            if (imgSrc.isEmpty()) {
                // Nothing to download for this tag; skip it instead of failing.
                continue;
            }
            System.out.printf("正在下载第%d个文件", j++);
            System.out.print(",地址:");
            System.out.println(imgSrc);
            download(imgSrc);
        }
    }
}
主类
package just4test2;
import java.util.*;
public class Main {
public static void main(String[] args) {
try {
Scanner cin = new Scanner (System.in);
System.out.println("输入要抓取的网页");
String base = cin.next();
System.out.println("输入存放路径");
String path = cin.next();
cin.close();
Downloader down = new Downloader(base,path);
down.getPic();
System.out.println("下载成功");
}
catch (Exception e) {
e.printStackTrace();
System.out.println("GG");
}
}
}
稳定性大幅提升,当然还不够完善,比如可以设置响应时间、多线程(这个估计要很久才会实现)、遇到错误跳过等等。目前水平到这里差不多了,接下来补习一下前端知识,以便对 jsoup 能有个透彻点的了解。