本程序可下载页面所依赖的 CSS/JS、图片等引用资源，目前不包含下载关联 HTML 页面的功能。代码如下：
/*
*****************************************************************************
* This software is under the Apache License Version 2.0
* Author: Tao - mail:cn.java.river@gmail.com
* Spreading Your Heart
****************************************************************************
*/
package atao.util.html;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import org.apache.commons.lang.StringUtils;
/**
 * A simple HTML downloader which can also download page resources.
 * <br/>
 * <b>Note: This tool won't download related or sub HTML pages.</b>
 * <p>
 * The page is read line by line and copied to <code>download.html</code> in
 * the workspace folder. For every line containing <code>&lt;link href=</code>
 * or <code>src=</code>, the first referenced resource on that line is fetched
 * and stored under the workspace folder using its relative path.
 * <p>
 * Usage: <code>HtmlDownloader [pageUrl [workspaceFolder]]</code>; both
 * arguments are optional and fall back to the built-in defaults.
 *
 * @author <a href="mailto:cn.java.river@gmail.com">Tao</a>
 * @since 1.0
 */
public class HtmlDownloader
{
    //URL will be downloaded (may be overridden by the first program argument).
    private static String url = "http://pervasive2.morselli.unimo.it/~nicola/courses/IngegneriaDelSoftware/java/J6d_xml.html";
    //workspace folder (may be overridden by the second program argument).
    private static String workspace = "download";
    //sub css and js resources sign
    private static final String urlSign = "<link href=";
    //sub image resources sign
    private static final String urlSign2 = "src=";
    //URL parent.
    private static String rootUrl = null;

    /**
     * Downloads the page and the resources it references.
     *
     * @param args optional: <code>args[0]</code> is the page URL,
     *             <code>args[1]</code> is the workspace folder.
     * @throws Exception if the page itself cannot be read or written;
     *                   failures of individual sub resources are only reported.
     */
    public static void main (String[] args) throws Exception
    {
        if (args != null && args.length > 0 && args[0].length () > 0)
        {
            url = args[0];
        }
        if (args != null && args.length > 1 && args[1].length () > 0)
        {
            workspace = args[1];
        }
        long start = System.nanoTime ();
        setRootUrl ();
        BufferedReader reader = null;
        BufferedWriter writer = null;
        try
        {
            // NOTE(review): the page is decoded and re-encoded with the
            // platform default charset, exactly as before - confirm this is
            // acceptable for the pages being downloaded.
            reader = new BufferedReader (new InputStreamReader (new URL (url).openStream ()));
            writer = new BufferedWriter (new FileWriter (createDownloadFile ("download.html")));
            String s;
            while ((s = reader.readLine ()) != null)
            {
                writer.write (s);
                writer.newLine ();
                if (hasSubUrl (s))
                {
                    downloadChild (getSubUrl (s));
                }
            }
            // Close explicitly so that a failing final flush is reported
            // instead of being swallowed by closeQuietly below.
            writer.close ();
        }
        finally
        {
            closeQuietly (reader);
            closeQuietly (writer);
        }
        System.out.println ("Download time(s):" + elapsedSeconds (start));
    } // end of main

    /**
     * set root url (the "directory" of the page) for the downloading html
     */
    private static void setRootUrl ()
    {
        int pos = url.lastIndexOf ("/");
        int schemeEnd = url.indexOf ("://");
        if (pos <= schemeEnd + 2)
        {
            // No path component (e.g. "http://host") or no slash at all:
            // cutting at the last slash would destroy the scheme or throw.
            rootUrl = url;
        }
        else
        {
            rootUrl = url.substring (0, pos);
        }
        System.out.println ("Root Url is:" + rootUrl);
    }

    /**
     * check if content includes sub resources.
     *
     * @param text line of html content.
     * @return Yes or Not
     */
    private static boolean hasSubUrl (String text)
    {
        return text != null
            && text.length () > 0
            && (text.contains (urlSign) || text.contains (urlSign2));
    }

    /**
     * generate sub url from line content.
     * <p>
     * The <code>&lt;link href=</code> marker takes precedence over
     * <code>src=</code>. The value may be double quoted, single quoted or
     * unquoted.
     *
     * @param text line of html content.
     * @return the referenced url, or <code>null</code> if the line holds no
     *         marker or no terminated value.
     */
    private static String getSubUrl (String text)
    {
        if (text == null)
        {
            return null;
        }
        String sign = urlSign;
        int pos = text.indexOf (sign);
        if (pos == -1)
        {
            sign = urlSign2;
            pos = text.indexOf (sign);
        }
        if (pos == -1)
        {
            return null;
        }
        String subUrl = readAttributeValue (text, pos + sign.length ());
        if (subUrl != null)
        {
            System.out.println ("subUrl is :" + subUrl);
        }
        return subUrl;
    }

    /**
     * read an attribute value starting at the given position.
     *
     * @param text line of html content.
     * @param from index right behind the '=' of the attribute.
     * @return the value without quotes, or <code>null</code> if there is no
     *         value or its closing quote is missing.
     */
    private static String readAttributeValue (String text, int from)
    {
        int len = text.length ();
        int begin = from;
        while (begin < len && Character.isWhitespace (text.charAt (begin)))
        {
            begin++;
        }
        if (begin >= len)
        {
            return null;
        }
        char first = text.charAt (begin);
        if (first == '"' || first == '\'')
        {
            int end = text.indexOf (first, begin + 1);
            return (end == -1) ? null : text.substring (begin + 1, end);
        }
        // Unquoted value: runs until whitespace or the end of the tag.
        int end = begin;
        while (end < len
            && !Character.isWhitespace (text.charAt (end))
            && text.charAt (end) != '>')
        {
            end++;
        }
        return text.substring (begin, end);
    }

    /**
     * check if a reference points outside the page's own location, i.e. it
     * starts with a scheme ("http:", "https:", "data:", ...) or with "//".
     *
     * @param subUrl non-null reference taken from the page.
     * @return Yes or Not
     */
    private static boolean isExternalUrl (String subUrl)
    {
        if (subUrl.startsWith ("//"))
        {
            return true;
        }
        for (int i = 0; i < subUrl.length (); i++)
        {
            char c = subUrl.charAt (i);
            if (c == ':')
            {
                return i > 0;
            }
            boolean letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
            boolean other = (c >= '0' && c <= '9') || c == '+' || c == '-' || c == '.';
            // A scheme starts with a letter, followed by letters, digits, '+', '-' or '.'.
            if (!letter && !(i > 0 && other))
            {
                return false;
            }
        }
        return false;
    }

    /**
     * download sub resources,<b>Note: don't use Java Character Writers,
     * otherwise you can't get pictures correctly.</b>
     * <p>
     * Failures are reported on the error stream and do not abort the run.
     *
     * @param subUrl reference relative to the downloaded page; empty or
     *               external references are skipped.
     */
    private static void downloadChild (String subUrl)
    {
        if (subUrl == null || subUrl.length () == 0)
        {
            System.out.println ("subUrl is Empty.");
            return;
        }
        if (isExternalUrl (subUrl))
        {
            System.out.println ("subUrl not support yet.");
            return;
        }
        long start = System.nanoTime ();
        InputStream reader = null;
        FileOutputStream writer = null;
        try
        {
            String forUrl = subUrl.replace (" ", "%20");
            // Resolve against the page's directory so that "a.css" and
            // "/a.css" (server-absolute) both end up at the right address.
            URL u = new URL (new URL (rootUrl + "/"), forUrl);
            reader = u.openStream ();
            File f = createDownloadFile (subUrl);
            writer = new FileOutputStream (f);
            byte[] buff = new byte[1024];
            int size;
            while ((size = reader.read (buff)) != -1)
            {
                writer.write (buff, 0, size);
            }
            writer.close ();
        }
        catch (Exception e)
        {
            // Best effort: one broken resource must not stop the whole download.
            e.printStackTrace ();
        }
        finally
        {
            closeQuietly (reader);
            closeQuietly (writer);
        }
        System.out.println ("Source:" + subUrl +"download time(s):" + elapsedSeconds (start));
    }

    /**
     * create sub file,create parent folders if necessary.
     *
     * @param url related path of a url source.
     * @return created file.
     * @throws IOException if the path would leave the workspace folder or
     *                     the parent folders cannot be created.
     */
    private static File createDownloadFile (String url) throws IOException
    {
        File root = new File (workspace).getCanonicalFile ();
        File f = new File (root, url).getCanonicalFile ();
        String rootPath = root.getPath ();
        String prefix = rootPath.endsWith (File.separator) ? rootPath : rootPath + File.separator;
        // The path comes from a remote page: never let "../" escape the workspace.
        if (!f.getPath ().startsWith (prefix))
        {
            throw new IOException ("Refusing to write outside of workspace: " + url);
        }
        File parent = f.getParentFile ();
        if (!parent.mkdirs () && !parent.isDirectory ())
        {
            throw new IOException ("Cannot create folder: " + parent);
        }
        return f;
    }

    /**
     * format the time elapsed since the given nano time as seconds.
     */
    private static String elapsedSeconds (long startNanos)
    {
        return String.format ("%.3f", (double)(System.nanoTime () - startNanos)/ 1000000000.00);
    }

    /**
     * close a resource, ignoring <code>null</code> and close failures.
     */
    private static void closeQuietly (Closeable resource)
    {
        if (resource == null)
        {
            return;
        }
        try
        {
            resource.close ();
        }
        catch (IOException ignored)
        {
            // Nothing sensible can be done when closing fails during cleanup.
        }
    }
}