import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

public class MainClass {

  private static Set<String> urlSet = new HashSet<String>();
  /**
   * Matches absolute URLs that start with http:// or https://, optionally
   * followed by "www." or a dotted-decimal IP address with a port.
   */
  private static Pattern p = Pattern
      .compile(
          "^(((http|https)://" +
          "(www\\.|([1-9]|[1-9]\\d|1\\d{2}|2[0-1]\\d|25[0-5])" +
          "(\\.(\\d|[1-9]\\d|1\\d{2}|2[0-4]\\d|25[0-5])){3}:[0-9]+/)?)" +
          "{1}.+){1}$",
          Pattern.CASE_INSENSITIVE);

  public static void main(String[] args) {
    String baseUrl = "http://www.sina.com";
    spiderInternet(baseUrl, "");
  }

  private static void spiderInternet(String baseUrl, String exUrl) {
    // drop the trailing slash so baseUrl + exUrl does not produce "//"
    if (baseUrl.endsWith("/") && exUrl.startsWith("/")) {
      baseUrl = baseUrl.substring(0, baseUrl.length() - 1);
    }
    String newUrl = baseUrl + exUrl;
    if (urlSet.contains(newUrl)) {
      // already visited
      return;
    }
    System.out.println(newUrl);
    try {
      Document doc = Jsoup.connect(newUrl).get();
      urlSet.add(newUrl);
      Elements links = doc.select("a[href]");
      for (Element link : links) {
        String linkHref = link.attr("href");
        if (linkHref.equals("#")) {
          // skip anchor-only links instead of abandoning the rest of the page
          continue;
        }
        Matcher matcher = p.matcher(linkHref);
        if (matcher.matches()) {
          // absolute URL: crawl it as a new base
          spiderInternet(linkHref, "");
        } else {
          // relative URL: resolve it against the current base
          spiderInternet(baseUrl, linkHref);
        }
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }

}
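
For reference, the pattern above only accepts hrefs that carry an http:// or https:// scheme, and that is what drives the absolute-vs-relative branch inside spiderInternet. The short sketch below recompiles the same expression and checks two sample hrefs; the URLs are made-up examples, not taken from the crawled site. It needs nothing beyond java.util.regex, while MainClass itself additionally requires jsoup (org.jsoup:jsoup) on the classpath.

import java.util.regex.Pattern;

// Minimal sketch: duplicates the pattern from MainClass to show which hrefs
// the crawler treats as absolute (matched) versus relative (not matched).
// The sample hrefs are hypothetical.
class UrlPatternDemo {

  public static void main(String[] args) {
    Pattern p = Pattern.compile(
        "^(((http|https)://" +
        "(www\\.|([1-9]|[1-9]\\d|1\\d{2}|2[0-1]\\d|25[0-5])" +
        "(\\.(\\d|[1-9]\\d|1\\d{2}|2[0-4]\\d|25[0-5])){3}:[0-9]+/)?)" +
        "{1}.+){1}$",
        Pattern.CASE_INSENSITIVE);

    // Absolute URL: matches, so spiderInternet(linkHref, "") would be called.
    System.out.println(p.matcher("http://www.sina.com/news/index.html").matches()); // true
    // Scheme-less path: no match, so it would be appended to baseUrl instead.
    System.out.println(p.matcher("/mail/index.html").matches()); // false
  }

}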

A test, just a test.....