import java.io.IOException;
import java.util.ArrayDeque;
import java.util.HashSet;
import java.util.Queue;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

12: publicclass MainClass {
13:
14: privatestatic Set<String> urlSet = new HashSet<String>();
15: /**
16: * http:
17: * https:
18: */
19: privatestatic Pattern p = Pattern
20: .compile(
21: "^(((http|https)://" +
22: "(www.|([1-9]|[1-9]\\d|1\\d{2}|2[0-1]\\d|25[0-5])" +
23: "(\\.(\\d|[1-9]\\d|1\\d{2}|2[0-4]\\d|25[0-5])){3}:[0-9]+/)?)" +
24: "{1}.+){1}quot;",
25: Pattern.CASE_INSENSITIVE);
26:
27: publicstaticvoid main(String[] args) {
28: String baseUrl = "http://www.sina.com";
29: spiderInternet(baseUrl, "");
30: }
31:
32: privatestaticvoid spiderInternet(String baseUrl, String exUrl) {
33: if (baseUrl.endsWith("/") && exUrl.startsWith("/")) {
34: baseUrl = baseUrl.substring(0, baseUrl.length() - 1);
35: }
36: String new_url = baseUrl + exUrl;
37: if (urlSet.contains(new_url)) {
38: return;
39: }
40: System.out.println(new_url);
41: try {
42: Document doc = Jsoup.connect(new_url).get();
43: urlSet.add(new_url);
44: Elements links = doc.select("a[href]");
45: for (Element link : links) {
46: String linkHref = link.attr("href");
47: if (linkHref.equals("#")) {
48: return;
49: }
50: Matcher matcher = p.matcher(linkHref);
51: if (matcher.matches()) {
52: spiderInternet(linkHref, "");
53: } else {
54: spiderInternet(baseUrl, linkHref);
55: }
56: }
57: } catch (IOException e) {
58: e.printStackTrace();
59: }
60: }
61:
62: }

// Just a test, nothing more.
// Reposted from: https://blog.51cto.com/drh0534/1333628