代码如下:
1
/** */
/** The regex for search link with the tag "a" */
2 private final String A_REGEX = "<a.*?/a>";
3 /** */ /** The regex for search url with the tag "href" */
4 private final String HREF_REGEX = "href=\".*?\"";
5 /** */ /** The pattern for linke with the tag "a" */
6 private final Pattern A_PATTERN = Pattern.compile(A_REGEX);
7 /** */ /** The pattern for url with the tag "href" */
8 private final Pattern HREF_PATTERN = Pattern.compile(HREF_REGEX);
9 /** */ /**
10 * Get url address from the url and the content of the url
11 * @param url the url need to be get links
12 * @param content the content of the given url
13 * @return a list with the url address of the links
14 */
15 public List<String> getLinkList( URL url, String content )
16 {
17 List<String> linkList = new LinkedList<String>();
18 final Matcher a_matcher = A_PATTERN.matcher(content);
19 while (a_matcher.find())
20 {
21 //JUST FOR TEST!
22// System.out.println(a_matcher.group());
23 //get url address
24 final Matcher myurl = HREF_PATTERN.matcher(a_matcher.group());
25 while (myurl.find())
26 {
27 String urlAddress = myurl.group().replaceAll("href=|>|\"|\"", "");
28 if( urlAddress.startsWith("http") )
29 {
30 linkList.add(urlAddress);
31 }
32 else if( urlAddress.startsWith("/") || urlAddress.startsWith("\\") )
33 {
34 linkList.add(url.getPath()+urlAddress);
35 }
36 else
37 {
38 String fullUrl = url.toString();
39 //the length of the url without the current page
40 int lastSlash = fullUrl.lastIndexOf("/") + 1;
41 linkList.add(fullUrl.substring(0,lastSlash) + urlAddress);
42 }
43 }
44 }
45 return linkList;
46 }
2 private final String A_REGEX = "<a.*?/a>";
3 /** */ /** The regex for search url with the tag "href" */
4 private final String HREF_REGEX = "href=\".*?\"";
5 /** */ /** The pattern for linke with the tag "a" */
6 private final Pattern A_PATTERN = Pattern.compile(A_REGEX);
7 /** */ /** The pattern for url with the tag "href" */
8 private final Pattern HREF_PATTERN = Pattern.compile(HREF_REGEX);
9 /** */ /**
10 * Get url address from the url and the content of the url
11 * @param url the url need to be get links
12 * @param content the content of the given url
13 * @return a list with the url address of the links
14 */
15 public List<String> getLinkList( URL url, String content )
16 {
17 List<String> linkList = new LinkedList<String>();
18 final Matcher a_matcher = A_PATTERN.matcher(content);
19 while (a_matcher.find())
20 {
21 //JUST FOR TEST!
22// System.out.println(a_matcher.group());
23 //get url address
24 final Matcher myurl = HREF_PATTERN.matcher(a_matcher.group());
25 while (myurl.find())
26 {
27 String urlAddress = myurl.group().replaceAll("href=|>|\"|\"", "");
28 if( urlAddress.startsWith("http") )
29 {
30 linkList.add(urlAddress);
31 }
32 else if( urlAddress.startsWith("/") || urlAddress.startsWith("\\") )
33 {
34 linkList.add(url.getPath()+urlAddress);
35 }
36 else
37 {
38 String fullUrl = url.toString();
39 //the length of the url without the current page
40 int lastSlash = fullUrl.lastIndexOf("/") + 1;
41 linkList.add(fullUrl.substring(0,lastSlash) + urlAddress);
42 }
43 }
44 }
45 return linkList;
46 }