HTMLParser 用于解析 HTML 文件，同时可以提取其中的 URL。
代码如下:
package cn.com.vnvtrip.apache.luence.custom;
/**
 * String constants used when filtering HTML tags and inspecting extracted links.
 *
 * <p>Fields declared in an interface are implicitly {@code public static final},
 * so the modifiers are not repeated on each declaration.
 *
 * @author longgangbai
 */
public interface Constants {

    /** Name of the HTML {@code target} attribute. */
    String HTTP_URL_TARGET_TAG = "target";

    /** Name of the HTML {@code onclick} attribute. */
    String HTTP_URL_ONCLICK_TAG = "onclick";

    /**
     * The literal prefix {@code "bnu"}.
     * NOTE(review): the meaning of this prefix is not evident from this file; confirm with the author.
     */
    String HTTP_UBN_TAG = "bnu";

    /** The literal prefix {@code "http"}. */
    String HTTP_TAG = "http";
}
package cn.com.vnvtrip.apache.luence.custom;
import java.util.ArrayList;
import java.util.Collection;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
/**
 * Collects link entries from the page of a root website.
 *
 * @author longgangbai
 */
public class HTMLParser {

    /**
     * Parses the resource identified by {@code url} and returns one entry for
     * every link tag that carries both a {@code target} and an {@code onclick}
     * attribute.
     *
     * <p>For each matching link, the entry is the link's text when the extracted
     * link starts with {@code "bnu"} or {@code "http"}, and the extracted link
     * itself otherwise. Matching nodes that are not link tags, and links whose
     * extracted value is {@code null}, are skipped.
     *
     * @param url          location of the page to parse
     * @param pageEncoding character encoding applied to the parser
     * @return the collected entries in document order; never {@code null},
     *         possibly empty
     * @throws ParserException if the parser cannot be created, configured, or
     *                         fails while parsing
     */
    public static Collection<String> getWebSiteUrls(String url,
            String pageEncoding) throws ParserException {
        Collection<String> urls = new ArrayList<String>();

        Parser parser = new Parser(url);
        parser.setEncoding(pageEncoding);

        // The filter only checks attributes, so it can match tags other than links.
        NodeList nodeList = parser.parse(new AndFilter(
                new HasAttributeFilter(Constants.HTTP_URL_TARGET_TAG),
                new HasAttributeFilter(Constants.HTTP_URL_ONCLICK_TAG)));
        if (nodeList == null) {
            return urls;
        }

        for (int i = 0; i < nodeList.size(); i++) {
            Object node = nodeList.elementAt(i);
            // Guard the cast: a non-link tag with both attributes would
            // otherwise raise ClassCastException.
            if (!(node instanceof LinkTag)) {
                continue;
            }
            LinkTag linkTag = (LinkTag) node;

            String urlLink = linkTag.extractLink();
            if (urlLink == null) {
                continue;
            }

            // NOTE(review): links starting with "bnu"/"http" contribute their
            // link TEXT rather than the URL. This mirrors the original behavior;
            // confirm that it is intended and not an inverted condition.
            if (urlLink.startsWith(Constants.HTTP_UBN_TAG)
                    || urlLink.startsWith(Constants.HTTP_TAG)) {
                urls.add(linkTag.getLinkText());
            } else {
                urls.add(urlLink);
            }
        }
        return urls;
    }
}