htmlparser打造个性化的爬虫程序 第一天
/**
 * Collects the link of every {@code LinkTag} found in the page at {@code url}
 * among the nodes accepted by a {@code LinkStringFilter} built from {@code pattern}.
 *
 * <p>A {@code ParserException} raised inside the method is not propagated:
 * its stack trace is printed and the list built so far is returned.
 *
 * @param url     address handed to {@code Parser.setURL}
 * @param pattern text handed to {@code LinkStringFilter}; the accompanying
 *                {@code true} flag presumably requests case-sensitive matching
 *                (NOTE(review): confirm against the HTMLParser API)
 * @return list of the values returned by {@code LinkTag.getLink()}, in the order
 *         the matching nodes were returned; never {@code null}
 * @author hym
 */
public List extractUrls(String url,String pattern)
{
    System.out.println("extractURL method here!");
    List links = new ArrayList();
    try
    {
        Parser pageParser = new Parser();
        pageParser.setURL(url);
        NodeList matches = pageParser.extractAllNodesThatMatch(new LinkStringFilter(pattern,true));
        int index = 0;
        while (index < matches.size())
        {
            Node candidate = matches.elementAt(index++);
            // Only link tags carry a URL; skip anything else the filter let through.
            if (!(candidate instanceof LinkTag))
            {
                continue;
            }
            links.add(((LinkTag)candidate).getLink());
        }
    }
    catch(ParserException e)
    {
        // Best-effort: report the failure and fall through to return what we have.
        e.printStackTrace();
    }
    return links;
}