// Scrapes one download page in two passes:
//   1. prints the first link found inside the first element with class="download1";
//   2. prints the plain text of the second element with class="block2".

// URL of the page to scrape; used for the initial fetch and again before the second pass.
final String downloadPageUrl = "http://www.downv.com/Windows/download-IgCodec-10400398.htm";

// Parser bound to the remote page.
Parser parser = new Parser(downloadPageUrl);
// Separate, initially empty parser used to re-parse an HTML fragment when looking for links.
Parser linkParser = new Parser();

// Attribute filters selecting elements by the value of their "class" attribute.
HasAttributeFilter downloadLinkFilter = new HasAttributeFilter("class", "download1");
HasAttributeFilter descFilter = new HasAttributeFilter("class", "block2");
// Filter matching nodes of type LinkTag.
NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);

// Pass 1: collect the nodes matching the download-link filter.
NodeList downloadTag = parser.extractAllNodesThatMatch(downloadLinkFilter);
// NOTE(review): CommonUtil.isEmpty is a project helper; presumably true for a null or
// empty NodeList -- confirm against its implementation.
if (!CommonUtil.isEmpty(downloadTag)) {
    // Only the first matching element is inspected.
    Node downloadNode = downloadTag.elementAt(0);
    // Feed that element's HTML to the second parser and pull the link tags out of it.
    linkParser.setInputHTML(downloadNode.toHtml());
    NodeList links = linkParser.extractAllNodesThatMatch(linkFilter);
    if (links.size() > 0) {
        // Print the link extracted from the first link tag.
        System.out.println(((LinkTag) links.elementAt(0)).extractLink());
    }
}

// Pass 2: set the URL again before extracting a second time (the first pass above has
// already run over this parser).
parser.setURL(downloadPageUrl);
// Register the custom HtmlParsePTag with a node factory and install that factory on the
// parser before the second extraction.
PrototypicalNodeFactory factory = new PrototypicalNodeFactory();
factory.registerTag(new HtmlParsePTag());
parser.setNodeFactory(factory);

// Collect the nodes matching the description filter.
NodeList descTag = parser.extractAllNodesThatMatch(descFilter);
// Only the second match (index 1) is printed, and only when at least two matches exist.
// NOTE(review): presumably the description lives in the second "block2" element on this
// page -- confirm against the page markup.
if (descTag.size() >= 2) {
    System.out.println(descTag.elementAt(1).toPlainTextString());
}
/**
 * Custom tag class for scraping: a {@code CompositeTag} subclass that reports
 * {@code "p"} both as its tag id and as its end-tag ender.
 */
public class HtmlParsePTag extends CompositeTag {

    /** Tag ids returned by {@link #getIds()}. */
    private static final String[] TAG_IDS = { "p" };

    /** End-tag names returned by {@link #getEndTagEnders()}. */
    private static final String[] END_TAG_ENDERS = { "p" };

    /** Creates the tag; no state is initialised here. */
    public HtmlParsePTag() {
    }

    /**
     * Returns the tag ids for this tag class.
     *
     * @return the shared id array, containing the single entry {@code "p"}
     */
    public String[] getIds() {
        return TAG_IDS;
    }

    /**
     * Returns the end-tag names declared for this tag class.
     *
     * @return the shared array, containing the single entry {@code "p"}
     */
    public String[] getEndTagEnders() {
        return END_TAG_ENDERS;
    }
}