网页中的标题（title）、Keywords 和 Description 这三项对于建立网页索引非常重要。以下代码使用 HTMLParser 对网页进行解析，以获取这三个值。
// Parses a local HTML file with HTMLParser and prints, to standard output, the page
// title plus the "content" attribute of its Keywords and Description <meta> tags.
// Any failure while parsing is reported via a stack trace; nothing is rethrown.
try {
    // Accept a node when it is either a <title> tag or a <meta> tag.
    NodeFilter titleFilter = new TagNameFilter("title");
    NodeFilter metaFilter = new TagNameFilter("meta");
    OrFilter titleOrMeta = new OrFilter();
    titleOrMeta.setPredicates(new NodeFilter[] {titleFilter, metaFilter});

    Parser parser = new Parser();
    parser.setURL("D:\\test.html");
    // NOTE(review): this re-applies the encoding the parser already reports, which
    // looks redundant -- confirm against the HTMLParser version in use before removing.
    parser.setEncoding(parser.getEncoding());

    NodeList matches = parser.extractAllNodesThatMatch(titleOrMeta);
    for (int i = 0; i < matches.size(); i++) {
        Tag tag = (Tag) matches.elementAt(i);
        if (tag instanceof MetaTag) {
            // <meta> tags without a "name" attribute (e.g. http-equiv ones) yield null
            // here; calling equalsIgnoreCase on the literal keeps the check null-safe.
            String name = tag.getAttribute("name");
            if ("Keywords".equalsIgnoreCase(name)) {
                System.out.println("Keywords : " + tag.getAttribute("content"));
            } else if ("Description".equalsIgnoreCase(name)) {
                System.out.println("Description : " + tag.getAttribute("content"));
            }
        } else if (tag instanceof TitleTag) {
            // Tag.getText() returns the raw text of the tag itself (e.g. "title"),
            // not the text enclosed by <title>...</title>; getTitle() returns the
            // enclosed title string, which is what should be printed.
            System.out.println("Title : " + ((TitleTag) tag).getTitle());
        }
    }
} catch (Exception e) {
    e.printStackTrace();
}