源码地址:http://download.csdn.net/detail/yx511500623/6258163
开发环境
eclipse for jee+jdk7+tomcat7
lucene4.4+crawler4j3.5
索引文件生成位置:/csdn-blog-crawler/data
请记得将生成的索引文件复制到:/csdn-blog-crawler/WebContent
关键代码如下:
/csdn-blog-crawler/src/cn/crawler/lucene/util/HtmlUtil.java
package cn.crawler.lucene.util;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.CssSelectorNodeFilter;
import org.htmlparser.util.NodeList;
public class HtmlUtil{
public static String getText(String html,String id){
try {
Parser parser = new Parser(html);
NodeFilter filter=new CssSelectorNodeFilter("#"+id);
NodeList nList=parser.extractAllNodesThatMatch(filter);
return nList==null||nList.size()==0?null: nList.elementAt(0).toPlainTextString();
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
public static String getTextByClass(String html,String css_class){
try {
Parser parser = new Parser(html);
NodeFilter filter=new CssSelectorNodeFilter("."+css_class);
NodeList nList=parser.extractAllNodesThatMatch(filter);
return nList==null||nList.size()==0?null: nList.elementAt(0).toPlainTextString();
} catch (Exception e) {
e.printStackTrace();
return null;
}
}
public static String filterText(String text){
if(text==null) return null;
text=text.replace(">",">");
text=text.replace("<","<");
text=text.replace(""","\"");
text=text.replace(" "," ");
text=text.replace("&","&");
text=text.replace("©","©");
text=text.replace(" ","");
return text;
}
}
截图如下: