Lucene 是 Apache 的一个开源全文搜索引擎库。我的需求是从一篇文章中抽取摘要;本人是新手,实现代码如下:
/**
 * Builds a keyword-highlighted summary of an article with the Lucene highlighter,
 * using the built-in demo query and a 300-character length threshold.
 *
 * @param txt the article text; {@code null} or empty yields an empty summary
 * @return the matching sentences, each wrapped in highlight markup where a keyword
 *         occurs and each followed by a comma; empty if nothing matched
 * @throws ParseException if the query cannot be parsed
 * @throws IOException if analysing the text fails
 * @throws InvalidTokenOffsetsException if the highlighter meets inconsistent token offsets
 */
public static String luceneSummary(String txt) throws ParseException, IOException, InvalidTokenOffsetsException {
    return luceneSummary(txt, "穆 沙拉 法院", 300);
}

/**
 * Builds a keyword-highlighted summary of an article with the Lucene highlighter.
 *
 * <p>The article is split into sentences on {@code 。} and {@code ,}. Every sentence in
 * which the highlighter finds a query term is appended (with the term wrapped in
 * {@code <font color='red'>...</font>}) followed by a comma. Collection stops as soon as
 * the accumulated summary is longer than {@code maxLength}, so the result may exceed
 * {@code maxLength} by up to one sentence.
 *
 * <p>NOTE(review): the sentence text is emitted without HTML escaping; escape it before
 * rendering if the article comes from an untrusted source.
 *
 * @param txt the article text; {@code null} or empty yields an empty summary
 * @param pQuery the keywords, in Lucene query syntax; must not be {@code null}
 * @param maxLength length (in chars, markup included) after which no more sentences are added
 * @return the concatenated highlighted sentences, or an empty string if nothing matched
 * @throws IllegalArgumentException if {@code pQuery} is {@code null}
 * @throws ParseException if {@code pQuery} cannot be parsed
 * @throws IOException if analysing the text fails
 * @throws InvalidTokenOffsetsException if the highlighter meets inconsistent token offsets
 */
public static String luceneSummary(String txt, String pQuery, int maxLength)
        throws ParseException, IOException, InvalidTokenOffsetsException {
    if (pQuery == null) {
        throw new IllegalArgumentException("pQuery must not be null");
    }
    if (txt == null || txt.length() == 0) {
        return "";
    }

    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_30);
    try {
        // Wrap every matched keyword in red-font markup.
        Formatter formatter = new SimpleHTMLFormatter("<font color='red'>", "</font>");
        QueryParser parser = new QueryParser(Version.LUCENE_30, "", analyzer);
        Highlighter highlighter = new Highlighter(formatter, new QueryScorer(parser.parse(pQuery)));
        // NullFragmenter: treat each sentence as a single fragment, with no size limit.
        highlighter.setTextFragmenter(new NullFragmenter());

        // Summarise sentence by sentence so the summary is made of whole sentences.
        // The character class deliberately has no '|': inside [...] it would be a
        // literal pipe, not alternation.
        String[] sentences = txt.split("[。,]");

        StringBuilder summary = new StringBuilder();
        for (String sentence : sentences) {
            // Only non-null results are kept; null is taken to mean no keyword matched.
            String highlighted = highlighter.getBestFragment(analyzer, "", sentence);
            if (highlighted != null) {
                summary.append(highlighted).append(',');
                if (summary.length() > maxLength) {
                    // Enough text collected; stop scanning the rest of the article.
                    break;
                }
            }
        }
        return summary.toString();
    } finally {
        // Release the analyzer's cached per-thread token streams.
        analyzer.close();
    }
}
/**
 * Command-line entry point: summarises the article passed as the first argument
 * and prints the highlighted summary to standard output.
 *
 * @param args {@code args[0]} is the article text to summarise
 * @throws IOException if analysing the text fails
 * @throws ParseException if the built-in query cannot be parsed
 * @throws InvalidTokenOffsetsException if the highlighter meets inconsistent token offsets
 */
public static void main(String[] args) throws IOException, ParseException, InvalidTokenOffsetsException {
    if (args.length < 1) {
        // Fail with a clear message instead of an ArrayIndexOutOfBoundsException.
        System.err.println("Usage: pass the article text as the first argument");
        System.exit(1);
    }
    // Print the result; otherwise the computed summary is silently discarded.
    System.out.println(luceneSummary(args[0]));
}