下面是一段简单的示例代码:从文件中读取内容,去除停用词并进行词干提取,处理后的词项保存在 `result` 中。
// Reads the document named by docName, drops stop words, applies Porter stemming,
// and collects the resulting terms in `result` (each term is also printed).
Version matchVersion = Version.LUCENE_43;
// Stop-word list handed to the analyzer.
// NOTE(review): the StandardAnalyzer(Version, Reader) constructor is expected to
// consume and close this reader itself — confirm against the Lucene version in use.
BufferedReader stopwordsReader = new BufferedReader(new FileReader("a.txt"));
Analyzer analyzer = new StandardAnalyzer(matchVersion, stopwordsReader);
List<String> result = new ArrayList<String>();
BufferedReader fileReader = null;
TokenStream ts = null;
try {
    fileReader = new BufferedReader(new FileReader(new File(docName)));
    // The field name is passed as null, exactly as in the original snippet.
    ts = new PorterStemFilter(analyzer.tokenStream(null, fileReader));
    CharTermAttribute charTermAttribute = ts.addAttribute(CharTermAttribute.class);
    ts.reset(); // required before the first incrementToken() call
    while (ts.incrementToken()) {
        String term = charTermAttribute.toString();
        result.add(term);
        System.out.println(term);
    }
    ts.end();
} finally {
    // Release resources even when tokenization fails part-way through.
    try {
        if (ts != null) {
            // NOTE(review): closing the stream chain is expected to close the
            // wrapped reader as well — confirm for the Lucene version in use.
            ts.close();
        } else if (fileReader != null) {
            // The token stream was never created, so close the reader directly.
            fileReader.close();
        }
    } finally {
        analyzer.close();
    }
}
System.out.println(result.size());
参考: http://stackoverflow.com/questions/5391840/stemming-english-words-with-lucene