Lucene 4.10.3 自定义分词只需三步:
一、
package analyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Document;
import org.wltea.analyzer.lucene.IKTokenizer;
import java.io.*;
/**
* 自定义分词器
* Created with IntelliJ IDEA.
* User: wxshi
* Date: 15-2-11
* Time: 下午3:56
* To change this template use File | Settings | File Templates.
*/
public class MyAnalyzer extends Analyzer{
/**
* 自定义分词过程:
* 1.对流进行分词
* 2.对分词好的进行过滤处理
* 3.返回结果
* */
/**
 * Builds the analysis pipeline for a field:
 * 1. tokenize the incoming character stream with IK (fine-grained, non-smart mode)
 * 2. run the tokens through the custom synonym filter
 * 3. return the assembled tokenizer/filter pair
 *
 * @param s      the field name (unused here — every field gets the same pipeline)
 * @param reader the character stream to tokenize
 * @return the {@code TokenStreamComponents} Lucene consumes for this field
 */
@Override
protected TokenStreamComponents createComponents(String s, Reader reader) {
    IKTokenizer source = new IKTokenizer(reader, false); // IK tokenizer; false = fine-grained (non-smart) segmentation
    TokenStream filter = new MySynonymTokenFilter(source); // apply the custom synonym filter
    //filter = new BarFilter(filter); // further filters can be chained here
    return new TokenStreamComponents(source, filter); // two-arg constructor — recommended for multi-stage pipelines
    // return new TokenStreamComponents(new SynonymTokenIzer(reader)); // single-arg alternative for trivial pipelines
}
public static void main(String args[]){
try{
File file = new File("f:\\lucene\\indexFile\\文件1.txt");
Document doc = new Document();
FileInputStream fis = new FileInputStream(file);
Reader reader = new BufferedReader(new InputStreamReader(fis,"GBK"));
Analyzer analyzer = new MyAnalyzer();
TokenStream tokenStream = analyzer.tokenStream("content",reader); //ik分词流,不采用智