/*
 * Tokenization with Lucene's built-in analyzers:
 *   SimpleAnalyzer
 *   StopAnalyzer
 *   WhitespaceAnalyzer
 *   StandardAnalyzer
 */
package org.algorithm;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.util.Version;
public class IKAnalyzerDemo04 {

    /**
     * Tokenizes {@code str} with the given Lucene analyzer and prints the
     * tokens to stdout, separated by {@code "|"} and terminated by a newline.
     *
     * @param str      the text to tokenize
     * @param analyzer the Lucene analyzer to use (not closed by this method)
     */
    public static void getIKAnalyzer(String str, Analyzer analyzer) {
        try {
            // Build a token stream over the string; the field name is unused here.
            TokenStream stream = analyzer.tokenStream("", new StringReader(str));
            try {
                // Attribute view that exposes the current token's text.
                CharTermAttribute cta = stream.addAttribute(CharTermAttribute.class);
                stream.reset(); // mandatory before the first incrementToken()
                while (stream.incrementToken()) {
                    System.out.print(cta + "|");
                }
                stream.end(); // finalize end-of-stream state per the TokenStream contract
            } finally {
                // Must close the stream, otherwise reusing the analyzer on the
                // next call throws IllegalStateException and resources leak.
                stream.close();
            }
            System.out.println();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads the whole file at {@code c:\292.txt} and runs it through four
     * built-in analyzers (Standard, Stop, Simple, Whitespace) for comparison.
     *
     * @param args unused
     * @throws IOException if the input file cannot be read
     */
    public static void main(String[] args) throws IOException {
        String path = "c:\\292.txt";
        // StringBuilder: no synchronization needed for this single-threaded build.
        StringBuilder sb = new StringBuilder();
        BufferedReader br = new BufferedReader(new FileReader(new File(path)));
        try {
            String temp;
            while ((temp = br.readLine()) != null) {
                // Re-append the line terminator stripped by readLine().
                sb.append(temp).append("\r\n");
            }
        } finally {
            br.close(); // always release the file handle
        }
        String str = sb.toString();
        Analyzer analyzer1 = new StandardAnalyzer(Version.LUCENE_40);
        Analyzer analyzer2 = new StopAnalyzer(Version.LUCENE_40);
        Analyzer analyzer3 = new SimpleAnalyzer(Version.LUCENE_40);
        Analyzer analyzer4 = new WhitespaceAnalyzer(Version.LUCENE_40);
        getIKAnalyzer(str, analyzer1);
        getIKAnalyzer(str, analyzer2);
        getIKAnalyzer(str, analyzer3);
        getIKAnalyzer(str, analyzer4);
    }
}