- import java.io.Reader;
- import java.util.Set;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.LetterTokenizer;
- import org.apache.lucene.analysis.LowerCaseFilter;
- import org.apache.lucene.analysis.StopAnalyzer;
- import org.apache.lucene.analysis.StopFilter;
- import org.apache.lucene.analysis.TokenStream;
- import org.apache.lucene.util.Version;
- import org.junit.Test;
- public class MyStopAnalyzer extends Analyzer {
- private Set stops;
- public MyStopAnalyzer(){
- //原来的停用词
- stops = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
- }
- public MyStopAnalyzer(String[] sws){
- //会自动将字符串数组转换为set
- stops = StopFilter.makeStopSet(Version.LUCENE_35, sws, true);
- //将原来的挺用词加入到现有的停用词集合
- stops.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
- }
- @Override
- public TokenStream tokenStream(String fieldName, Reader reader) {
- return new StopFilter(Version.LUCENE_35,
- new LowerCaseFilter(Version.LUCENE_35,
- new LetterTokenizer(Version.LUCENE_35, reader)),stops);
- }
- }
- import org.apache.lucene.analysis.Analyzer;
- import org.junit.Test;
- public class TestMyStopAnalyzer {
- @Test
- public void test04(){
- Analyzer mya = new MyStopAnalyzer(new String[]{"I","you"});
- String txt = "thank you ,how are you? i love you";
- AnalyzerUtil.displayToken(txt, mya);
- }
- }
- import java.io.IOException;
- import java.io.StringReader;
- import javax.swing.text.AttributeSet.CharacterAttribute;
- import org.apache.lucene.analysis.Analyzer;
- import org.apache.lucene.analysis.SimpleAnalyzer;
- import org.apache.lucene.analysis.StopAnalyzer;
- import org.apache.lucene.analysis.TokenStream;
- import org.apache.lucene.analysis.WhitespaceAnalyzer;
- import org.apache.lucene.analysis.standard.StandardAnalyzer;
- import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
- import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
- import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
- import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
- import org.apache.lucene.util.Version;
- import org.junit.Test;
- public class AnalyzerUtil {
- public static void displayToken(String txt,Analyzer a){
- TokenStream ts = a.tokenStream("content", new StringReader(txt));
- //位置增量的属性,存储词汇单元之间的距离
- PositionIncrementAttribute pia = ts.addAttribute(PositionIncrementAttribute.class);
- //每个词汇单元的位置偏移量
- OffsetAttribute oa = ts.addAttribute(OffsetAttribute.class);
- //存储每一个与会单元的信息(分词单元信息)
- CharTermAttribute ca = ts.addAttribute(CharTermAttribute.class);
- //是用的分词器的类型信息
- TypeAttribute ta = ts.addAttribute(TypeAttribute.class);
- try {
- while(ts.incrementToken()){
- System.out.println(ca.toString()+" positionincrement:"+pia.getPositionIncrement()+" "+"offset:"+oa.startOffset()+"-"+oa.endOffset()+" type:"+ta.type());
- }
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
- System.out.println("-----------");
- }
- }
mmseg中文分词器的使用,在这里我使用的分词器版本是1.8.5
- import java.io.StringReader;
- import org.apache.lucene.analysis.Analyzer;
- import org.junit.Test;
- import com.chenlb.mmseg4j.analysis.MMSegAnalyzer;
- public class TestMMsegAnalyzer {
- @Test
- public void test01(){
- Analyzer msgA = new MMSegAnalyzer();
- String txt = "白云山我来自中国,你好,我叫大工,";
- AnalyzerUtil.displayToken(txt, msgA);
- }
- }
转载于:https://blog.51cto.com/soukenan/1122724