/**
 * Analyzer whose chain is: StandardTokenizer, StandardFilter, LowerCaseFilter,
 * StopFilter (using the configured stop-word set) and finally MyTokenFilter.
 */
public class MyAnalyzer extends Analyzer {

    /** Stop words handed to the StopFilter; never null. */
    private final CharArraySet stopsArraySet;

    /**
     * Creates an analyzer that removes no stop words.
     */
    public MyAnalyzer() {
        this(CharArraySet.EMPTY_SET);
    }

    /**
     * Creates an analyzer that removes the given stop words.
     *
     * @param stopsArraySet stop words to drop; {@code null} is treated as an empty set
     */
    public MyAnalyzer(CharArraySet stopsArraySet) {
        super();
        // A null set would otherwise be passed straight into StopFilter below,
        // which is expected to dereference it while filtering tokens.
        this.stopsArraySet = (stopsArraySet == null) ? CharArraySet.EMPTY_SET : stopsArraySet;
    }

    /**
     * Builds the tokenizer and filter chain for one field.
     *
     * @param fieldName name of the field being analyzed (not used to vary the chain)
     * @param reader    source of the text to tokenize
     * @return components wrapping the source tokenizer and the end of the filter chain
     */
    @Override
    protected TokenStreamComponents createComponents(String fieldName,
            Reader reader) {
        StandardTokenizer srcStandardTokenizer = new StandardTokenizer(Version.LUCENE_45, reader);
        TokenStream tokenStream = new StandardFilter(Version.LUCENE_45, srcStandardTokenizer);
        tokenStream = new LowerCaseFilter(Version.LUCENE_45, tokenStream);
        tokenStream = new StopFilter(Version.LUCENE_45, tokenStream, stopsArraySet);
        tokenStream = new MyTokenFilter(tokenStream);
        return new TokenStreamComponents(srcStandardTokenizer, tokenStream);
    }
}
/**
 * Tokenizes {@code str} with {@code analyzer} under the field name "content"
 * and prints one line per token to standard output, in the form
 * {@code [term]:positionIncrement startOffset endOffset type positionLength}.
 *
 * <p>IOExceptions raised while consuming or closing the stream are printed
 * via {@code printStackTrace()} and not rethrown.
 *
 * <p>Note: {@code ts} is not declared inside this method; it is presumably a
 * field of the enclosing class.
 *
 * @param str      text to analyze
 * @param analyzer analyzer that produces the token stream
 */
public void analysis(String str, Analyzer analyzer) {
    try {
        ts = analyzer.tokenStream("content", new StringReader(str));

        final CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        final PositionIncrementAttribute posIncr = ts.addAttribute(PositionIncrementAttribute.class);
        final OffsetAttribute offsets = ts.addAttribute(OffsetAttribute.class);
        final TypeAttribute tokenType = ts.addAttribute(TypeAttribute.class);
        final PositionLengthAttribute posLength = ts.addAttribute(PositionLengthAttribute.class);

        ts.reset();
        for (boolean hasToken = ts.incrementToken(); hasToken; hasToken = ts.incrementToken()) {
            StringBuilder line = new StringBuilder();
            line.append("[").append(term.toString()).append("]:");
            line.append(posIncr.getPositionIncrement()).append(" ");
            line.append(offsets.startOffset()).append(" ");
            line.append(offsets.endOffset()).append(" ");
            line.append(tokenType.type()).append(" ");
            line.append(posLength.getPositionLength());
            System.out.println(line.toString());
        }
        ts.end();
    } catch (IOException analysisFailure) {
        analysisFailure.printStackTrace();
    } finally {
        // Close whatever stream is currently held, even if consumption failed.
        if (ts != null) {
            try {
                ts.close();
            } catch (IOException closeFailure) {
                closeFailure.printStackTrace();
            }
        }
    }
}
/**
 * Token filter that emits hard-coded synonyms after selected terms.
 *
 * <p>When the incoming term is a key of the synonym table, the original token
 * is passed through first; on the following calls each synonym is emitted as
 * an extra token with the original token's captured attributes, a replaced
 * term text, and a position increment of 0. Synonyms come out in reverse
 * table order because they are buffered on a stack.
 */
public final class MyTokenFilter extends TokenFilter {

    private final CharTermAttribute charTermAttribute;
    private final PositionIncrementAttribute positionIncrementAttribute;

    /** Synonym table, built once per filter instance instead of once per token. */
    private final Map<String, String[]> synonyms = new HashMap<String, String[]>();

    /** Synonyms still to be emitted for the most recently matched token. */
    private final Stack<String> sames = new Stack<String>();

    /** Attribute state of the token whose synonyms are pending in {@link #sames}. */
    private State currentstate;

    /**
     * @param input upstream token stream to decorate
     */
    protected MyTokenFilter(TokenStream input) {
        super(input);
        charTermAttribute = input.addAttribute(CharTermAttribute.class);
        positionIncrementAttribute = input.addAttribute(PositionIncrementAttribute.class);
        synonyms.put("zhongguo", new String[]{"dalu", "tianchao"});
        synonyms.put("wo", new String[]{"zan", "me"});
    }

    /**
     * Emits a pending synonym if one is buffered; otherwise advances the
     * upstream stream and buffers synonyms for the new term, if any.
     *
     * @return {@code true} if a token was produced, {@code false} when the input is exhausted
     * @throws IOException if the upstream stream fails
     */
    @Override
    public boolean incrementToken() throws IOException {
        if (!sames.isEmpty()) {
            // Re-create the matched token, then overwrite only term text and position.
            restoreState(currentstate);
            charTermAttribute.setEmpty();
            charTermAttribute.append(sames.pop());
            positionIncrementAttribute.setPositionIncrement(0);
            return true;
        }
        if (!input.incrementToken()) {
            return false;
        }
        getsamewords(charTermAttribute.toString());
        return true;
    }

    /**
     * Clears buffered synonyms and the captured state so that a reused filter
     * does not emit leftovers from a previous, partially consumed stream.
     *
     * @throws IOException if resetting the upstream stream fails
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        sames.clear();
        currentstate = null;
    }

    /**
     * Buffers the synonyms of {@code name}, if it has any, and captures the
     * current attribute state so the synonyms can be emitted from it.
     *
     * @param name term text to look up in the synonym table
     */
    public void getsamewords(String name) {
        String[] searchStrings = synonyms.get(name);
        if (searchStrings == null) {
            return;
        }
        for (String synonym : searchStrings) {
            sames.push(synonym);
        }
        currentstate = captureState();
    }
}
测试:
/**
 * Demo driver: builds a case-insensitive stop-word set, wraps it in a
 * MyAnalyzer, and prints the tokens produced for a fixed sample sentence
 * through AnalyzerUtil.analysis.
 */
public void TestStandAnalyzer() {
    // Stop words, added in the same order as before.
    final String[] stopWords = {"this", "is", "a", "that", "like"};
    LinkedList<String> stopList = new LinkedList<String>();
    for (String word : stopWords) {
        stopList.add(word);
    }

    CharArraySet stopwordSet = new CharArraySet(Version.LUCENE_45, stopList, true);
    // The analyzer receives its own copy of the stop-word set.
    CharArraySet analyzerStops = new CharArraySet(Version.LUCENE_45, stopwordSet, true);
    MyAnalyzer analyzer = new MyAnalyzer(analyzerStops);

    String sample = "This is a dog. That is a cat. wo like football but wo don't like "
            + "basketball. "
            + "zhongguo is greate country.";

    AnalyzerUtil auAnalyzerUtil = new AnalyzerUtil();
    auAnalyzerUtil.analysis(sample, analyzer);
}
输出:
[dog]:4 10 13 <ALPHANUM> 1
[cat]:4 25 28 <ALPHANUM> 1
[wo]:1 30 32 <ALPHANUM> 1
[me]:0 30 32 <ALPHANUM> 1
[zan]:0 30 32 <ALPHANUM> 1
[football]:2 38 46 <ALPHANUM> 1
[but]:1 47 50 <ALPHANUM> 1
[wo]:1 51 53 <ALPHANUM> 1
[me]:0 51 53 <ALPHANUM> 1
[zan]:0 51 53 <ALPHANUM> 1
[don't]:1 54 59 <ALPHANUM> 1
[basketball]:2 65 75 <ALPHANUM> 1
[zhongguo]:1 77 85 <ALPHANUM> 1
[tianchao]:0 77 85 <ALPHANUM> 1
[dalu]:0 77 85 <ALPHANUM> 1
[greate]:2 89 95 <ALPHANUM> 1
[country]:1 97 104 <ALPHANUM> 1