1.分词器部分
/**
* "lucene分析器使用分词器和过滤器构成一个“管道”,文本在流经这个管道后成为可以进入索引的最小单位,
因此,一个标准的分析器有两个部分组成,一个是分词器tokenizer,它用于将文本按照规则切分为一个个可以进入索引的最小单位。
另外一个是TokenFilter,它主要作用是对切出来的词进行进一步的处理(如去掉敏感词、英文大小写转换、单复数处理)等。
lucene中的TokenStream方法首先创建一个tokenizer对象处理Reader对象中的流式文本,然后利用TokenFilter对输出流进行过滤处理";
*/
~1. 测试各种分词器的分词原则
// Build a TokenStream from the analyzer over the sample text in field "content"
TokenStream stream=analyzer.tokenStream("content",new StringReader(s));
// Term attribute: holds each token's text as the stream is advanced
CharTermAttribute charTermAttribute=stream.addAttribute(CharTermAttribute.class);
// Position increment: distance (in token positions) from the previous token
PositionIncrementAttribute po=stream.addAttribute(PositionIncrementAttribute.class);
// Offsets: start and end character positions of the token in the source text
OffsetAttribute offsetAttribute=stream.addAttribute(OffsetAttribute.class);
// Lexical type of the token (e.g. <ALPHANUM>, <NUM>)
TypeAttribute typeAttribute=stream.addAttribute(TypeAttribute.class);
~2. 常见的分词器
// StandardAnalyzer: keeps numbers, splits CJK text into single characters:
// [i][like][liyp][i][limao][book][时][代][十][年][2372897][大][家][ask][88882]
Analyzer analyzer1=new StandardAnalyzer(Version.LUCENE_35);
// StopAnalyzer: drops numbers and English stop words; CJK runs stay whole:
// [i][like][liyp][i][limao][book][时代十年][大家ask]
Analyzer analyzer2=new StopAnalyzer(Version.LUCENE_35);
// SimpleAnalyzer: splits on non-letters, drops numbers, keeps stop words:
// [i][like][liyp][i][is][limao][this][is][book][时代十年][大家ask]
Analyzer analyzer3=new SimpleAnalyzer(Version.LUCENE_35);
// WhitespaceAnalyzer: splits on whitespace only — numbers, punctuation and
// CJK text all survive untouched:
// [i][like][liyp,i][is][limao,this][is][book,时代十年,2372897][大家ask][88882]
Analyzer analyzer4=new WhitespaceAnalyzer(Version.LUCENE_35);
// Sample input that produced the token lists above
String s="i like liyp,i is limao,this is book,时代十年,2372897 大家ask 88882";
==================================================================================
~3.自定义分词器
@Test
public void test3(){
    // Custom stop words to merge with StopAnalyzer's default English set
    String[] stopWords={"i","are","liyp"};
    String input="i like liyp,i is limao,this is book,时代十年,2372897 大家ask 88882";
    // Analyzer built from the custom stop words plus the defaults
    Analyzer customAnalyzer=new Analyzer01(stopWords);
    // Plain instance used only to drive the tokenization helper
    Analyzer01 driver=new Analyzer01();
    driver.test1(input, customAnalyzer);
}
/**
 * Custom analyzer: letter-tokenizes, lower-cases, then removes stop words.
 * The stop-word set is the caller-supplied word list merged with
 * StopAnalyzer's default English set; the no-arg constructor uses the
 * defaults alone (previously it left {@code stops} null, which caused an
 * NPE the first time {@link #tokenStream} was called).
 */
public class Analyzer01 extends Analyzer{
    // Combined stop-word set used by the filter chain; never null.
    private Set stops;
    public Analyzer01(){
        // No custom words: fall back to the default English stop words.
        stops=StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    }
    public Analyzer01(String[] s){
        // ignoreCase=true so stop-word matching is case-insensitive.
        stops=StopFilter.makeStopSet(Version.LUCENE_35, s, true);
        System.out.println("StopFilter默认过滤的词:"+StopAnalyzer.ENGLISH_STOP_WORDS_SET);
        // Merge the default English stop words into the custom set.
        stops.addAll(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    }
    /**
     * Filter chain: LetterTokenizer -> LowerCaseFilter -> StopFilter.
     * None of these constructors throw checked exceptions, so the old
     * try/catch that swallowed errors and returned null has been removed.
     */
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        return new StopFilter(Version.LUCENE_35,
                new LowerCaseFilter(Version.LUCENE_35,
                        new LetterTokenizer(Version.LUCENE_35, reader)), stops);
    }
}
==================================================================================
~4.同义词操作 加入同义词之后需要更新索引
/**
 * Analyzer that combines MMSeg Chinese word segmentation with a
 * synonym-expanding filter (MySameNameFilter).
 */
public class Analyzer02 extends Analyzer{
    // Shared MMSeg dictionary instance.
    Dictionary dic=null;
    public Analyzer02(){
        dic=Dictionary.getInstance();
    }
    @Override
    public TokenStream tokenStream(String fieldName, Reader reader) {
        // MMSeg max-word segmentation feeds into the synonym filter.
        MMSegTokenizer tokenizer=new MMSegTokenizer(new MaxWordSeg(dic), reader);
        return new MySameNameFilter(tokenizer);
    }
}
/**
 * TokenFilter that injects synonyms for recognized terms. Each synonym is
 * emitted at the same position as the original token (position increment 0)
 * so phrase and position-based queries still match.
 */
public class MySameNameFilter extends TokenFilter{
    // Hard-coded synonym table, built once. The old code rebuilt this
    // HashMap inside getSame() for every single token.
    private static final Map<String,String[]> SYNONYMS=new HashMap<String, String[]>();
    static{
        SYNONYMS.put("中国",new String[]{"大陆","天朝"});
        SYNONYMS.put("我",new String[]{"咱","俺"});
    }
    private CharTermAttribute charTermAttribute;
    private PositionIncrementAttribute incrementAttribute;
    // Synonyms still to be emitted for the most recent matching token.
    private Stack<String> stack=null;
    // Captured attribute state of the original token; restored per synonym.
    private AttributeSource.State current;
    public MySameNameFilter(TokenStream input) {
        super(input);
        charTermAttribute=input.addAttribute(CharTermAttribute.class);
        incrementAttribute=input.addAttribute(PositionIncrementAttribute.class);
        stack=new Stack<String>();
    }
    @Override
    public boolean incrementToken() throws IOException {
        // First drain any queued synonyms for the previous token.
        if(stack.size()>0){
            String s=stack.pop();
            // Clone the original token's state, then overwrite its text.
            restoreState(current);
            charTermAttribute.setEmpty();
            charTermAttribute.append(s);
            // Increment 0: synonym shares the original token's position.
            incrementAttribute.setPositionIncrement(0);
            return true;
        }
        if(!input.incrementToken()){
            return false;
        }
        if(getSame(charTermAttribute.toString())){
            // Remember this token's full state so synonyms can copy it.
            current=captureState();
        }
        return true;
    }
    /**
     * Queues any synonyms of {@code name} onto the stack.
     * @return true if at least one synonym was queued
     */
    public boolean getSame(String name){
        String[] sames=SYNONYMS.get(name);
        if(sames!=null){
            for (String s : sames) {
                stack.push(s);
            }
            return true;
        }
        return false;
    }
}