## 1 Importing the required JARs in the pom file
Add the following dependencies to the project's pom.xml:
```xml
<dependencies>
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-core</artifactId>
        <version>7.2.0</version>
    </dependency>
    <!-- General-purpose analyzers, suitable for English text -->
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-analyzers-common</artifactId>
        <version>7.2.0</version>
    </dependency>
    <!-- Chinese analyzer -->
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-analyzers-smartcn</artifactId>
        <version>7.2.0</version>
    </dependency>
    <!-- Query parsing against the tokenized index -->
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-queryparser</artifactId>
        <version>7.2.0</version>
    </dependency>
    <!-- Highlighting of search keywords in results -->
    <dependency>
        <groupId>org.apache.lucene</groupId>
        <artifactId>lucene-highlighter</artifactId>
        <version>7.2.0</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/com.janeluo/ikanalyzer -->
    <dependency>
        <groupId>com.janeluo</groupId>
        <artifactId>ikanalyzer</artifactId>
        <version>2012_u6</version>
    </dependency>
</dependencies>
```
## 2 Adding the IK analyzer
Create the IK configuration file IKAnalyzer.cfg.xml under the project's resources directory with the following content:
```xml
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
    <comment>IK Analyzer extension configuration</comment>
    <!-- Users can configure their own extension dictionaries here -->
    <entry key="ext_dict">ext.dic;</entry>
    <!-- Users can configure their own extension stop-word dictionaries here -->
    <entry key="ext_stopwords">stopword.dic;ext_stopword.dic</entry>
</properties>
```
Also create the dictionary files referenced in this configuration (ext.dic, stopword.dic, and ext_stopword.dic) under the resources directory; an example of their format is shown below.
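Each of these dictionary files is a plain UTF-8 text file with one entry per line; the extension dictionary holds custom words that IK should keep as single tokens, and the stop-word files use the same one-word-per-line format. The entries below are purely illustrative:

```text
全文检索
中文分词
```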
## 3 Using the IK analyzer
To use the IK analyzer with this version of Lucene, the IKAnalyzer and IKTokenizer classes shipped with ikanalyzer must be replaced, since they target an older Lucene API. First, create a class IKAnalyzer6x that extends Lucene's Analyzer:
```java
package com.fulb.lucene.ik;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;

/**
 * @author fulibao
 * @version 1.0
 * @created 2018/2/18 10:25 PM
 **/
public class IKAnalyzer6x extends Analyzer {

    private boolean useSmart;

    public boolean useSmart() {
        return useSmart;
    }

    public void setUseSmart(boolean useSmart) {
        this.useSmart = useSmart;
    }

    // Lucene Analyzer implementation for IK; defaults to fine-grained segmentation
    public IKAnalyzer6x() {
        this(false);
    }

    // Lucene Analyzer implementation for IK; when useSmart is true, smart segmentation is used
    public IKAnalyzer6x(boolean useSmart) {
        super();
        this.useSmart = useSmart;
    }

    // Override the current createComponents signature to build the token stream components
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer _IKTokenizer = new IKTokenizer6x(this.useSmart());
        return new TokenStreamComponents(_IKTokenizer);
    }
}
```
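With the adapter in place, IKAnalyzer6x can be passed anywhere Lucene expects an Analyzer. The sketch below shows it wired into an IndexWriterConfig; the class name IKIndexDemo, the "indexdir" path, and the "content" field are illustrative assumptions, not part of the original article:

```java
package com.fulb.lucene.ik;

import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class IKIndexDemo {
    public static void main(String[] args) throws IOException {
        // "indexdir" and the "content" field name are placeholder choices
        Directory dir = FSDirectory.open(Paths.get("indexdir"));
        // Use the IK adapter with smart segmentation for indexing
        IndexWriterConfig config = new IndexWriterConfig(new IKAnalyzer6x(true));
        try (IndexWriter writer = new IndexWriter(dir, config)) {
            Document doc = new Document();
            doc.add(new TextField("content", "中华人民共和国简称中国", Field.Store.YES));
            writer.addDocument(doc);
        }
    }
}
```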
Next, create a class IKTokenizer6x that extends Lucene's Tokenizer and delegates the actual segmentation to IKSegmenter:
```java
package com.fulb.lucene.ik;

import java.io.IOException;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

/**
 * @author fulibao
 * @version 1.0
 * @created 2018/2/18 6:25 PM
 **/
public class IKTokenizer6x extends Tokenizer {

    // The underlying IK segmenter
    private IKSegmenter _IKImplement;
    // Term text attribute
    private final CharTermAttribute termAtt;
    // Term offset attribute
    private final OffsetAttribute offsetAtt;
    // Term type attribute (see the type constants in org.wltea.analyzer.core.Lexeme)
    private final TypeAttribute typeAtt;
    // End position of the last lexeme
    private int endPosition;

    // Adapter constructor implementing the current Tokenizer API
    public IKTokenizer6x(boolean useSmart) {
        super();
        offsetAtt = addAttribute(OffsetAttribute.class);
        termAtt = addAttribute(CharTermAttribute.class);
        typeAtt = addAttribute(TypeAttribute.class);
        _IKImplement = new IKSegmenter(input, useSmart);
    }

    @Override
    public boolean incrementToken() throws IOException {
        // Clear all token attributes
        clearAttributes();
        Lexeme nextLexeme = _IKImplement.next();
        if (nextLexeme != null) {
            // Copy the Lexeme into the Lucene attributes
            // Set the term text
            termAtt.append(nextLexeme.getLexemeText());
            // Set the term length
            termAtt.setLength(nextLexeme.getLength());
            // Set the term offsets
            offsetAtt.setOffset(nextLexeme.getBeginPosition(), nextLexeme.getEndPosition());
            // Record the end position of this lexeme
            endPosition = nextLexeme.getEndPosition();
            // Record the lexeme type (the original set the lexeme text here by mistake)
            typeAtt.setType(nextLexeme.getLexemeTypeString());
            // Return true to signal that another token is available
            return true;
        }
        // Return false to signal that no more tokens are available
        return false;
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        _IKImplement.reset(input);
    }

    @Override
    public final void end() {
        int finalOffset = correctOffset(this.endPosition);
        offsetAtt.setOffset(finalOffset, finalOffset);
    }
}
```
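For reference, the adapter above is only a thin wrapper: IKSegmenter does the actual work. A minimal sketch of calling it directly looks like the following; the class name IKSegmenterDemo and the sample sentence are illustrative:

```java
package com.fulb.lucene.ik;

import java.io.IOException;
import java.io.StringReader;
import org.wltea.analyzer.core.IKSegmenter;
import org.wltea.analyzer.core.Lexeme;

public class IKSegmenterDemo {
    public static void main(String[] args) throws IOException {
        // true = smart segmentation, false = fine-grained segmentation
        IKSegmenter segmenter = new IKSegmenter(new StringReader("中华人民共和国简称中国"), true);
        Lexeme lexeme;
        while ((lexeme = segmenter.next()) != null) {
            // Print each term with its character offsets in the input
            System.out.println(lexeme.getLexemeText()
                    + " [" + lexeme.getBeginPosition() + "," + lexeme.getEndPosition() + ")");
        }
    }
}
```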
## 4 Analyzer usage example
The following example runs the same Chinese sentence through several Lucene analyzers as well as the IK analyzer, printing the tokens each one produces:
```java
package com.fulb.lucene.analyzer;

import java.io.IOException;
import java.io.StringReader;
import java.nio.file.Paths;
import com.fulb.lucene.ik.IKAnalyzer6x;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.cjk.CJKAnalyzer;
import org.apache.lucene.analysis.cn.smart.SmartChineseAnalyzer;
import org.apache.lucene.analysis.core.KeywordAnalyzer;
import org.apache.lucene.analysis.core.SimpleAnalyzer;
import org.apache.lucene.analysis.core.StopAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

/**
 * @author fulibao
 * @version 1.0
 * @created 2018/2/18 5:47 PM
 **/
public class VariousAnalyzers {

    private static String str = "中华人民共和国简称中国, 是一个有13亿人口的国家";

    public static void main(String[] args) throws IOException {
        Analyzer analyzer = null;
        analyzer = new StandardAnalyzer(); // standard analyzer
        System.out.println("Standard analysis: " + analyzer.getClass());
        printAnalyzer(analyzer);
        analyzer = new WhitespaceAnalyzer(); // whitespace analyzer
        System.out.println("Whitespace analysis: " + analyzer.getClass());
        printAnalyzer(analyzer);
        analyzer = new SimpleAnalyzer(); // simple analyzer
        System.out.println("Simple analysis: " + analyzer.getClass());
        printAnalyzer(analyzer);
        analyzer = new CJKAnalyzer(); // CJK bigram analyzer
        System.out.println("CJK bigram analysis: " + analyzer.getClass());
        printAnalyzer(analyzer);
        analyzer = new KeywordAnalyzer(); // keyword analyzer (whole input as one token)
        System.out.println("Keyword analysis: " + analyzer.getClass());
        printAnalyzer(analyzer);
        // analyzer = new StopAnalyzer(Paths.get(VariousAnalyzers.class.getClassLoader().getResource("stopword.dic").getPath()));
        analyzer = new StopAnalyzer(); // stop-word analyzer
        System.out.println("Stop-word analysis: " + analyzer.getClass());
        printAnalyzer(analyzer);
        analyzer = new SmartChineseAnalyzer(); // smart Chinese analyzer
        System.out.println("Smart Chinese analysis: " + analyzer.getClass());
        printAnalyzer(analyzer);
        analyzer = new IKAnalyzer6x(); // IK analyzer, fine-grained segmentation
        System.out.println("IK analysis (fine-grained): " + analyzer.getClass());
        printAnalyzer(analyzer);
        analyzer = new IKAnalyzer6x(true); // IK analyzer, smart segmentation
        System.out.println("IK analysis (smart): " + analyzer.getClass());
        printAnalyzer(analyzer);
    }

    public static void printAnalyzer(Analyzer analyzer) throws IOException {
        StringReader reader = new StringReader(str);
        // The first argument to tokenStream is a field name, not the text; "text" is a placeholder
        TokenStream toStream = analyzer.tokenStream("text", reader);
        toStream.reset(); // reset the stream before consuming it
        CharTermAttribute teAttribute = toStream.getAttribute(CharTermAttribute.class);
        while (toStream.incrementToken()) {
            System.out.print(teAttribute.toString() + "|");
        }
        System.out.println("\n");
        toStream.close();
        analyzer.close();
    }
}
```
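The pom also pulls in lucene-queryparser; a minimal sketch of searching an index built with IKAnalyzer6x might look like the following. The class name IKSearchDemo, the "indexdir" path, the "content" field, and the query term are illustrative assumptions matching the indexing sketch above, not code from the original article:

```java
package com.fulb.lucene.ik;

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;

public class IKSearchDemo {
    public static void main(String[] args) throws Exception {
        Directory dir = FSDirectory.open(Paths.get("indexdir"));
        try (IndexReader reader = DirectoryReader.open(dir)) {
            IndexSearcher searcher = new IndexSearcher(reader);
            // Query with the same analyzer that was used at indexing time
            QueryParser parser = new QueryParser("content", new IKAnalyzer6x(true));
            Query query = parser.parse("中国");
            TopDocs topDocs = searcher.search(query, 10);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                System.out.println(searcher.doc(scoreDoc.doc).get("content"));
            }
        }
    }
}
```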