1、到google下载IKAnalyzer2012http://code.google.com/p/ik-analyzer/downloads/list
2、使用maven相关jar包的下载
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>3.6.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-highlighter</artifactId>
<version>3.6.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-memory</artifactId>
<version>3.6.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers</artifactId>
<version>3.6.0</version>
</dependency>
<dependency>
<groupId>IKAnalyzer</groupId>
<artifactId>IKAnalyzer</artifactId>
<version>2012.u6</version>
</dependency>
3.
建立一个java测试项目scmsplitkw
拷贝下载包里面的stopword.dic、IKAnalyzer.cfg.xml到项目源码根目录
因为要测试自定义中文分词,在源码根目录创建文件ext.dic
配置IKAnalyzer.cfg.xml,在properties内添加
写道
<!--用户可以在这里配置自己的扩展停止词字典-->
<entry key="ext_dict">ext.dic;</entry>
<!--用户可以在这里配置自己的扩展停止词字典-->
<entry key="ext_stopwords">stopword.dic;stopword_chinese.dic</entry>
4.
<entry key="ext_dict">ext.dic;</entry>
<!--用户可以在这里配置自己的扩展停止词字典-->
<entry key="ext_stopwords">stopword.dic;stopword_chinese.dic</entry>
package com.iris.scm.lucene.test;
import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;
public class IKAnalyzerTest {
public static void main(String[] args) throws Exception {
String keyWord = "YAG晶体采用过滤阴极真空电弧技术制备非晶金刚石薄膜,细粒赤铁矿.
IKAnalyzer analyzer = new IKAnalyzer();
// 使用智能分词
analyzer.setUseSmart(true);
System.out.println("当前使用的分词器:" + analyzer.getClass().getSimpleName());
TokenStream tokenStream = analyzer.tokenStream("content", new StringReader(keyWord));
tokenStream.addAttribute(CharTermAttribute.class);
while (tokenStream.incrementToken()) {
CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
System.out.println(new String(charTermAttribute.buffer()));
}
}
}