1.项目结构
2.Test.java
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.cfg.DefaultConfig;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.lucene.IKAnalyzer;
/**
 * Demo: segments a sample Chinese sentence with IK Analyzer after adding a few
 * custom words to the IK dictionary at runtime, and prints the tokens.
 */
public class Test {
    /**
     * Registers extra dictionary words, tokenizes a fixed sample sentence and
     * prints every token followed by {@code "|"} on a single line.
     *
     * @param args unused
     * @throws IOException if the token stream fails while reading or closing
     */
    public static void main(String[] args) throws IOException {
        String text = "<三峡人家-清江画廊动车2日游>吊脚楼、民歌和土家幺妹、 住商圈酒店 跟";
        // Create the analyzer: true = smart segmentation, false = fine-grained.
        Analyzer anal = new IKAnalyzer(true);
        try {
            Configuration cfg = DefaultConfig.getInstance();
            // System.out.println(cfg.getMainDictionary()); // system default dictionary
            // System.out.println(cfg.getQuantifierDicionary());
            Dictionary.initial(cfg);

            // Words added to the dictionary singleton before tokenizing.
            List<String> list = new ArrayList<String>();
            list.add("土家幺妹");
            list.add("2日游");
            list.add("三峡");
            list.add("三峡人家");
            Dictionary.getSingleton().addWords(list);

            StringReader reader = new StringReader(text);
            // Tokenize.
            TokenStream ts = anal.tokenStream("", reader);
            try {
                CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
                // The TokenStream consumer workflow requires reset() before the
                // first incrementToken(); newer Lucene versions throw otherwise.
                ts.reset();
                // Iterate over the tokens.
                while (ts.incrementToken()) {
                    System.out.print(term.toString() + "|");
                }
                // Signal end-of-stream so the stream can finalize its state.
                ts.end();
            } finally {
                ts.close();
                reader.close();
            }
        } finally {
            anal.close();
        }
        System.out.println();
    }
}
3.IkSegmentation.java
import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.wltea.analyzer.lucene.IKAnalyzer;
/**
 * Small helper around IK Analyzer (smart mode) that turns a sentence into a
 * {@code "|"}-delimited token string.
 */
public class IkSegmentation {
    /**
     * Segments {@code sentence} with a smart-mode {@link IKAnalyzer}.
     *
     * @param sentence the text to segment
     * @return every token followed by {@code "|"}, concatenated in stream
     *         order, trimmed, and terminated by a single {@code "\n"}
     * @throws IOException if the token stream fails while reading or closing
     */
    public static String Seg(String sentence) throws IOException {
        StringBuilder text = new StringBuilder();
        // Create the analyzer (true = smart segmentation).
        Analyzer anal = new IKAnalyzer(true);
        try {
            StringReader reader = new StringReader(sentence);
            // Tokenize.
            TokenStream ts = anal.tokenStream("", reader);
            try {
                CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
                // The TokenStream consumer workflow requires reset() before the
                // first incrementToken(); newer Lucene versions throw otherwise.
                ts.reset();
                // Iterate over the tokens.
                while (ts.incrementToken()) {
                    text.append(term.toString()).append("|");
                }
                // Signal end-of-stream so the stream can finalize its state.
                ts.end();
            } finally {
                ts.close();
                reader.close();
            }
        } finally {
            anal.close();
        }
        return text.toString().trim() + "\n";
    }

    /**
     * Segments a fixed sample sentence and prints the result.
     *
     * @param args unused
     * @throws IOException if segmentation fails
     */
    public static void main(String[] args) throws IOException {
        // Seg is static, so it is called on the class rather than an instance.
        System.out.println(IkSegmentation.Seg("<三峡人家-清江画廊动车2日游>吊脚楼、民歌和土家幺妹、 住商圈酒店 跟"));
    }
}
4.IKAnalyzer.cfg.xml
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE properties SYSTEM "http://java.sun.com/dtd/properties.dtd">
<properties>
<comment>IK Analyzer 扩展配置</comment>
<!-- Users can configure their own extension dictionaries here -->
<entry key="ext_dict">ext.dic;</entry>
<!-- Users can configure their own extension stop-word dictionaries here -->
<entry key="ext_stopwords">stopword.dic;</entry>
</properties>