Java 中文文本分词
本文使用 classifier4J 以及 IKAnalyzer2012_u6 实现中文分词。可以增加自定义词库:词库保存为 “exdict.dic” 文件(UTF-8 编码),一个词一行。
// MyTokenizer.java 文件
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import net.sf.classifier4J.ITokenizer;
import org.apache.lucene.analysis.tokenattributes.TermAttribute;
import org.wltea.analyzer.cfg.Configuration;
import org.wltea.analyzer.cfg.DefaultConfig;
import org.wltea.analyzer.dic.Dictionary;
import org.wltea.analyzer.lucene.IKTokenizer;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
/**
 * Chinese tokenizer backed by IKAnalyzer, exposed through the
 * classifier4J {@link ITokenizer} interface. On first use it loads an
 * optional user dictionary ("exdict.dic", one word per line, UTF-8) and
 * registers those words with IK's singleton dictionary.
 *
 * <p>Instances are stateless and safe to share: {@link #tokenize(String)}
 * uses only local variables.
 *
 * @author CSD
 */
@SuppressWarnings("deprecation")
public class MyTokenizer implements ITokenizer {
	private static final Logger logger = LogManager.getLogger(MyTokenizer.class);
	private static final Collection<String> exwordc = new ArrayList<>();
	private static final String exdict = "exdict.dic";

	// Load the extra dictionary once, at class-initialization time.
	static {
		try {
			// Read as UTF-8 explicitly; the platform default charset is not
			// reliable for Chinese dictionary files.
			BufferedReader reader = new BufferedReader(
					new InputStreamReader(new FileInputStream(new File(exdict)), "UTF-8"));
			try {
				String line;
				while ((line = reader.readLine()) != null) {
					String word = line.trim();
					// Skip blank lines so empty strings never enter the dictionary.
					if (!word.isEmpty()) {
						exwordc.add(word);
					}
				}
			} finally {
				// Close even if reading fails part-way through.
				reader.close();
			}
			logger.info("加载词典::" + exdict);
			// Register the custom words with IK's shared dictionary instance.
			Configuration cfg = DefaultConfig.getInstance();
			Dictionary dict = Dictionary.initial(cfg);
			dict.addWords(exwordc);
		} catch (IOException e) {
			logger.error(e + "------------------加载词典出错,请确认词典文件!------------------");
		}
	}

	/**
	 * Segments the given text into words using IK's smart mode.
	 *
	 * @param input the text to segment
	 * @return the tokens in order of appearance; empty array when the
	 *         tokenizer yields nothing or an I/O error occurs mid-stream
	 */
	public String[] tokenize(String input) {
		// Local accumulator keeps this method reentrant (the original used
		// shared instance fields, which broke concurrent use).
		List<String> words = new ArrayList<String>();
		IKTokenizer tokenizer = new IKTokenizer(new StringReader(input), true);
		try {
			try {
				while (tokenizer.incrementToken()) {
					TermAttribute termAtt = tokenizer.getAttribute(TermAttribute.class);
					words.add(termAtt.term());
				}
			} finally {
				// Release the token stream's resources.
				tokenizer.close();
			}
		} catch (IOException e) {
			logger.error(e + "------------------分词出错------------------");
		}
		return words.toArray(new String[words.size()]);
	}
}
// Segmentation.java
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import net.sf.classifier4J.ITokenizer;
/**
 * Command-line driver: segments the Chinese text file "1.txt" with
 * {@link MyTokenizer} and prints one token per line to stdout.
 *
 * @author CSD
 */
public class Segmentation {
	private static final Logger logger = LogManager.getLogger(Segmentation.class);

	public static void main(String[] args) throws IOException {
		String path = "1.txt";
		// try-with-resources guarantees the stream is closed even when
		// reading throws (the original leaked it on the error path).
		String input;
		FileInputStream fin = new FileInputStream(new File(path));
		try {
			input = getString(fin);
		} finally {
			fin.close();
		}
		logger.info("开始分词::" + path);
		ITokenizer tokenizer = new MyTokenizer();
		for (String word : tokenizer.tokenize(input)) {
			System.out.println(word);
		}
	}

	/**
	 * Reads the whole stream as UTF-8 text, joining lines with single
	 * spaces, and returns it trimmed. The stream is closed on success
	 * (callers may also close it themselves; double-close is harmless).
	 *
	 * @param is the input stream to read
	 * @return the stream's text as one space-joined, trimmed string
	 * @throws IOException if reading fails
	 */
	public static String getString(InputStream is) throws IOException {
		// Decode as UTF-8 explicitly instead of the platform default,
		// which is wrong for Chinese text on many systems.
		BufferedReader reader = new BufferedReader(new InputStreamReader(is, "UTF-8"));
		// StringBuilder: this buffer is never shared across threads, so the
		// synchronized StringBuffer was pure overhead.
		StringBuilder text = new StringBuilder();
		String line;
		while ((line = reader.readLine()) != null) {
			text.append(line).append(' ');
		}
		reader.close();
		return text.toString().trim();
	}
}
程序需依赖 IKAnalyzer2012_u6.jar,并在 pom.xml 中添加如下依赖:

<dependencies>
  <dependency>
    <groupId>classifier4j</groupId>
    <artifactId>classifier4j</artifactId>
    <version>0.6</version>
  </dependency>
  <dependency>
    <groupId>org.apache.lucene</groupId>
    <artifactId>lucene-analyzers</artifactId>
    <version>3.2.0</version>
  </dependency>
  <dependency>
    <groupId>org.slf4j</groupId>
    <artifactId>slf4j-log4j12</artifactId>
    <version>1.7.5</version>
  </dependency>
</dependencies>