本文代码需要 Apache OpenNLP 的 jar 包,下载地址:https://opennlp.apache.org/cgi-bin/download.cgi
其余用到的类均来自 Java 标准库(JDK 自带),无需额外的 jar 包。
package com.npl.demo.utils;
import java.io.IOException;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.text.BreakIterator;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Scanner;
import java.util.StringTokenizer;
import java.util.regex.Pattern;
import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.WhitespaceTokenizer;
/**
* Filename: NlpTokenization.java
* Description:
* Copyright: Copyright (c) 2019 All Rights Reserved.
* @author: wangk
* @version: 1.0
* Create at: 2019年5月5日 下午4:28:56
*
* Modification History:
* Date Author Version Description
* ------------------------------------------------------------------
* 2019年5月5日 wangk 1.0 1.0 Version
*
*/
public class NlpTokenization {
// Sample English text: two short sentences, with the contraction "Let's" at the
// start and again at the end (followed by a trailing space).
static String paragraph = "Let's The first sentence. The second sentence. Let's ";
// Despite the plural name, this array holds a single element: one multi-sentence
// English sample built by concatenating three string literals. The text includes
// the abbreviation "Mr." followed by a name, i.e. a period that does not end a sentence.
// NOTE(review): "agood" and "stille" look like typos in the sample text; confirm
// before correcting them, since changing the literal changes the program's output.
static String[] sentences = {
"Tim was agood neighbor. Perhaps not as good Bob "+
"Haywood, but stille pretty good. Of course Mr. Adam "+
"took the cake!"
};
// Sample Chinese text; the two phrases are separated by a '|' character.
// Original author's note: Chinese text can be tokenized by using a regex to insert
// a space after every character, after which it can be split like space-delimited text.
static String chineseLanguage = "时代的碰撞|中国古典民乐与流行的相遇";
// Illustrative snippet for the note above (kept commented out; `text` stands for
// the string to be processed and is not declared in this part of the file):
/*String regex = "(.{1})";
text = text.replaceAll (regex, "$1 ");*/
public static void main(Str