当用户输入连续的没有空格分隔的全拼时怎么分词?本分词基于汉语拼音的规则进行简单的分词。有以下缺点:
1.xi'an(西安)这种不会被分词,xi'nan(西南)会被分成 xin an(新安)。没有考虑词频和语义。
2.如果其中有非拼音的字母,例如womendekpi,因为kpi无法拆分,则整个分词会失败。
// Table of pinyin initials (shengmu). Single-letter initials are listed
// before the two-letter ones ("zh", "ch", "sh"); lookups scan in this order.
static String[] smb = {
    "b", "p", "m", "f", "d", "t", "l", "n", "g", "h", "k",
    "j", "q", "x", "z", "c", "s", "r", "y", "w",
    "zh", "ch", "sh"
};
// Table of pinyin finals (yunmu) that may follow an initial.
// Entries are grouped from longest to shortest; lookups scan in this order,
// so longer finals are offered before their shorter prefixes.
static String[] ymbmax = {
    // four letters
    "iang", "iong", "uang",
    // three letters
    "ang", "ong", "eng", "ing", "iao", "ian", "uai", "uan",
    // two letters
    "an", "ao", "ai", "ou", "en", "er", "ei",
    "ia", "iu", "ie", "in", "un", "ua", "uo", "ue", "ui",
    // one letter
    "a", "o", "e", "i", "u", "v"
};
// Table of finals that are accepted as a whole syllable on their own,
// i.e. without a leading initial. Lookups scan in this order.
static String[] ymbmin = {
    "ang", "ong", "eng",
    "ai", "an", "ao", "ou", "en", "er",
    "o", "a", "e"
};
// Splits a run of concatenated pinyin (no separators) into syllables.
//
// s: the pinyin text to segment; may be null or empty.
// Returns the syllables in order. Returns an empty array when s is null or
// empty, and a one-element array holding s unchanged when s cannot be
// segmented completely.
public static String[] cut(String s) {
    // Nothing to segment. Previously the empty string reached the in-place
    // removal below with an immutable list and threw
    // UnsupportedOperationException.
    if (s == null || s.isEmpty()) {
        return new String[0];
    }
    List<String> list = cut(s, 0);
    if (list == null || list.isEmpty()) {
        return new String[]{s};
    }
    int size = list.size();
    // A successful split ends with an empty-string sentinel. Drop it through
    // a view rather than remove(), so an immutable list is never mutated.
    if (list.get(size - 1).isEmpty()) {
        return list.subList(0, size - 1).toArray(new String[0]);
    }
    return list.toArray(new String[0]);
}
// Segments s from offset index to the end of the string by backtracking over
// every syllable length that findWord proposes at each offset.
//
// Returns the syllables followed by one empty-string sentinel marking the end
// of the input, or null when no complete segmentation exists from index.
private static List<String> cut(String s, int index) {
    // One flag per start offset (0..length). An offset that failed once will
    // always fail, so remembering it avoids exponential re-exploration on
    // input that cannot be segmented.
    return cutMemo(s, index, new boolean[s.length() + 1]);
}

// Backtracking worker for cut(String, int); deadEnd[i] is set once offset i
// is known to have no complete segmentation.
private static List<String> cutMemo(String s, int index, boolean[] deadEnd) {
    boolean tracked = index >= 0 && index < deadEnd.length;
    if (tracked && deadEnd[index]) {
        return null;
    }
    List<Integer> lengths = findWord(s, index);
    if (lengths != null) {
        for (int len : lengths) {
            if (len == 0) {
                // End of input reached: return the sentinel. The list is
                // mutable so a caller may strip the sentinel in place.
                List<String> end = new ArrayList<>(1);
                end.add("");
                return end;
            }
            List<String> rest = cutMemo(s, index + len, deadEnd);
            if (rest != null && !rest.isEmpty()) {
                List<String> ans = new ArrayList<>(rest.size() + 1);
                ans.add(s.substring(index, index + len));
                ans.addAll(rest);
                return ans;
            }
        }
    }
    // Every candidate length failed (or there was none).
    if (tracked) {
        deadEnd[index] = true;
    }
    return null;
}
// Finds the initial (shengmu) that starts at offset index of s.
//
// A table entry is accepted only if at least one more character follows it
// and that character is not itself a single-letter entry of smb. This is why
// "z" in front of "h" is skipped and the later entry "zh" is taken instead.
//
// Returns the length of the accepted initial, or 0 when none is accepted.
private static int findSm(String s, int index) {
    final int total = s.length();
    for (int i = 0; i < smb.length; i++) {
        String initial = smb[i];
        int after = index + initial.length();
        // Skip entries that do not match here or that end the string.
        if (!s.startsWith(initial, index) || after >= total) {
            continue;
        }
        String follower = String.valueOf(s.charAt(after));
        // Scan the table for the follower; reaching the end means it is not an initial.
        int k = 0;
        while (k < smb.length && !follower.equals(smb[k])) {
            k++;
        }
        if (k == smb.length) {
            return initial.length();
        }
    }
    return 0;
}
// Collects the lengths of all standalone finals (entries of ymbmin) that
// match s at offset index, in table order. The list is empty when none match.
private static List<Integer> findDlym(String s, int index) {
    List<Integer> lengths = new ArrayList<>();
    for (int i = 0; i < ymbmin.length; i++) {
        String candidate = ymbmin[i];
        if (!s.startsWith(candidate, index)) {
            continue;
        }
        lengths.add(candidate.length());
    }
    return lengths;
}
// Collects the lengths of all finals (entries of ymbmax) that match s at
// offset index, in table order. The list is empty when none match.
private static List<Integer> findYm(String s, int index) {
    List<Integer> matches = new ArrayList<>();
    int pos = 0;
    while (pos < ymbmax.length) {
        String yunmu = ymbmax[pos++];
        if (s.startsWith(yunmu, index)) {
            matches.add(yunmu.length());
        }
    }
    return matches;
}
// Lists the candidate lengths of one syllable starting at offset index of s.
//
// At or past the end of s the result is the single value 0. Otherwise, if an
// initial is found, each candidate is the initial length plus the length of a
// final matching right after it; if not, the candidates are the standalone
// finals matching at index. The list is empty when nothing matches.
private static List<Integer> findWord(String s, int index) {
    if (index >= s.length()) {
        return Collections.singletonList(0);
    }
    int initialLen = findSm(s, index);
    List<Integer> finals;
    if (initialLen == 0) {
        finals = findDlym(s, index);
    } else {
        finals = findYm(s, index + initialLen);
    }
    // Convert lengths of finals into whole-syllable lengths.
    List<Integer> syllableLens = new ArrayList<>(finals.size());
    for (Integer finalLen : finals) {
        syllableLens.add(finalLen + initialLen);
    }
    return syllableLens;
}