pinyin4j
- 添加Maven依赖
<dependency>
<groupId>com.belerweb</groupId>
<artifactId>pinyin4j</artifactId>
<version>2.5.0</version>
</dependency>
- 获取文本拼音
/**
 * Converts text to upper-cased, tone-less pinyin using pinyin4j.
 *
 * <p>The input is trimmed and converted one {@code char} at a time. When pinyin4j returns
 * several readings for a character, only the first one is used.
 *
 * @param context        text to convert
 * @param existNotPinyin when {@code true}, characters for which pinyin4j returns no reading
 *                       are copied to the output unchanged; when {@code false} they are dropped
 * @return the concatenated pinyin in upper case, or {@code null} if {@code context} is
 *         {@code null} or blank
 * @throws IllegalStateException if pinyin4j rejects the output format (not expected for the
 *                               tone-less format configured here)
 */
private String pinyinTest(String context, boolean existNotPinyin) {
    if (context == null) {
        return null;
    }
    String trimmed = context.trim();
    if (trimmed.isEmpty()) {
        return null;
    }

    HanyuPinyinOutputFormat outputFormat = new HanyuPinyinOutputFormat();
    outputFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE);

    StringBuilder builder = new StringBuilder();
    try {
        for (char aChar : trimmed.toCharArray()) {
            String[] pinyin = PinyinHelper.toHanyuPinyinStringArray(aChar, outputFormat);
            if (pinyin != null && pinyin.length > 0) {
                // Several readings may be returned; keep the first one.
                builder.append(pinyin[0]);
            } else if (existNotPinyin) {
                builder.append(aChar);
            }
        }
    } catch (BadHanyuPinyinOutputFormatCombination e) {
        // Fail loudly instead of printing the stack trace and returning a truncated result.
        throw new IllegalStateException("Unsupported pinyin output format", e);
    }
    // Locale.ROOT keeps the casing stable regardless of the JVM default locale
    // (e.g. "i" would become a dotted capital I under a Turkish locale).
    return builder.toString().toUpperCase(java.util.Locale.ROOT);
}
- 测试用例
/**
 * Runs the pinyin4j conversion on a sample mixing Chinese characters and digits,
 * dropping characters that have no pinyin, and prints the result.
 */
@Test
public void test() {
    String temp = "我爱罗52";
    String result = pinyinTest(temp, false);
    // Print the outcome so the example shows something, like the other test cases here.
    System.out.println(result);
}
HanLP
- 添加Maven依赖
<dependency>
<groupId>com.hankcs</groupId>
<artifactId>hanlp</artifactId>
<version>portable-1.8.4</version>
</dependency>
- 获取文本拼音
/**
 * Converts text to upper-cased, tone-less pinyin using HanLP.
 *
 * @param content        text to convert
 * @param existNotPinyin when {@code true}, the whole conversion is delegated to
 *                       {@code HanLP.convertToPinyinString(content, "", false)};
 *                       when {@code false} or {@code null}, entries that are {@code null}
 *                       or equal to {@code Pinyin.none5} are skipped
 * @return the concatenated pinyin in upper case, or {@code null} if {@code content} is
 *         {@code null} or blank
 */
private String hanLpTest(String content, Boolean existNotPinyin) {
    // The original guard referenced an undefined variable named "context".
    if (content == null || content.trim().isEmpty()) {
        return null;
    }
    // Boolean.TRUE.equals avoids a NullPointerException when the boxed flag is null.
    if (Boolean.TRUE.equals(existNotPinyin)) {
        return HanLP.convertToPinyinString(content, "", false).toUpperCase(java.util.Locale.ROOT);
    }
    StringBuilder builder = new StringBuilder();
    for (Pinyin pinyin : HanLP.convertToPinyinList(content)) {
        if (pinyin == null || Pinyin.none5.equals(pinyin)) {
            continue;
        }
        builder.append(pinyin.getPinyinWithoutTone());
    }
    // Locale.ROOT keeps the casing independent of the JVM default locale.
    return builder.toString().toUpperCase(java.util.Locale.ROOT);
}
- 测试用例
/**
 * Prints the HanLP conversion of a sample mixing Chinese characters and digits,
 * with the keep-non-pinyin flag switched on.
 */
@Test
public void test() {
    final String sample = "我爱罗52";
    final String converted = hanLpTest(sample, true);
    System.out.println(converted);
}
关键字分词
- 正则表达式
// Regex for one pinyin-like chunk, read left to right:
//   [^aoeiuv]?  an optional leading character that is not a, o, e, i, u or v
//   h?          an optional 'h' (e.g. the second letter of zh / ch / sh)
//   [iuv]?      an optional medial i, u or v
//   (...)?      an optional final such as ai, ei, ao, ou, er, an(g), en(g), ong, a, o, e, i, u, ng, n
// Every part is optional, so the pattern can also match the empty string.
private final String SPLIT_WORD_REG_EX = "[^aoeiuv]?h?[iuv]?(ai|ei|ao|ou|er|ang?|eng?|ong|a|o|e|i|u|ng|n)?";
- 获取分词结果
/**
 * Splits a keyword into consecutive chunks, each being what {@code SPLIT_WORD_REG_EX}
 * matches at the current position.
 *
 * <p>The keyword is only checked for blankness; it is not trimmed before splitting.
 *
 * @param keyword text to split (typically a run of pinyin letters)
 * @return the matched chunks in order of appearance, or an empty list if {@code keyword}
 *         is {@code null} or blank
 */
private List<String> splitTest(String keyword) {
    // The original guard referenced an undefined variable named "context".
    if (keyword == null || keyword.trim().isEmpty()) {
        return Collections.emptyList();
    }
    List<String> keywordList = new ArrayList<>();
    // One matcher walks the whole input; each find() resumes where the previous match
    // ended, so the input no longer has to be re-sliced for every chunk.
    Matcher matcher = Pattern.compile(SPLIT_WORD_REG_EX).matcher(keyword);
    while (matcher.find()) {
        String chunk = matcher.group();
        if (chunk.isEmpty()) {
            // Every part of the pattern is optional, so it matches "" at the end of the
            // input; stop there instead of looping on zero-length matches.
            break;
        }
        keywordList.add(chunk);
    }
    return keywordList;
}
- 测试用例
@Test
public void test() {
String temp = "我爱罗52";
List<String> list = splitTest(temp);
System.out.println(list);
}