Java汉字转拼音(包括多音字)
有个需求需要把汉字转成拼音。我的小伙伴推荐使用Unicode官方的包,但是下载有些慢。
实际中用了Java工具包:pinyin4j解决
它既可以转换普通汉字,也可以转换多音字;如果对多音字的准确性要求不高,可以直接取返回列表中的第一个值。
1. Maven依赖
<!--汉字转拼音-->
<dependency>
<groupId>com.belerweb</groupId>
<artifactId>pinyin4j</artifactId>
<version>2.5.1</version>
</dependency>
2. Java示例代码
import lombok.extern.slf4j.Slf4j;
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import java.util.*;
/*************************************
* Class Name: PinyinUtil
* Description:〈拼音转换工具类〉
* @author smn
* @create 2020/8/11
* @since 1.0.0
************************************/
@Slf4j
public class PinyinUtil {
/**
* 汉字转拼音(全拼)
*
* @param src
* @return
*/
public static String getPinyin(String src) {
char[] srcCharArray = src.toCharArray();
// 设置汉字拼音输出的格式
HanyuPinyinOutputFormat outputFormat = new HanyuPinyinOutputFormat();
outputFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE);
outputFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
outputFormat.setVCharType(HanyuPinyinVCharType.WITH_V);
StringBuffer result = new StringBuffer();
for (int i = 0; i < srcCharArray.length; i++) {
String[] tempArray = null;
// 判断能否为汉字字符
if (Character.toString(srcCharArray[i]).matches("[\\u4E00-\\u9FA5]+")) {
try {
tempArray = PinyinHelper.toHanyuPinyinStringArray(srcCharArray[i], outputFormat);
} catch (BadHanyuPinyinOutputFormatCombination e) {
log.error("处理" + src + "出错", e);
}
// 将汉字的几种全拼都存到数组中
result.append(tempArray[0]);
} else {
// 如果不是汉字字符,间接取出字符并连接到字符串后
result.append(Character.toString(srcCharArray[i]));
}
result.append(" ");
}
return result.toString().trim();
}
/**
* 返回不带空格分割的全拼
*
* @param src
* @return
*/
public static String getPinyinWithoutBlank(String src) {
return getPinyin(src).replaceAll(" ", "");
}
/**
* 返回多音字的全部拼音(不区分声调)
*
* @param src
* @return
*/
public static List<String> getMultiplePronounciationsWithoutTone(String src) {
List<String> dstPinyinList = new ArrayList<String>();
List<String> tempPinyinList = new ArrayList<String>();
String[] curCharPinyin = null;
Set<String> curPinyinSet = null;
// 设置汉字拼音输出的格式
HanyuPinyinOutputFormat outputFormat = new HanyuPinyinOutputFormat();
outputFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE);
outputFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
outputFormat.setVCharType(HanyuPinyinVCharType.WITH_V);
char[] srcCharArray = src.toCharArray();
for (char curChar : srcCharArray) {
// 判断能否为汉字字符
if (Character.toString(curChar).getBytes().length != Character.toString(curChar).length()) {
if (Character.toString(curChar).matches("[\\u4E00-\\u9FA5]+")) {
try {
curCharPinyin = PinyinHelper.toHanyuPinyinStringArray(curChar, outputFormat);
if (null == curCharPinyin) {
log.error("[" + Character.toString(curChar) + "]字转换拼音失败:转换结果为空!");
return null;
}
// 集合用于去除声调不同的重复拼音
curPinyinSet = new HashSet<String>();
for (int i = 0; i < curCharPinyin.length; i++) {
if (!curPinyinSet.contains(curCharPinyin)) {
curPinyinSet.add(curCharPinyin[i]);
} else {
continue;
}
}
} catch (BadHanyuPinyinOutputFormatCombination e) {
log.error("[" + Character.toString(curChar) + "]字转换拼音失败:" + e.getMessage(), e);
return null;
}
} else {
// 不在Unicode汉字编码内的字符,返回null
log.error("[" + Character.toString(curChar) + "]字转换拼音失败:转换结果为空!");
return null;
}
} else {
// 如果不是汉字字符,则直接将字符放入当前字符拼音集合
curPinyinSet = new HashSet<String>();
curPinyinSet.add(Character.toString(curChar));
}
// 进行输出拼音字串拼接
Iterator<String> iter = null;
if (dstPinyinList.size() == 0) {
iter = curPinyinSet.iterator();
while (iter.hasNext()) {
String curPinyin = (String) iter.next();
dstPinyinList.add(curPinyin);
}
} else {
for (String dstPinyin : dstPinyinList) {
iter = curPinyinSet.iterator();
while (iter.hasNext()) {
String curPinyin = (String) iter.next();
tempPinyinList.add(dstPinyin + " " + curPinyin);
}
}
dstPinyinList.clear();
dstPinyinList.addAll(tempPinyinList);
tempPinyinList.clear();
}
}
return dstPinyinList;
}
public static List<String> getMultiplePronounciationsWithoutToneWithoutBlank(String src) {
List<String> result = new ArrayList<String>();
List<String> data = getMultiplePronounciationsWithoutTone(src);
for (String str : data) {
result.add(str.replaceAll(" ", ""));
}
return result;
}
public static List<String> getJianPin(String src) {
List<String> result = new ArrayList<String>();
List<String> tempList = getMultiplePronounciationsWithoutTone(src);
StringBuffer sb = new StringBuffer();
for (String str : tempList) {
String[] array = str.split(" ");
for (String s : array) {
sb.append(s.charAt(0));
}
result.add(sb.toString().toLowerCase());
sb.delete(0, sb.length());
}
return result;
}
public static void main(String[] args) {
String str = "满地都是六便士";
System.out.println(PinyinUtil.getPinyin(str));
System.out.println(PinyinUtil.getPinyinWithoutBlank(str));
System.out.println(PinyinUtil.getMultiplePronounciationsWithoutTone(str));
System.out.println(PinyinUtil.getMultiplePronounciationsWithoutToneWithoutBlank(str));
System.out.println(PinyinUtil.getJianPin(str));
log.info(" " + PinyinUtil.getJianPin("1597219781375_1.大型互联网系统架构演进之路.mp4"));
}
}
3. 结果:
参考: