java将汉字字符串转换为拼音(包含多音字)

Github个人博客:https://joeyos.github.io

汉字转换为拼音

用一种简单的方法将字符串转化为拼音:

  1. 遍历需要转换的字符串t1,将其中的汉字t1[i]按照t3指定的格式转换为拼音,并赋值给t2
  2. 如果t1[i]不是汉字,则不转换,直接把t1[i]追加到结果t4
  3. 将t2[0]的首字母大写后,追加到结果t4

这里将用到pinyin4j.jar包,请自行百度下载。

import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

public class Chinese2Pinyin {
	/**
	 * Converts every Chinese character of {@code src} to pinyin with a capitalised first
	 * letter; every other character is copied to the result unchanged.
	 *
	 * <p>Polyphonic characters are not disambiguated here: the first reading returned by
	 * pinyin4j ({@code t2[0]}) is always used.
	 *
	 * @param src the text to convert; {@code null} is treated as an empty string
	 * @return the converted text, never {@code null}
	 */
	public static String getPinyin(String src) {
		if (src == null) {
			return "";
		}
		char[] t1 = src.toCharArray();
		HanyuPinyinOutputFormat t3 = new HanyuPinyinOutputFormat();
		t3.setCaseType(HanyuPinyinCaseType.LOWERCASE);// lower-case output
		t3.setToneType(HanyuPinyinToneType.WITHOUT_TONE);// no tone marks or tone numbers
		t3.setVCharType(HanyuPinyinVCharType.WITH_V);
		StringBuilder t4 = new StringBuilder();
		try {
			for (int i = 0; i < t1.length; i++) {
				// Only characters in the CJK range are handed to pinyin4j.
				String[] t2 = isChinese(t1[i]) ? PinyinHelper.toHanyuPinyinStringArray(t1[i], t3) : null;
				if (t2 == null || t2.length == 0 || t2[0] == null || t2[0].length() == 0) {
					// Not Chinese, or pinyin4j has no reading for it: keep the character as is.
					t4.append(t1[i]);
				} else {
					// Capitalise the first letter of the first reading.
					t4.append(Character.toUpperCase(t2[0].charAt(0))).append(t2[0].substring(1));
				}
			}
		} catch (BadHanyuPinyinOutputFormatCombination e1) {
			// Best effort: report the problem and return what has been converted so far.
			e1.printStackTrace();
		}
		return t4.toString();
	}

	/** Returns whether {@code c} lies in the range U+4E00..U+9FA5 that this class treats as Chinese. */
	private static boolean isChinese(char c) {
		return c >= '\u4E00' && c <= '\u9FA5';
	}

	public static void main(String[] args) {
		String s = getPinyin("西安电子科技大学");
		System.out.println(s);
		String s1 = getPinyin("西安");
		System.out.println(s1);
		String s2 = getPinyin("成都");
		System.out.println(s2);// prints ChengDou: the first reading is used, not the expected ChengDu
	}
}

运行结果为:

XiAnDianZiKeJiDaXue
XiAn
ChengDou

可以看到,“成都(du)”被转化成了“ChengDou”,这是因为pinyin4j无法根据上下文判断多音字的读音。对于多音字,t2是一个数组,存放着字符t1[i]的所有读音,而上面的代码始终取t2[0]作为该字的拼音。

多音字处理

这里采用查字典的方法,将需要转换的字符与相邻字符组成的词组与字典里的词组进行匹配,如果匹配到,则把字典里的读音作为该字的拼音。如果没有匹配到词组,但该字是字典里收录的常用单字,则按照字典里的单字读音;否则取pinyin4j返回的第一个读音。
首先,下载多音字字典文件:下载字典

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.HashMap;
import java.util.Map;

import org.apache.commons.lang.StringUtils;

import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

public class Chinese2Pinyin {
	/** Maps each word (or single character) listed in the dictionary file to the pinyin given for it. */
	public static Map<String, String> dictionary = new HashMap<String, String>();
	/** Path of the polyphone dictionary file; each line is read as {@code pinyin#word word ...}. */
	static String filePath = "C:\\dict\\duoyinzi_pinyin.txt";

	// Load the polyphone dictionary once, when the class is initialised.
	static {
		BufferedReader br = null;
		try {
			File file = new File(filePath);
			br = new BufferedReader(new InputStreamReader(new FileInputStream(file), "UTF-8"));
			String line = null;
			while ((line = br.readLine()) != null) {
				String[] arr = line.split("#");
				// Skip blank or malformed lines: without this check a line lacking "#"
				// would throw from the static initialiser and make the class unusable.
				if (arr.length < 2 || !StringUtils.isNotEmpty(arr[1])) {
					continue;
				}
				String[] sems = arr[1].split(" ");
				for (String sem : sems) {
					if (StringUtils.isNotEmpty(sem)) {
						dictionary.put(sem, arr[0]);
					}
				}
			}
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		} catch (FileNotFoundException e) {
			// Without the dictionary every character falls back to its first reading.
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			if (br != null) {
				try {
					br.close();
				} catch (IOException e) {
					e.printStackTrace();
				}
			}
		}
	}

	/**
	 * Converts every Chinese character of {@code src} to pinyin with a capitalised first
	 * letter; every other character is copied to the result unchanged.
	 *
	 * <p>For a character with several readings, the reading is chosen by looking up the
	 * two-character words it forms with its neighbours, and the character itself, in
	 * {@link #dictionary}. If nothing matches, the first reading returned by pinyin4j is used.
	 *
	 * @param src the text to convert; {@code null} is treated as an empty string
	 * @return the converted text, never {@code null}
	 */
	public static String getPinyin(String src) {
		if (src == null) {
			return "";
		}
		char[] t1 = src.toCharArray();
		HanyuPinyinOutputFormat t3 = new HanyuPinyinOutputFormat();
		t3.setCaseType(HanyuPinyinCaseType.LOWERCASE);// lower-case output
		t3.setToneType(HanyuPinyinToneType.WITHOUT_TONE);// no tone marks or tone numbers
		t3.setVCharType(HanyuPinyinVCharType.WITH_V);
		StringBuilder t4 = new StringBuilder();
		try {
			for (int i = 0; i < t1.length; i++) {
				// Only characters in the CJK range are handed to pinyin4j.
				String[] t2 = isChinese(t1[i]) ? PinyinHelper.toHanyuPinyinStringArray(t1[i], t3) : null;
				if (t2 == null || t2.length == 0 || t2[0] == null || t2[0].length() == 0) {
					// Not Chinese, or pinyin4j has no reading for it: keep the character as is.
					t4.append(t1[i]);
					continue;
				}
				String py = choosePinyin(t1, i, t2);
				// Capitalise the first letter of the chosen reading.
				t4.append(Character.toUpperCase(py.charAt(0))).append(py.substring(1));
			}
		} catch (BadHanyuPinyinOutputFormatCombination e1) {
			// Best effort: report the problem and return what has been converted so far.
			e1.printStackTrace();
		}
		return t4.toString();
	}

	/** Returns whether {@code c} lies in the range U+4E00..U+9FA5 that this class treats as Chinese. */
	private static boolean isChinese(char c) {
		return c >= '\u4E00' && c <= '\u9FA5';
	}

	/**
	 * Picks the reading of {@code t1[i]} from its candidate readings {@code t2}.
	 *
	 * <p>A one-character input has no context and always gets {@code t2[0]}. Otherwise the
	 * word formed with the next character (or, for the last character, with the previous one)
	 * and the character itself are looked up first. Only if neither matches, a character in
	 * the middle of the text is also looked up together with its previous character.
	 */
	private static String choosePinyin(char[] t1, int i, String[] t2) {
		if (t1.length == 1) {
			return t2[0];
		}
		boolean last = i == t1.length - 1;
		String single = dictionary.get(String.valueOf(t1[i]));
		String word = dictionary.get(last ? new String(t1, i - 1, 2) : new String(t1, i, 2));
		String match = firstMatch(t2, word, single);
		if (match == null && !last && i > 0) {
			// Fallback so that e.g. the second character of a word is still recognised
			// when more text follows it.
			match = firstMatch(t2, dictionary.get(new String(t1, i - 1, 2)), null);
		}
		return (match != null && match.length() > 0) ? match : t2[0];
	}

	/** Returns the first candidate equal to {@code a} or {@code b}, or {@code null} if there is none. */
	private static String firstMatch(String[] candidates, String a, String b) {
		for (String py : candidates) {
			if (py.equals(a) || py.equals(b)) {
				return py;
			}
		}
		return null;
	}

	public static void main(String[] args) {
		String s = getPinyin("西安电子科技大学");
		System.out.println(s);
		String s1 = getPinyin("西安");
		System.out.println(s1);
		String s2 = getPinyin("成都");
		System.out.println(s2);// ChengDu
	}
}

运行结果为:

XiAnDianZiKeJiDaXue
XiAn
ChengDu
相关推荐
package oa.common.utils; import net.sourceforge.pinyin4j.PinyinHelper; import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType; import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat; import net.sourceforge.pinyin4j.format.HanyuPinyinToneType; import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType; import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination; /** * 文件名:PingYinUtil.java 拼音工具类 * 版本信息:V1.0 * 日期:2013-06-18 * Copyright BDVCD Corporation 2013 * 版权所有 http://www.bdvcd.com * */ public class PingYinUtil { /** * 字符串中的中文转化拼音,其他字符不变 * * @param inputString * @return */ public static String getPingYin(String inputString) { HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat(); format.setCaseType(HanyuPinyinCaseType.LOWERCASE); format.setToneType(HanyuPinyinToneType.WITHOUT_TONE); format.setVCharType(HanyuPinyinVCharType.WITH_V); char[] input = inputString.trim().toCharArray(); String output = ""; try { for (int i = 0; i < input.length; i++) { if (java.lang.Character.toString(input[i]).matches("[\\u4E00-\\u9FA5]+")) { String[] temp = PinyinHelper.toHanyuPinyinStringArray(input[i], format); output += temp[0]; } else output += java.lang.Character.toString(input[i]); } } catch (BadHanyuPinyinOutputFormatCombination e) { e.printStackTrace(); } return output; } /** * 获取汉字拼音首字母,英文字符不变 * @param chinese 汉字串 * @return 汉语拼音首字母 */ public static String getFirstSpell(String chinese) { StringBuffer pybf = new StringBuffer(); char[] arr = chinese.toCharArray(); HanyuPinyinOutputFormat defaultFormat = new HanyuPinyinOutputFormat(); defaultFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE); defaultFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE); for (int i = 0; i < arr.length; i++) { if (arr[i] > 128) { try { String[] temp = PinyinHelper.toHanyuPinyinStringArray(arr[i], defaultFormat); if (temp != null) { pybf.append(temp[0].charAt(0)); } } catch (BadHanyuPinyinOutputFormatCombination e) { e.printStackTrace(); } } else { 
pybf.append(arr[i]); } } return pybf.toString().replaceAll("\\W", "").trim(); } /** * 获取汉字拼音,英文字符不变 * @param chinese 汉字串 * @return 汉语拼音 */ public static String getFullSpell(String chinese) { StringBuffer pybf = new StringBuffer(); char[] arr = chinese.toCharArray(); HanyuPinyinOutputFormat defaultFormat = new HanyuPinyinOutputFormat(); defaultFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE); defaultFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE); for (int i = 0; i < arr.length; i++) { if (arr[i] > 128) { try { pybf.append(PinyinHelper.toHanyuPinyinStringArray(arr[i], defaultFormat)[0]); } catch (BadHanyuPinyinOutputFormatCombination e) { e.printStackTrace(); } } else { pybf.append(arr[i]); } } return pybf.toString(); } public static void main(String[] as){ System.out.println(getPingYin("非诚勿扰")); } }
©️2020 CSDN 皮肤主题: 编程工作室 设计师:CSDN官方博客 返回首页