package com.shenshen.a1; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStreamReader; import java.io.UnsupportedEncodingException; import java.util.HashMap; import java.util.Map; import java.util.logging.Logger; import org.apache.commons.lang.StringUtils; import net.sourceforge.pinyin4j.PinyinHelper; import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType; import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat; import net.sourceforge.pinyin4j.format.HanyuPinyinToneType; import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType; import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination; /** * Created by Administrator on 2018/6/12. */ public class PinyinUtils { public static void main (String args [] ){ try { String str = "长春市莘莘科技有限公司"; System.out.println(str+" pyf="+PinyinUtils.chineseToPinYinF(str));//全拼 System.out.println(str + " pys="+PinyinUtils.chineseToPinYinS(str).toUpperCase());//简拼 System.out.println(str + " pys="+PinyinUtils.chineseToPinYinS(str)); }catch(BadHanyuPinyinOutputFormatCombination e){ e.printStackTrace(); } } private static final Logger logger = Logger.getLogger("devLog"); //记录日志 public static Map<String,String> dictionary = new HashMap<String,String>(); //加载多音字词典 static { BufferedReader br = null; try { String path = ChineseToHanYuPYTest.class.getClassLoader().getResource("").getPath() + "com/shenshen/a1/duoyinzi_dic.txt"; File file = new File(path); br = new BufferedReader(new InputStreamReader(new FileInputStream(file),"UTF-8")); String line = null; while((line=br.readLine())!=null){ String[] arr = line.split("#"); if(StringUtils.isNotEmpty(arr[1])){ String[] sems = arr[1].split(" "); for (String sem : sems) { if(StringUtils.isNotEmpty(sem)){ dictionary.put(sem , arr[0]); } } } } } catch (UnsupportedEncodingException e) { e.printStackTrace(); } catch (FileNotFoundException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); }finally{ if(br!=null){ try { br.close(); } catch (IOException e) { e.printStackTrace(); } } } } public static String[] chineseToPinYin(char chineseCharacter) throws BadHanyuPinyinOutputFormatCombination{ HanyuPinyinOutputFormat outputFormat = new HanyuPinyinOutputFormat(); outputFormat.setToneType(HanyuPinyinToneType.WITHOUT_TONE); outputFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE); outputFormat.setVCharType(HanyuPinyinVCharType.WITH_V); if(chineseCharacter>=33 && chineseCharacter<=125){ //ASCII >=33 ASCII<=125的直接返回 ,ASCII码表:http://www.asciitable.com/ return new String[]{String.valueOf(chineseCharacter)}; //chineseCharacter转成字符串 } return PinyinHelper.toHanyuPinyinStringArray(chineseCharacter, outputFormat); } /** * 获取汉字拼音的全拼 * @param chineseCharacter * @return * @throws BadHanyuPinyinOutputFormatCombination */ public static String chineseToPinYinF(String chineseCharacter) throws BadHanyuPinyinOutputFormatCombination{ if(StringUtils.isEmpty(chineseCharacter)){ return null; } char[] chs = chineseCharacter.toCharArray(); StringBuilder result = new StringBuilder(); for(int i=0;i<chs.length;i++){ String[] arr = chineseToPinYin(chs[i]); if(arr==null){ result.append(""); }else if(arr.length==1){ result.append(arr[0]); }else if(arr[0].equals(arr[1])){ result.append(arr[0]); }else{ String prim = chineseCharacter.substring(i, i+1); // System.out.println("prim="+prim+"**i="+i); String lst = null,rst = null; if(i<=chineseCharacter.length()-2){ rst = chineseCharacter.substring(i,i+2); } if(i>=1 && i+1<=chineseCharacter.length()){ lst = chineseCharacter.substring(i-1,i+1); } // System.out.println("lst="+lst+"**rst="+rst); String answer = null; for (String py : arr) { if(StringUtils.isEmpty(py)){ continue; } if((lst!=null && py.equals(dictionary.get(lst))) || (rst!=null && py.equals(dictionary.get(rst)))){ answer = py; // System.out.println("get it,answer="+answer+",i="+i+"**break"); break; } if(py.equals(dictionary.get(prim))){ answer = py; // System.out.println("get it,answer="+answer+",i="+i+"**prim="+prim); } } if(answer!=null){ result.append(answer); }else{ logger.warning("no answer ch="+chs[i]); } } } return result.toString().toLowerCase(); } public static String chineseToPinYinS(String chineseCharacter) throws BadHanyuPinyinOutputFormatCombination{ if(StringUtils.isEmpty(chineseCharacter)){ return null; } char[] chs = chineseCharacter.toCharArray(); StringBuilder result = new StringBuilder(); for(int i=0;i<chs.length;i++){ String[] arr = chineseToPinYin(chs[i]); if(arr==null){ result.append(""); }else if(arr.length==1){ result.append(arr[0].charAt(0)); }else if(arr[0].equals(arr[1])){ result.append(arr[0].charAt(0)); }else{ String prim = chineseCharacter.substring(i, i+1); // System.out.println("prim="+prim+"**i="+i); String lst = null,rst = null; if(i<=chineseCharacter.length()-2){ rst = chineseCharacter.substring(i,i+2); } if(i>=1 && i+1<=chineseCharacter.length()){ lst = chineseCharacter.substring(i-1,i+1); } // System.out.println("lst="+lst+"**rst="+rst); String answer = null; for (String py : arr) { if(StringUtils.isEmpty(py)){ continue; } if((lst!=null && py.equals(dictionary.get(lst))) || (rst!=null && py.equals(dictionary.get(rst)))){ answer = py; // System.out.println("get it,answer="+answer+",i="+i+"**break"); break; } if(py.equals(dictionary.get(prim))){ answer = py; // System.out.println("get it,answer="+answer+",i="+i+"**prim="+prim); } } if(answer!=null){ result.append(answer.charAt(0)); }else{ logger.warning("no answer ch="+chs[i]); } } } return result.toString().toLowerCase(); } }
汉字提取首字母(包括多音字处理)
最新推荐文章于 2023-08-16 15:20:09 发布