pinyin4j获取多音字首字母同时保留非中文字符

前情:获取中文的首字母,要求正确识别多音字(例:重庆,重启,重量,成长等),同时需要保留非中文字符

当前pinyin4j的最新版2.5.1里面不支持多音字的正确获取首字母(网上找的解决方案大多数也是当遇到多音字时只取第一个拼音),于是扩展了下它的部分源码,支持多音字的首字母获取。

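使用前需在项目中引入 pinyin4j 2.5.1 依赖(Maven 坐标:com.belerweb:pinyin4j:2.5.1),然后将下面的类放入项目中即可使用。注意:下面的类还用到了 Spring 的 ClassPathResource 和 StringUtils,因此项目同时需要依赖 spring-core。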

以下内容暂时还未经过大量数据测试,后续若发现问题会及时修改

以下表格为修改记录

| 修改时间 | 修改内容 |
| --- | --- |
| 2019-05-28 | 发布 |
| 2020-04-23 | 修复部分情况下获取首字母异常的问题,增加了 py.length() > 0 判断 |
| 2022-07-05 | 支持 classpath 下自定义拼音扩展库 |

如下是重新定义的**PinyinHelper.toHanYuPinyinString()**方法,命名、使用方式与源码一致,使用时需注意正确地导入类名

multi_pinyin.txt是多音字库(pinyin4j源码包里有),可以自己改个名字以及存储路径来扩展里面的多音字,里面并不是全的,比如“重启”需要添加“重启 (chong2,qi3)”才能正确识别

import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import net.sourceforge.pinyin4j.multipinyin.Trie;
import org.springframework.core.io.ClassPathResource;
import org.springframework.util.StringUtils;

import java.io.BufferedInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Locale;

public class PinyinHelper {

    /**
     * Custom pinyin extension dictionary, looked up on the classpath.
     */
    private static final String MULTI_PINYIN_APPENDER = "multi_pinyin_appender.txt";

    /**
     * Converts a string to Hanyu Pinyin using longest-prefix matching against the pinyin trie, so
     * that words present in the multi-pinyin dictionaries get their word-specific readings.
     *
     * <p>Starting at each position the trie is walked as far as it goes and the longest entry that
     * carries a pinyin record wins. Every syllable of a multi-character match is emitted, while a
     * single-character match only emits its first listed reading. Syllables and retained
     * characters are delimited by {@code separate}.
     *
     * @param str          text to convert; {@code null} or empty yields an empty string
     * @param outputFormat tone / ü / case formatting applied to every syllable
     * @param separate     delimiter placed between output tokens; must not be {@code null} and may
     *                     be longer than one character
     * @param retain       {@code true} to copy characters that have no pinyin record into the
     *                     output, {@code false} to drop them
     * @return the converted text
     * @throws BadHanyuPinyinOutputFormatCombination if {@code outputFormat} combines tone marks
     *                                               with the {@code v} or {@code u:} notation
     */
    public static String toHanYuPinyinString(String str, HanyuPinyinOutputFormat outputFormat, String separate, boolean retain) throws BadHanyuPinyinOutputFormatCombination {
        if (str == null || str.isEmpty()) {
            return "";
        }
        ChineseToPinyinResource resource = ChineseToPinyinResource.getInstance();
        StringBuilder resultPinyinStrBuf = new StringBuilder();
        char[] chars = str.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            // Pinyin record of the longest trie entry matched from position i, if any.
            String result = null;
            char ch = chars[i];
            Trie currentTrie = resource.getUnicodeToHanyuPinyinTable();
            // Index of the last character covered by the longest successful match.
            int success = i;
            // Index of the character currently being looked up.
            int current = i;
            do {
                String hexStr = Integer.toHexString((int) ch).toUpperCase();
                currentTrie = currentTrie.get(hexStr);
                if (currentTrie != null) {
                    String pinyin = currentTrie.getPinyin();
                    if (pinyin != null) {
                        result = pinyin;
                        success = current;
                    }
                    currentTrie = currentTrie.getNextTire();
                }
                current++;
                if (current < chars.length) {
                    ch = chars[current];
                } else {
                    break;
                }
            } while (currentTrie != null);

            if (result == null) {
                // No trie entry: the character cannot be converted, so copy it through or drop it.
                if (retain) {
                    if (i != 0 && current != success && !endsWithSeparator(resultPinyinStrBuf, separate)) {
                        resultPinyinStrBuf.append(separate);
                    }
                    resultPinyinStrBuf.append(chars[i]);
                }
            } else {
                String[] pinyinStrArray = resource.parsePinyinString(result);
                if (pinyinStrArray != null) {
                    for (int j = 0; j < pinyinStrArray.length; j++) {
                        if (i != 0 && current != success && !endsWithSeparator(resultPinyinStrBuf, separate)) {
                            resultPinyinStrBuf.append(separate);
                        }
                        resultPinyinStrBuf.append(PinyinFormatter.formatHanyuPinyin(pinyinStrArray[j], outputFormat));
                        // Append a separator unless this is the very last token: either more input
                        // follows, or more syllables of a multi-character match follow.
                        if (current < chars.length || (j < pinyinStrArray.length - 1 && i != success)) {
                            resultPinyinStrBuf.append(separate);
                        }
                        // Single-character match: the record lists alternative readings, keep the first.
                        if (i == success) {
                            break;
                        }
                    }
                }
            }
            // Skip the characters consumed by a multi-character match.
            i = success;
        }
        return resultPinyinStrBuf.toString();
    }

    /**
     * Tells whether appending a separator to {@code buffer} would be redundant: either nothing has
     * been written yet, or the buffer already ends with {@code separate}. Unlike comparing
     * {@code lastIndexOf(separate)} with {@code length() - 1}, this is also correct for separators
     * longer than one character.
     */
    private static boolean endsWithSeparator(StringBuilder buffer, String separate) {
        if (buffer.length() == 0) {
            return true;
        }
        int tailStart = buffer.length() - separate.length();
        return tailStart >= 0 && buffer.lastIndexOf(separate) == tailStart;
    }

    /**
     * Formats a raw pinyin syllable (ASCII letters, {@code u:} for ü, trailing tone number) according
     * to a {@link HanyuPinyinOutputFormat}.
     */
    static class PinyinFormatter {

        /** Vowels that can carry a tone mark; {@code v} stands for ü. Defines the row order below. */
        private static final String UNMARKED_VOWELS = "aeiouv";

        /**
         * Tone-marked forms, five per vowel in {@link #UNMARKED_VOWELS} order: tones 1-4 followed by
         * the unmarked neutral tone. The third tone uses the caron (ǎ ě ǐ ǒ ǔ ǚ) as Hanyu Pinyin
         * prescribes, not the look-alike breve.
         */
        private static final String MARKED_VOWELS = "āáǎàaēéěèeīíǐìiōóǒòoūúǔùuǖǘǚǜü";

        /**
         * Applies tone, ü and case formatting to one pinyin syllable.
         *
         * @param pinyinStr    raw syllable such as {@code "lu:4"} or {@code "zhong1"}
         * @param outputFormat requested tone type, ü representation and case
         * @return the formatted syllable
         * @throws BadHanyuPinyinOutputFormatCombination if tone marks are requested together with
         *                                               the {@code v} or {@code u:} notation
         */
        static String formatHanyuPinyin(String pinyinStr, HanyuPinyinOutputFormat outputFormat)
                throws BadHanyuPinyinOutputFormatCombination {
            final HanyuPinyinToneType toneType = outputFormat.getToneType();
            final HanyuPinyinVCharType vCharType = outputFormat.getVCharType();

            final boolean asciiUmlaut = HanyuPinyinVCharType.WITH_V == vCharType
                    || HanyuPinyinVCharType.WITH_U_AND_COLON == vCharType;
            if (HanyuPinyinToneType.WITH_TONE_MARK == toneType && asciiUmlaut) {
                throw new BadHanyuPinyinOutputFormatCombination("tone marks cannot be added to v or u:");
            }

            String formatted = pinyinStr;
            if (HanyuPinyinToneType.WITHOUT_TONE == toneType) {
                formatted = formatted.replaceAll("[1-5]", "");
            } else if (HanyuPinyinToneType.WITH_TONE_MARK == toneType) {
                // The tone-mark converter only understands 'v' as the ü placeholder.
                formatted = convertToneNumber2ToneMark(formatted.replace("u:", "v"));
            }

            if (HanyuPinyinVCharType.WITH_V == vCharType) {
                formatted = formatted.replace("u:", "v");
            } else if (HanyuPinyinVCharType.WITH_U_UNICODE == vCharType) {
                formatted = formatted.replace("u:", "ü");
            }

            if (HanyuPinyinCaseType.UPPERCASE == outputFormat.getCaseType()) {
                // Locale.ROOT keeps 'i' -> 'I' even under locales with special casing rules (e.g. Turkish).
                formatted = formatted.toUpperCase(Locale.ROOT);
            }
            return formatted;
        }

        /**
         * Converts a syllable with a trailing tone number into its tone-marked Unicode form.
         *
         * <p>The mark is placed on the first {@code a} or {@code e} if present, otherwise on the
         * {@code o} of {@code ou}, otherwise on the last vowel of the syllable.
         *
         * @param pinyinStr ASCII syllable, optionally ending in a tone number 1-5, with {@code v}
         *                  standing for ü
         * @return the lower-cased syllable with the tone mark applied; a syllable without a tone
         * number only has {@code v} replaced by ü; a syllable that is not of the form
         * {@code [a-z]*[1-5]?} or contains no vowel is returned lower-cased but otherwise unchanged
         */
        private static String convertToneNumber2ToneMark(final String pinyinStr) {
            final String syllable = pinyinStr.toLowerCase(Locale.ROOT);

            if (!syllable.matches("[a-z]*[1-5]?")) {
                // Bad format: leave it alone.
                return syllable;
            }

            final int lastIndex = syllable.length() - 1;
            if (lastIndex < 0 || !Character.isDigit(syllable.charAt(lastIndex))) {
                // No tone number: only the ü placeholder needs replacing.
                return syllable.replace("v", "ü");
            }
            final int toneNumber = syllable.charAt(lastIndex) - '0';

            int vowelIndex = syllable.indexOf('a');
            if (vowelIndex < 0) {
                vowelIndex = syllable.indexOf('e');
            }
            if (vowelIndex < 0) {
                vowelIndex = syllable.indexOf("ou");
            }
            if (vowelIndex < 0) {
                for (int i = lastIndex - 1; i >= 0; i--) {
                    if (UNMARKED_VOWELS.indexOf(syllable.charAt(i)) >= 0) {
                        vowelIndex = i;
                        break;
                    }
                }
            }
            if (vowelIndex < 0) {
                // No vowel to carry the mark (e.g. "ng2"): return the numbered form.
                return syllable;
            }

            final int row = UNMARKED_VOWELS.indexOf(syllable.charAt(vowelIndex));
            final char markedVowel = MARKED_VOWELS.charAt(row * 5 + (toneNumber - 1));

            return syllable.substring(0, vowelIndex).replace("v", "ü")
                    + markedVowel
                    + syllable.substring(vowelIndex + 1, lastIndex).replace("v", "ü");
        }

    }


    /**
     * Lazily created singleton that owns the trie mapping Unicode code points (as upper-case hex
     * strings) to Hanyu Pinyin records of the form {@code (pin1,yin2)}.
     */
    static class ChineseToPinyinResource {

        /**
         * Trie of Unicode-to-pinyin records, including multi-character word entries.
         */
        private Trie unicodeToHanyuPinyinTable = null;

        /**
         * @param unicodeToHanyuPinyinTable The unicodeToHanyuPinyinTable to set.
         */
        private void setUnicodeToHanyuPinyinTable(Trie unicodeToHanyuPinyinTable) {
            this.unicodeToHanyuPinyinTable = unicodeToHanyuPinyinTable;
        }

        /**
         * @return Returns the unicodeToHanyuPinyinTable.
         */
        Trie getUnicodeToHanyuPinyinTable() {
            return unicodeToHanyuPinyinTable;
        }

        /**
         * Private constructor as part of the singleton pattern.
         */
        private ChineseToPinyinResource() {
            initializeResource();
        }

        /**
         * Builds the trie from the bundled dictionaries, then the optional classpath extension
         * dictionary, then pinyin4j's own extension mechanism.
         *
         * <p>Loading is best-effort: an I/O failure is reported on standard error and whatever was
         * loaded up to that point stays in use.
         */
        private void initializeResource() {
            final String resourceName = "/pinyindb/unicode_to_hanyu_pinyin.txt";
            final String resourceMultiName = "/pinyindb/multi_pinyin.txt";

            final Trie table = new Trie();
            setUnicodeToHanyuPinyinTable(table);
            try {
                // try-with-resources guarantees every stream is closed even if loading fails midway.
                try (BufferedInputStream in = ResourceHelper.getResourceInputStream(resourceName)) {
                    table.load(in);
                }
                try (BufferedInputStream in = ResourceHelper.getResourceInputStream(resourceMultiName)) {
                    table.loadMultiPinyin(in);
                }

                // Additional multi-pinyin dictionary supplied on the classpath, if present.
                if (StringUtils.hasLength(MULTI_PINYIN_APPENDER)) {
                    ClassPathResource pathResource = new ClassPathResource(MULTI_PINYIN_APPENDER);
                    if (pathResource.exists()) {
                        try (BufferedInputStream in = new BufferedInputStream(pathResource.getInputStream())) {
                            table.loadMultiPinyin(in);
                        }
                    }
                }

                // pinyin4j's original extension dictionary; only supports an absolute path.
                table.loadMultiPinyinExtend();

            } catch (IOException ex) {
                // Also covers FileNotFoundException, which is an IOException.
                ex.printStackTrace();
            }
        }

        /**
         * @param ch character to look up
         * @return the trie node registered for {@code ch}, or {@code null} if there is none
         */
        Trie getHanyuPinyinTrie(char ch) {
            String codepointHexStr = Integer.toHexString((int) ch).toUpperCase();
            return getUnicodeToHanyuPinyinTable().get(codepointHexStr);
        }

        /**
         * Get the unformatted Hanyu Pinyin representations of the given Chinese
         * character in array format.
         *
         * @param ch given Chinese character in Unicode
         * @return The Hanyu Pinyin strings of the given Chinese character in array
         * format; return null if there is no corresponding Pinyin string.
         */
        String[] getHanyuPinyinStringArray(char ch) {
            String pinyinRecord = getHanyuPinyinRecordFromChar(ch);
            return parsePinyinString(pinyinRecord);
        }

        /**
         * Splits a record such as {@code (chong2,qi3)} into its comma-separated syllables.
         *
         * @param pinyinRecord raw record, may be {@code null}
         * @return the syllables between the first {@code (} and the last {@code )}; {@code null}
         * if the record is {@code null} or its brackets do not delimit a valid range
         */
        String[] parsePinyinString(String pinyinRecord) {
            if (null == pinyinRecord) {
                return null;
            }

            int contentStart = pinyinRecord.indexOf(Field.LEFT_BRACKET) + Field.LEFT_BRACKET.length();
            int contentEnd = pinyinRecord.lastIndexOf(Field.RIGHT_BRACKET);
            if (contentEnd < contentStart) {
                // Mal-formatted record (e.g. no closing bracket): report "no pinyin" rather than
                // failing with StringIndexOutOfBoundsException.
                return null;
            }

            return pinyinRecord.substring(contentStart, contentEnd).split(Field.COMMA);
        }

        /**
         * @param record given record string of Hanyu Pinyin
         * @return return true if record is not null and record is not "none0" and
         * record is not mal-formatted, else return false
         */
        private boolean isValidRecord(String record) {
            final String noneStr = "(none0)";

            return (null != record) && !record.equals(noneStr) && record.startsWith(Field.LEFT_BRACKET)
                    && record.endsWith(Field.RIGHT_BRACKET);
        }

        /**
         * @param ch given Chinese character in Unicode
         * @return corresponding Hanyu Pinyin record; null if no valid record is found
         */
        private String getHanyuPinyinRecordFromChar(char ch) {
            String codepointHexStr = Integer.toHexString(ch).toUpperCase();

            Trie trie = getUnicodeToHanyuPinyinTable().get(codepointHexStr);
            String foundRecord = null;
            if (trie != null) {
                foundRecord = trie.getPinyin();
            }

            return isValidRecord(foundRecord) ? foundRecord : null;
        }

        /**
         * Singleton factory method.
         *
         * @return the one and only instance
         */
        static ChineseToPinyinResource getInstance() {
            return ChineseToPinyinResourceHolder.THE_INSTANCE;
        }

        /**
         * Singleton implementation helper: the instance is created when this holder class is
         * initialized.
         */
        private static class ChineseToPinyinResourceHolder {
            static final ChineseToPinyinResource THE_INSTANCE = new ChineseToPinyinResource();
        }

        /**
         * String constants delimiting the syllables inside a pinyin record.
         *
         * @author Li Min (xmlerlimin@gmail.com)
         */
        static final class Field {
            static final String LEFT_BRACKET = "(";

            static final String RIGHT_BRACKET = ")";

            static final String COMMA = ",";

            private Field() {
            }
        }

    }


    /**
     * Small helper for opening resources relative to this class' classpath.
     */
    static class ResourceHelper {

        /**
         * Opens a classpath resource and wraps it in a buffer.
         *
         * @param resourceName name passed to {@link Class#getResourceAsStream(String)}
         * @return a {@link BufferedInputStream} around the resource stream; the wrapper is returned
         * even when the lookup itself yields no stream
         */
        static BufferedInputStream getResourceInputStream(String resourceName) {
            final Class<ResourceHelper> anchor = ResourceHelper.class;
            final BufferedInputStream buffered =
                    new BufferedInputStream(anchor.getResourceAsStream(resourceName));
            return buffered;
        }
    }
}

下面是使用方式: 里面用到了google的guava包的部分内容

import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

import java.util.List;

/**
 * Pinyin utility: converts text to full pinyin or to pinyin initials, optionally keeping the
 * characters that have no pinyin.
 */
public class PinyinUtil {

    /** Shared output format: ü written as {@code v}, no tone information. */
    private static final HanyuPinyinOutputFormat OUTPUT_FORMAT;

    /**
     * Token delimiter requested from {@link PinyinHelper}. A control character is used instead of
     * a printable one so that ordinary input characters (for example {@code #}) are not mistaken
     * for delimiters and lost.
     */
    private static final String SEPARATE = "\u0001";

    static {
        OUTPUT_FORMAT = new HanyuPinyinOutputFormat();
        OUTPUT_FORMAT.setVCharType(HanyuPinyinVCharType.WITH_V);
        OUTPUT_FORMAT.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
    }

    /**
     * Returns the pinyin of a text.
     *
     * @param str     text to convert; {@code null} or empty yields an empty string
     * @param retain  {@code true} to keep characters other than Chinese in the result
     * @param initial {@code true} to output only the first letter of every syllable
     * @return the pinyin (or pinyin initials) of {@code str}
     * @throws IllegalStateException if the built-in output format is rejected by the formatter,
     *                               which indicates a programming error in this class
     */
    public static String toPinYinString(String str, boolean retain, boolean initial) {
        if (str == null || str.isEmpty()) {
            return "";
        }

        final String pinyin;
        try {
            pinyin = PinyinHelper.toHanYuPinyinString(str, OUTPUT_FORMAT, SEPARATE, retain);
        } catch (BadHanyuPinyinOutputFormatCombination e) {
            throw new IllegalStateException("Invalid built-in pinyin output format", e);
        }

        StringBuilder sb = new StringBuilder(pinyin.length());
        // Every token is either one syllable or one retained character, so taking the first
        // character is correct for both kinds. No lookup of the original non-Chinese runs is
        // needed, and a syllable can no longer be confused with an identical run in the input.
        for (String token : Splitter.on(SEPARATE).split(pinyin)) {
            if (token.isEmpty()) {
                continue;
            }
            if (initial) {
                sb.append(token.charAt(0));
            } else {
                sb.append(token);
            }
        }
        return sb.toString();
    }

}

下面是临时测试结果:

		String str = "成长,重启,重量,长大了,角色,角落,呼啦啦,1我2,3爱4,5你6";
        // Each call is followed by the output recorded by the author for that flag combination.
        // retain = true, initial = true: keep non-Chinese characters, first letters only.
        System.out.println(PinyinUtil.toPinYinString(str, true, true));
        // cz,cq,zl,zdl,js,jl,hll,1w2,3a4,5n6
        // retain = false, initial = true: drop non-Chinese characters, first letters only.
        System.out.println(PinyinUtil.toPinYinString(str, false, true));
        // czcqzlzdljsjlhllwan
        // retain = true, initial = false: keep non-Chinese characters, full pinyin.
        System.out.println(PinyinUtil.toPinYinString(str, true, false));
        // chengzhang,chongqi,zhongliang,zhangdale,juese,jiaoluo,hulala,1wo2,3ai4,5ni6
        // retain = false, initial = false: drop non-Chinese characters, full pinyin.
        System.out.println(PinyinUtil.toPinYinString(str, false, false));
        // chengzhangchongqizhongliangzhangdalejuesejiaoluohulalawoaini
  • 1
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 5
    评论
评论 5
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值