使用Java写一个Hive的UDF将中文转为拼音【借助pinyin4j-2.5.1】_udf 中文字符串转为拼音首字母-CSDN博客

本文链接：https://blog.csdn.net/qq_41990268/article/details/129827317

使用Java写一个Hive的UDF将中文转为拼音【借助pinyin4j-2.5.1】

背景

数仓项目中，遇到一个古人的Oracle SQL，大体上是这么写的：

select to_char(rawtohex(nlssort(表.字段, 'NLS_SORT=SCHINESE_PINYIN_M'))) as 排序字段 from dual

虽然搞不懂古人到底有多少种神奇的想法，但是用大数据技术取代传统的数据库开发已经是大势所趋，自然要想办法平替掉它。考虑到这货主要是做排序用的，简单试了下直接对中文字段在HQL做order by，效果不是很理想，故需要考虑写个Hive的UDF实现类似的功能。不一定要严格的一致，大概可以排序即可。

原理

Hive的UDF

参照：https://lizhiyong.blog.csdn.net/article/details/126186377

或者简单参照：https://lizhiyong.blog.csdn.net/article/details/129220107

套路还是比较简单，重写个evaluate方法打Jar包注册到Hive即可，简单功能，注重算法实现部分即可。

中文转拼音

别人已经写了一个jar包并且放在Maven仓库，就不必自己造轮子了。笔者使用pinyin4j来实现。

/**
 * This file is part of pinyin4j (http://sourceforge.net/projects/pinyin4j/) and distributed under
 * GNU GENERAL PUBLIC LICENSE (GPL).
 * <p>
 * pinyin4j is free software; you can redistribute it and/or modify it under the terms of the GNU
 * General Public License as published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 * <p>
 * pinyin4j is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 * <p>
 * You should have received a copy of the GNU General Public License along with pinyin4j.
 */

package net.sourceforge.pinyin4j;

import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import net.sourceforge.pinyin4j.multipinyin.Trie;

/**
 * A class provides several utility functions to convert Chinese characters
 * (both Simplified and Traditional) into various Chinese Romanization
 * representations.
 * <p>
 * NOTE: this listing is an excerpt; some members referenced below (for example
 * {@code getUnformattedHanyuPinyinStringArray}) are not shown here.
 *
 * @author Li Min (xmlerlimin@gmail.com)
 */
public class PinyinHelper {

  // Neither constant is referenced by the members visible in this excerpt.
  private static final String[] ARR_EMPTY = {};
  private static final String EMPTY = "";

  /**
   * Get all unformatted Hanyu Pinyin presentations of a single Chinese
   * character (both Simplified and Traditional)
   * <p>
   * <p>
   * For example, <br/> If the input is '间', the return will be an array with
   * two Hanyu Pinyin strings: <br/> "jian1" <br/> "jian4" <br/> <br/> If the
   * input is '李', the return will be an array with single Hanyu Pinyin
   * string: <br/> "li3"
   * <p>
   * <p>
   * <b>Special Note</b>: If the return is "none0", that means the input
   * Chinese character exists in Unicode CJK table, however, it has no
   * pronunciation in Chinese
   * <p>
   * This method simply delegates to {@code getUnformattedHanyuPinyinStringArray},
   * which is not part of this excerpt.
   *
   * @param ch the given Chinese character
   * @return a String array contains all unformatted Hanyu Pinyin
   * presentations with tone numbers; null for non-Chinese character
   */
  static public String[] toHanyuPinyinStringArray(char ch) {
    return getUnformattedHanyuPinyinStringArray(ch);
  }


  /**
   * Get a string which all Chinese characters are replaced by corresponding
   * main (first) Hanyu Pinyin representation.
   * <p>
   * The input is scanned left to right. Starting at each position, the lookup
   * trie is walked character by character and the longest entry that carries a
   * pinyin value wins, so a multi-character entry takes precedence over the
   * single-character entry of its first character.
   * <p>
   * <b>Special Note</b>: If the return contains "none0", that means that
   * Chinese character is in Unicode CJK table, however, it has not
   * pronunciation in Chinese. <b> This interface will be removed in next
   * release. </b>
   *
   * @param str          A given string contains Chinese characters; must not be
   *                     null because {@code str.toCharArray()} is called
   *                     unconditionally
   * @param outputFormat Describes the desired format of returned Hanyu Pinyin string
   * @param separate     The string is appended after a Chinese character (excluding
   *                     the last Chinese character at the end of sentence). <b>Note!
   *                     Separate will not appear after a non-Chinese character</b>
   * @param retain       Retain the characters that cannot be converted into pinyin characters
   * @return a String identical to the original one but all recognizable
   * Chinese characters are converted into main (first) Hanyu Pinyin
   * representation
   * @throws BadHanyuPinyinOutputFormatCombination declared by this method; the
   *                     visible code does not throw it directly, so it presumably
   *                     originates from {@code PinyinFormatter.formatHanyuPinyin}
   */
  static public String toHanYuPinyinString(String str, HanyuPinyinOutputFormat outputFormat,
      String separate, boolean retain) throws BadHanyuPinyinOutputFormatCombination {
    ChineseToPinyinResource resource = ChineseToPinyinResource.getInstance();
    StringBuilder resultPinyinStrBuf = new StringBuilder();

    char[] chars = str.toCharArray();

    for (int i = 0; i < chars.length; i++) {
      // Pinyin of the longest entry matched so far for the scan starting at i.
      String result = null;
      char ch = chars[i];
      // Every scan restarts from the root of the lookup trie.
      Trie currentTrie = resource.getUnicodeToHanyuPinyinTable();
      // success: index of the last character belonging to the longest match.
      // current: index of the character currently being examined.
      int success = i;
      int current = i;
      do {
        // Trie keys are the upper-case hexadecimal value of the char.
        String hexStr = Integer.toHexString((int) ch).toUpperCase();
        currentTrie = currentTrie.get(hexStr);
        if (currentTrie != null) {
          if (currentTrie.getPinyin() != null) {
            // This node carries a pinyin value: remember it as the best match.
            result = currentTrie.getPinyin();
            success = current;
          }
          // Descend one level so the next character can extend the match.
          currentTrie = currentTrie.getNextTire();
        }
        // current advances even when the lookup above failed, so after the
        // loop it may point past success + 1.
        current++;
        if (current < chars.length)
          ch = chars[current];
        else
          break;
      } while (currentTrie != null);

      if (result == null) {
        // Nothing matched in the trie, so the character cannot be converted to
        // pinyin: either copy it through unchanged or drop it.
        if (retain) resultPinyinStrBuf.append(chars[i]);
      } else {
        String[] pinyinStrArray = resource.parsePinyinString(result);
        if (pinyinStrArray != null) {
          for (int j = 0; j < pinyinStrArray.length; j++) {
            resultPinyinStrBuf.append(PinyinFormatter.formatHanyuPinyin(pinyinStrArray[j],
                outputFormat));
            // Append the separator when the scan did not reach the end of the
            // input, or when this is a multi-character match and this is not
            // its last pinyin.
            // NOTE(review): the first test uses `current` (where the scan
            // stopped) rather than `success`; when the scan ran to the end of
            // the input while matching an earlier character, no separator is
            // appended after that character. Confirm whether this is intended.
            if (current < chars.length || (j < pinyinStrArray.length - 1 && i != success)) {
              resultPinyinStrBuf.append(separate);
            }
            // A single-character match emits only its first (main) pinyin.
            if (i == success) break;
          }
        }
      }
      // Skip the characters consumed by the match; the loop's i++ then moves
      // to the first unconsumed character.
      i = success;
    }

    return resultPinyinStrBuf.toString();
  }

  // ! Hidden constructor
  private PinyinHelper() {}
}

由于需要的是一对一的结果（一个输入字符串对应一个拼音字符串），所以有用的必然是上面的 toHanYuPinyinString 这个方法。它依赖的资源类如下：

/**
 * This file is part of pinyin4j (http://sourceforge.net/projects/pinyin4j/) and distributed under
 * GNU GENERAL PUBLIC LICENSE (GPL).
 * <p/>
 * pinyin4j is free software; you can redistribute it and/or modify it under the terms of the GNU
 * General Public License as published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 * <p/>
 * pinyin4j is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
 * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 * <p/>
 * You should have received a copy of the GNU General Public License along with pinyin4j.
 */

/**
 *
 */
package net.sourceforge.pinyin4j;

import net.sourceforge.pinyin4j.multipinyin.Trie;

import java.io.FileNotFoundException;
import java.io.IOException;

/**
 * Holds the external lookup data that {@code PinyinHelper} needs.
 * <p>
 * NOTE: this listing is an excerpt; members such as the setter for the lookup
 * table are referenced below but not shown here.
 *
 * @author Li Min (xmlerlimin@gmail.com)
 */
class ChineseToPinyinResource {
  /**
   * Trie holding the Unicode-to-Hanyu-Pinyin data loaded by
   * {@link #initializeResource()}.
   */
  private Trie unicodeToHanyuPinyinTable;


  /**
   * @return the trie holding the Unicode-to-Hanyu-Pinyin data
   */
  Trie getUnicodeToHanyuPinyinTable() {
    return unicodeToHanyuPinyinTable;
  }


  /**
   * Installs a fresh trie and loads it from the two bundled text resources,
   * followed by the extension step {@code loadMultiPinyinExtend()}.
   * <p>
   * I/O failures are not propagated; their stack trace is printed instead.
   */
  private void initializeResource() {
    final String singleCharResource = "/pinyindb/unicode_to_hanyu_pinyin.txt";
    final String multiCharResource = "/pinyindb/multi_pinyin.txt";

    try {
      setUnicodeToHanyuPinyinTable(new Trie());

      // Load the first resource, then the multi-pinyin resource.
      getUnicodeToHanyuPinyinTable()
          .load(ResourceHelper.getResourceInputStream(singleCharResource));
      getUnicodeToHanyuPinyinTable()
          .loadMultiPinyin(ResourceHelper.getResourceInputStream(multiCharResource));

      getUnicodeToHanyuPinyinTable().loadMultiPinyinExtend();
    } catch (IOException ex) {
      // FileNotFoundException is an IOException, so this one handler covers
      // both cases with the same reaction.
      ex.printStackTrace();
    }
  }

  /**
   * Looks up the trie node registered for a single character.
   *
   * @param ch the character to look up
   * @return whatever the lookup table returns for the upper-case hexadecimal
   * value of {@code ch}
   */
  Trie getHanyuPinyinTrie(char ch) {
    // Keys in the table are the upper-case hex form of the char value.
    return getUnicodeToHanyuPinyinTable().get(Integer.toHexString(ch).toUpperCase());
  }


}

从这里可以看到做初始化时用到了2个txt的资源文件，其中/pinyindb/multi_pinyin.txt存放着词组：

在这里插入图片描述

而 /pinyindb/unicode_to_hanyu_pinyin.txt存放着2w多个常用汉字的hex及拼音的对应关系：

在这里插入图片描述

所以可以判断出pinyin4j的原理：当匹配到词组时就可以使用词组的拼音。而匹配不到时就会按照单字的方式去查对应关系。

由于年代久远，词组可能不全，或者单字的拼音写错了。。。多音字的话，如果不在词组字典里就可能搞错。。。必要的时候就需要手动修改这2个字典文件了。。。

虽然Hive的UDF也是可以连外网的，就像笔者写Flink有时候也会给map算子调用baidu或者gaode的API把经纬度换算成地标。。。但是只做个简单排序，貌似不是很有必要接chat gpt或者别的什么API，凑合着用。。。

实现

pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven module for the Hive UDF experiments; inherits from the zhiyong_study parent. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <parent>
        <artifactId>zhiyong_study</artifactId>
        <groupId>com.zhiyong</groupId>
        <version>1.0.0</version>
    </parent>
    <modelVersion>4.0.0</modelVersion>

    <artifactId>hive_study</artifactId>

    <!-- Repository locations, in order: aliyun, cloudera, apache. -->
    <repositories>
        <repository>
            <id>aliyun</id>
            <!-- HTTPS is required: Maven 3.8.1+ blocks plain-HTTP repositories by default. -->
            <url>https://maven.aliyun.com/repository/public</url>
        </repository>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
        <repository>
            <id>apache.snapshots</id>
            <name>Apache Development Snapshot Repository</name>
            <url>https://repository.apache.org/content/repositories/snapshots/</url>
        </repository>
    </repositories>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <hive-exec.version>3.1.2</hive-exec.version>
        <hive-jdbc.version>3.1.2</hive-jdbc.version>
        <hive-metastore.version>3.1.2</hive-metastore.version>
        <hive-common.version>3.1.2</hive-common.version>
        <hive-service.version>3.1.2</hive-service.version>
        <lombok-version>1.18.24</lombok-version>
        <encoding>UTF-8</encoding>
    </properties>

    <dependencies>
        <!-- Every Hive artifact excludes org.glassfish:javax.el, which the author found problematic. -->
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-exec</artifactId>
            <version>${hive-exec.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.glassfish</groupId>
                    <artifactId>javax.el</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-jdbc</artifactId>
            <version>${hive-jdbc.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.glassfish</groupId>
                    <artifactId>javax.el</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-metastore</artifactId>
            <version>${hive-metastore.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.glassfish</groupId>
                    <artifactId>javax.el</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-common</artifactId>
            <version>${hive-common.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.glassfish</groupId>
                    <artifactId>javax.el</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.hive</groupId>
            <artifactId>hive-service</artifactId>
            <version>${hive-service.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.glassfish</groupId>
                    <artifactId>javax.el</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <!-- Lombok version comes from the lombok-version property so it is declared in one place. -->
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>${lombok-version}</version>
            <scope>provided</scope>
        </dependency>

        <!-- Chinese-to-pinyin conversion library used by the UDF. -->
        <dependency>
            <groupId>com.belerweb</groupId>
            <artifactId>pinyin4j</artifactId>
            <version>2.5.1</version>
        </dependency>

    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.2</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>UTF-8</encoding>
                </configuration>
            </plugin>
        </plugins>
    </build>

</project>

排除一些有问题的依赖，并依赖pinyin4j即可。

在这个Maven仓库：https://mvnrepository.com/artifact/com.belerweb/pinyin4j

在这里插入图片描述

可以看到这个Jar包很古老了，笔者上学的时候就有了。。。

Java

简单验证下效果：

package com.zhiyong;

import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import org.apache.hadoop.hive.ql.exec.UDF;

/**
 * Console demo for the zhiyong_study project: runs a handful of sample Chinese
 * strings through {@link PingyingUdf} (a Chinese-to-pinyin UDF built on
 * pinyin4j) and prints each input next to its result.
 *
 * @author zhiyong (created 2023-03-28 21:16)
 */
public class PingYinUdfDemo {
    /**
     * Prints one {@code input + result = <input>/<pinyin>} line per sample.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Samples include several polyphonic characters (省, 差, 重) on purpose.
        String[] inputs = {
                "数码宝贝",
                "饕餮",
                "机械暴龙兽",
                "战斗暴龙兽",
                "省事",
                "省悟",
                "差不多",
                "差旅",
                "重点",
                "重启"
        };

        PingyingUdf pingyingUdf = new PingyingUdf();

        for (String input : inputs) {
            System.out.println("input + result = " + input + "/" + pingyingUdf.evaluate(input));
        }

        /*
         * Output recorded by the author (tone numbers enabled):
         *
         * input + result = 数码宝贝/shu4#ma3#bao3bei4
         * input + result = 饕餮/tao1#tie4
         * input + result = 机械暴龙兽/ji1#xie4#bao4#long2shou4
         * input + result = 战斗暴龙兽/zhan4#dou4#bao4#long2shou4
         * input + result = 省事/sheng3shi4
         * input + result = 省悟/xing3#wu4
         * input + result = 差不多/cha4#bu4#duo1
         * input + result = 差旅/cha1lu:3
         * input + result = 重点/zhong4dian3
         * input + result = 重启/zhong4qi3
         *
         * Known issue: polyphonic characters are not always resolved correctly.
         */
    }
}

/**
 * Hive UDF that converts the Chinese characters of a string into lower-case
 * Hanyu Pinyin with tone numbers (for example {@code 饕餮 -> tao1#tie4}), so the
 * result can serve as an approximate pinyin sort key.
 * <p>
 * Characters that pinyin4j cannot convert are kept unchanged in the output.
 */
class PingyingUdf extends UDF {
    /** Separator handed to pinyin4j; it decides where the separator is actually inserted. */
    private static final String SEPARATOR = "#";

    /** Output format, built once per UDF instance instead of once per row. */
    private final HanyuPinyinOutputFormat format = createOutputFormat();

    /**
     * Converts one value to pinyin.
     *
     * @param input the text to convert; may be null
     * @return the converted text, {@code null} when {@code input} is null, or the
     * literal {@code "Error"} when pinyin4j rejects the output format
     */
    public String evaluate(String input) {
        // pinyin4j calls input.toCharArray() unconditionally, so a null input
        // would throw a NullPointerException; return null for it instead.
        if (input == null) {
            return null;
        }

        try {
            return PinyinHelper.toHanYuPinyinString(input, format, SEPARATOR, true);
        } catch (BadHanyuPinyinOutputFormatCombination e) {
            // Kept for backward compatibility: callers receive the sentinel
            // value the original implementation returned.
            return "Error";
        }
    }

    /**
     * Builds the output format: lower case, tone numbers, default rendering of "ü".
     */
    private static HanyuPinyinOutputFormat createOutputFormat() {
        HanyuPinyinOutputFormat outputFormat = new HanyuPinyinOutputFormat();
        outputFormat.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        // Alternatives the author tried:
        //  - HanyuPinyinToneType.WITHOUT_TONE drops the tone digits.
        //  - HanyuPinyinToneType.WITH_TONE_MARK gave no usable output with this
        //    configuration (presumably it needs a different V-char type; verify
        //    against the pinyin4j documentation).
        //  - HanyuPinyinVCharType.WITH_V would render "ü" as "v" instead of "u:".
        outputFormat.setToneType(HanyuPinyinToneType.WITH_TONE_NUMBER);
        return outputFormat;
    }
}

效果

input + result = 数码宝贝/shu4#ma3#bao3bei4
input + result = 饕餮/tao1#tie4
input + result = 机械暴龙兽/ji1#xie4#bao4#long2shou4
input + result = 战斗暴龙兽/zhan4#dou4#bao4#long2shou4
input + result = 省事/sheng3shi4
input + result = 省悟/xing3#wu4
input + result = 差不多/cha4#bu4#duo1
input + result = 差旅/cha1lu:3
input + result = 重点/zhong4dian3
input + result = 重启/zhong4qi3

Process finished with exit code 0