工作需求要将汉字转换成拼音,因此自定义了一个 Hive UDF 函数:
import net.sourceforge.pinyin4j.PinyinHelper;
import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import org.apache.hadoop.hive.ql.exec.Description;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
import com.sun.tools.javadoc.Main;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Hive UDF that converts the Chinese characters of a string into pinyin.
 *
 * <p>The output format is fixed: lowercase, no tone marks, {@code WITH_V} for the "ü" vowel,
 * and an empty separator between the converted characters.
 *
 * <p>Known limitation: polyphonic characters are not disambiguated by context, so a
 * reading may be wrong for the word it appears in (for example "着急" comes out as
 * "zheji").
 */
@Description(name = "pinyin"
, value = "_FUNC_(string) - get pinyin by given chinese."
, extended = "Example:\n > select _FUNC_(string) from src;")
public class UDFChineseToPinYin extends UDF {

    /** Output format shared by every call; it is configured once and never modified afterwards. */
    private static final HanyuPinyinOutputFormat PINYIN_FORMAT = newPinyinFormat();

    /** Reused output holder so that a new {@link Text} is not allocated for every row. */
    private final Text result = new Text();

    public UDFChineseToPinYin() {
    }

    /**
     * UDF entry point.
     *
     * @param chinese the text to convert; may be {@code null}
     * @return the pinyin form of {@code chinese}, or {@code null} when the input is
     *         {@code null} or the conversion fails
     */
    public Text evaluate(Text chinese) {
        if (chinese == null) {
            return null;
        }
        String pinyin = ConvertToPinyin(chinese.toString());
        if (pinyin == null) {
            // Do not hand a null String to Text.set(); surface the failure as NULL instead.
            return null;
        }
        result.set(pinyin);
        return result;
    }

    /**
     * Converts {@code name} to pinyin using the fixed output format of this class.
     *
     * @param name the string to convert; may be {@code null}
     * @return the converted string, or {@code null} when {@code name} is {@code null} or
     *         pinyin4j rejects the output format
     */
    public String ConvertToPinyin(String name) {
        if (name == null) {
            return null;
        }
        try {
            return PinyinHelper.toHanyuPinyinString(name, PINYIN_FORMAT, "");
        } catch (BadHanyuPinyinOutputFormatCombination ignored) {
            // The format is a constant of this class, so this is not expected in practice;
            // keep the original contract of reporting a failed conversion as null.
            return null;
        }
    }

    /** Builds the lowercase / no-tone / WITH_V output format used for every conversion. */
    private static HanyuPinyinOutputFormat newPinyinFormat() {
        HanyuPinyinOutputFormat format = new HanyuPinyinOutputFormat();
        format.setCaseType(HanyuPinyinCaseType.LOWERCASE);
        format.setToneType(HanyuPinyinToneType.WITHOUT_TONE);
        format.setVCharType(HanyuPinyinVCharType.WITH_V);
        return format;
    }

    /** Manual smoke test: converts two sample sentences and prints the resulting pinyin. */
    public static void main(String[] args) {
        UDFChineseToPinYin udfpy = new UDFChineseToPinYin();
        System.out.println(udfpy.evaluate(new Text("你好,祖玛朗玛峰")));
        System.out.println(udfpy.evaluate(new Text("你好,不着调,着急")));
    }
}
//测试结果如下:可以看到,遇到多音字时会得到错误的读音(例如"不着调""着急"中的"着"应读 zhao,却被转换成了 zhe)。
nihao,zumalangmafeng
nihao,buzhediao,zheji