用JAVA实现汉字转拼音缩写（两种方式介绍）

最新推荐文章于 2024-07-18 05:00:00 发布

fjfdszj

最新推荐文章于 2024-07-18 05:00:00 发布

阅读量3k

点赞数

文章标签： java string exception table byte c

本文链接：https://blog.csdn.net/fjfdszj/article/details/4320262

版权

// 1. Java program that converts Chinese characters to pinyin initials. Excerpted from:
//    http://www.blogjava.net/jiafang83/archive/2009/06/08/280651.html
//    Author's note: I have been building a lookup keyed on pinyin initials; of the
//    implementations found online this one seemed the best, although I did not know
//    how it works.

/**
 * Converts Chinese text to the initial letters of its pinyin.
 *
 * <p>Each character is encoded as GBK and its two-byte code is looked up in a table of
 * range boundaries: inside the GB2312 level-1 area (0xB0A1-0xD7F9) characters are sorted
 * by pinyin, so every initial letter owns one contiguous code range.
 */
public class ChnToPinYin {

    /**
     * Charset used to obtain the two-byte code. It is named explicitly because the
     * boundary table below is only meaningful for GB2312/GBK codes, not for whatever
     * the platform default charset happens to be.
     */
    private static final java.nio.charset.Charset GBK =
            java.nio.charset.Charset.forName("GBK");

    /** Exclusive upper bound of each code range; entry k pairs with LETTERS.charAt(k). */
    private static final int[] UPPER_BOUNDS = {
        0xB0A1, 0xB0C5, 0xB2C1, 0xB4EE, 0xB6EA, 0xB7A2, 0xB8C1, 0xB9FE,
        0xBBF7, 0xBFA6, 0xC0AC, 0xC2E8, 0xC4C3, 0xC5B6, 0xC5BE, 0xC6DA,
        0xC8BB, 0xC8F6, 0xCBFA, 0xCDDA, 0xCEF4, 0xD1B9, 0xD4D1, 0xD7FA
    };

    /**
     * Result for each range. The leading '*' covers every code below the first level-1
     * character; i, u and v are absent because no pinyin syllable starts with them.
     */
    private static final String LETTERS = "*abcdefghjklmnopqrstwxyz";

    /**
     * Converts a string to its pinyin abbreviation.
     *
     * <p>Printable ASCII (codes 33 to 126) is copied unchanged; every other character is
     * replaced by the result of {@link #getPYChar(String)}.
     *
     * @param str the text to convert
     * @return the abbreviation, one output character per input character
     * @throws NullPointerException if {@code str} is null
     */
    public static String getPYString(String str) {
        StringBuilder result = new StringBuilder(str.length());
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            if (c >= 33 && c <= 126) {
                // Letters, digits and ASCII punctuation are kept as they are.
                result.append(c);
            } else {
                result.append(getPYChar(String.valueOf(c)));
            }
        }
        return result.toString();
    }

    /**
     * Returns the pinyin initial of a single Chinese character.
     *
     * @param c a string holding the character to convert; only its first two GBK bytes
     *     are examined
     * @return the lower-case initial, or {@code "*"} when the input is null, empty, not
     *     representable as a two-byte GBK code, or outside the GB2312 level-1 area
     */
    public static String getPYChar(String c) {
        if (c == null || c.isEmpty()) {
            return "*";
        }
        byte[] bytes = c.getBytes(GBK);
        if (bytes.length < 2) {
            // Single-byte result: a space, a control character, or an unmappable character.
            return "*";
        }
        int high = bytes[0] & 0xFF;
        int low = bytes[1] & 0xFF;
        if (low < 0xA1) {
            // GB2312 trail bytes start at 0xA1; a lower trail byte is a GBK-only
            // character, which is not sorted by pinyin.
            return "*";
        }
        int code = (high << 8) | low;
        for (int k = 0; k < UPPER_BOUNDS.length; k++) {
            if (code < UPPER_BOUNDS[k]) {
                return String.valueOf(LETTERS.charAt(k));
            }
        }
        // 0xD7FA and above: unused code points and level-2 characters.
        return "*";
    }

    public static void main(String[] g) {
        System.out.println(getPYString("中国，,;$#@&avc"));
    }
}

// 2.
摘自：http://etongg.javaeye.com/blog/342126 原理介绍 基础知识：GB 2312标准共收录6763个汉字，其中一级汉字3755个，二级汉字3008个。分区表示：GB 2312中对所收汉字进行了“分区”处理，每区含有94个汉字/符号。这种表示方式也称为区位码。 1）01-09区为特殊符号。 2）16-55区为一级汉字，按拼音排序。 3）56-87区为二级汉字，按部首/笔画排序。 4）10-15区及88-94区则未有编码。举例来说，“啊”字是GB2312之中的第一个汉字，它的区位码就是1601。字节结构：在使用GB2312的程序中，通常采用EUC储存方法，以便兼容于ASCII。浏览器编码表上的“GB2312”，通常都是指“EUC-CN”表示法。每个汉字及符号以两个字节来表示。第一个字节称为“高位字节”（也称“区字节”），第二个字节称为“低位字节”（也称“位字节”）。 “高位字节”使用了0xA1-0xF7(把01-87区的区号加上0xA0)，“低位字节”使用了0xA1-0xFE(把01-94加上 0xA0)。由于一级汉字从16区起始，汉字区的“高位字节”的范围是0xB0-0xF7，“低位字节”的范围是0xA1-0xFE，占用的码位是 72*94=6768。其中有5个空位是D7FA-D7FE。例如“啊”字在大多数程序中，会以两个字节，0xB0（第一个字节） 0xA1（第二个字节）储存。区位码=区字节+位字节（与区位码对比：0xB0=0xA0+16,0xA1=0xA0+1）。 GB2312编码表 16 ０１２３４５６７８９０　啊阿埃挨哎唉哀皑癌１蔼矮艾碍爱隘鞍氨安俺２按暗岸胺案肮昂盎凹敖３熬翱袄傲奥懊澳芭捌扒４叭吧笆八疤巴拔跋靶把５耙坝霸罢爸白柏百摆佰６败拜稗斑班搬扳般颁板７版扮拌伴瓣半办绊邦帮８梆榜膀绑棒磅蚌镑傍谤９苞胞包褒剥 17 ０１２３４５６７８９０　薄雹保堡饱宝抱报暴１豹鲍爆杯碑悲卑北辈背２贝钡倍狈备惫焙被奔苯３本笨崩绷甭泵蹦迸逼鼻４比鄙笔彼碧蓖蔽毕毙毖５币庇痹闭敝弊必辟壁臂６避陛鞭边编贬扁便变卞７辨辩辫遍标彪膘表鳖憋８别瘪彬斌濒滨宾摈兵冰９柄丙秉饼炳 18 ０１２３４５６７８９０　病并玻菠播拨钵波博１勃搏铂箔伯帛舶脖膊渤２泊驳捕卜哺补埠不布步３簿部怖擦猜裁材才财睬４踩采彩菜蔡餐参蚕残惭５惨灿苍舱仓沧藏操糙槽６曹草厕策侧册测层蹭插７叉茬茶查碴搽察岔差诧８拆柴豺搀掺蝉馋谗缠铲９产阐颤昌猖 根据上面的表述，我们可以把16-55区的一级汉字，取拼音首字母了。首先找出不同拼音首字母，且在码表中位置最靠前的汉字，计算出它们的编码值。（GB2312完整码表http://210.44.195.12/yyx/chinese/News/UploadFile/GB2312.htm）其他汉字在相同编码下，只需计算出其所在的区间位置。 1. 判断是否为英文字母 2. 如果是，直接返回英文字母 3. 取字符的编码值 4. 比较判断其编码值在码表中的位置。 5. 
// (Step 5, continued) Use that position to return the corresponding entry of the letter table.

/**
 * Maps characters to the upper-case first letter of their pinyin.
 *
 * <p>A character is encoded as GBK and its two-byte code is compared against the codes
 * of the boundary characters in {@code chartable}; the interval it falls into selects the
 * entry of {@link #firstLetter} that is returned.
 */
public class PinyinFirstLetter {

    /** Number of letters that can be returned (A to Z). */
    private static final int LETTER_COUNT = 26;

    // One boundary character per letter plus one closing entry: Z uses two entries (its
    // first character and the end of its range), which is why there are 27 values.
    // i, u and v never start a pinyin syllable, so each repeats the character of the
    // letter before it.
    private static char[] chartable = {
        '啊', '芭', '擦', '搭', '蛾', '发', '噶', '哈', '哈',
        '击', '喀', '垃', '妈', '拿', '哦', '啪', '期', '然',
        '撒', '塌', '塌', '塌', '挖', '昔', '压', '匝', '座'
    };

    /** Letter returned for each interval; index-aligned with the first 26 boundary characters. */
    public static final char[] firstLetter = {
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M',
        'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'
    };

    /** Two-byte codes of the boundary characters, filled in once by the static initialiser. */
    private static int[] table = new int[chartable.length];

    static {
        for (int i = 0; i < chartable.length; ++i) {
            table[i] = gbValue(chartable[i]);
        }
    }

    /**
     * Returns the first pinyin letter of a character.
     *
     * @param ch the character to classify
     * @return the upper-case form of an ASCII letter; the upper-case pinyin initial of a
     *     character whose code lies between the first and last boundary characters; and
     *     {@code '0'} for everything else
     */
    public static char char2Alpha(char ch) {
        if (ch >= 'a' && ch <= 'z') {
            return (char) (ch - 'a' + 'A');
        }
        if (ch >= 'A' && ch <= 'Z') {
            return ch;
        }
        int gb = gbValue(ch);
        if (gb < table[0]) {
            return '0';
        }
        for (int i = 0; i < LETTER_COUNT; ++i) {
            if (match(i, gb)) {
                return firstLetter[i];
            }
        }
        return '0';
    }

    /**
     * Converts every character of a string with {@link #char2Alpha(char)}.
     *
     * @param SourceStr the text to convert; may be null
     * @return one output character per input character, or an empty string for null input
     */
    public static String string2Alpha(String SourceStr) {
        if (SourceStr == null) {
            return "";
        }
        StringBuilder result = new StringBuilder(SourceStr.length());
        for (int i = 0; i < SourceStr.length(); i++) {
            result.append(char2Alpha(SourceStr.charAt(i)));
        }
        return result.toString();
    }

    /**
     * Returns the upper-case pinyin letter of the first character of a string.
     *
     * @param SourceStr the text to inspect; may be null or empty
     * @return the result of {@link #char2Alpha(char)} for the first character, or
     *     {@code "*"} when there is no first character
     */
    public static String getFirstWordFirstLetter(String SourceStr) {
        if (SourceStr == null || SourceStr.isEmpty()) {
            return "*";
        }
        return String.valueOf(char2Alpha(SourceStr.charAt(0)));
    }

    /**
     * Tells whether code {@code gb} belongs to the interval of letter {@code i}.
     *
     * <p>The interval starts at {@code table[i]} and ends just before the next entry that
     * holds a different code; repeated entries are skipped. The final interval (Z) is
     * closed by the extra 27th entry and includes it.
     */
    private static boolean match(int i, int gb) {
        if (gb < table[i]) {
            return false;
        }
        int next = i + 1;
        while (next < LETTER_COUNT && table[next] == table[i]) {
            ++next;
        }
        return next == LETTER_COUNT ? gb <= table[next] : gb < table[next];
    }

    /**
     * Returns the two-byte GB2312 code of a character.
     *
     * @return {@code (lead byte << 8) | trail byte}, or 0 when the character does not
     *     encode to two bytes, when its trail byte is below 0xA1 (a GBK-only character,
     *     which is not sorted by pinyin), or when the GBK charset is unavailable
     */
    private static int gbValue(char ch) {
        byte[] bytes;
        try {
            bytes = String.valueOf(ch).getBytes("GBK");
        } catch (java.io.UnsupportedEncodingException e) {
            // Best effort: with no GBK support every character is reported as unknown.
            return 0;
        }
        if (bytes.length < 2) {
            return 0;
        }
        int high = bytes[0] & 0xff;
        int low = bytes[1] & 0xff;
        if (low < 0xa1) {
            return 0;
        }
        return (high << 8) | low;
    }
}