汉字转拼音问题
这是一个非常经典的问题,同时,也是一个很难解决的问题。经典是因为,这个问题非常常见,而且有很多解决方案。
难以解决是因为存在多音字。可以发现很多股票软件都有这个问题,比如输入 zgyh 查不到中国银行,
这是因为中国银行的“行”查出来读的是 "xing"。这篇文章我只是提供了一个原始的转换方法,就是查表。
和你以前看过的算法可能不一样的是,我加了一个二分查找,我看到的一个开源的解决方案是顺序查找的。
关于多音字问题,我只有一个小规模的多音字表,是从所有股票名称里面整理纠正出来的。解决方案是这样的:在查找拼音的时候,
查找一下这个字是否在多音字表里面,如果是,那么查找前后组成的词,来决定读什么。如果有谁有完整的多音字表,或者有更好的
解决方案,请给我留言。我一定拜谢。
![](https://i-blog.csdnimg.cn/blog_migrate/8f900a89c6347c561fdf2122f13be562.gif)
![ExpandedBlockStart.gif](https://i-blog.csdnimg.cn/blog_migrate/961ddebeb323a10fe0623af514929fc1.gif)
<?php
/**
 * Load the pinyin lookup table from a text file.
 *
 * Each usable line holds a pinyin syllable followed by whitespace and a
 * (negative) integer code, e.g. "zhong<TAB>-10544". Lines that do not match
 * that shape (blank lines, stray text) are skipped instead of being turned
 * into bogus rows, because the lookup in pinyin_find_() relies on the rows
 * being well-formed and ordered by code.
 *
 * A sentinel row with an empty syllable is appended; its code is the
 * exclusive upper bound of the last syllable's range.
 *
 * @param string $file path of the pinyin/code table
 * @return array list of array(0 => syllable, 1 => int code), sentinel last
 * @throws Exception when the file cannot be read or contains no usable row
 */
function pinyin_init( $file )
{
    $lines = file($file);
    if (empty($lines)) {
        throw new Exception ( " load pinyin table error. " );
    }

    $table = array();
    foreach ($lines as $line) {
        // Accept any run of whitespace between the columns, and tolerate a
        // minus sign that has been separated from its digits.
        if (!preg_match('/^(\S+)\s+(-?)\s*(\d+)$/', trim($line), $m)) {
            continue;
        }
        $table[] = array($m[1], (int) ($m[2] . $m[3]));
    }
    if (empty($table)) {
        throw new Exception ( " load pinyin table error. " );
    }

    // The last valid code is -10247. The sentinel is compared with ">=" as an
    // exclusive upper bound, so it must be one past that: -10246.
    $table[] = array('', -10246);
    return $table;
}
/**
 * Look up the pinyin of the Chinese characters contained in a string.
 *
 * The string is converted to GB2312 when it is not already in that encoding,
 * then scanned byte by byte: whenever pinyin_getcode_() recognises a two-byte
 * character, its code is resolved with pinyin_find_() and both bytes are
 * consumed. Bytes that do not start a recognised character are skipped one at
 * a time.
 *
 * @param array  $pingyin   lookup table as returned by pinyin_init()
 * @param string $str       text to analyse
 * @param string $in_encode encoding of $str; case and surrounding whitespace
 *                          are ignored
 * @return array map of two-byte GB2312 character => pinyin syllable, in order
 *               of first appearance. Because the result is keyed by the
 *               character itself, a character that occurs several times
 *               appears only once.
 * @throws Exception when iconv() cannot convert $str to GB2312
 */
function pingyin_get( $pingyin , $str , $in_encode = ' gb2312 ' )
{
    // Normalise the encoding name so padded or mixed-case names still match.
    $encoding = trim($in_encode);
    if (strtolower($encoding) != 'gb2312') {
        $str = iconv($encoding, 'gb2312', $str);
        // Only a real conversion failure is an error; "" and "0" are valid.
        if ($str === false) {
            throw new Exception ( ' invalid input string. ' );
        }
    }

    $pinyin_arr = array();
    $last = strlen($str) - 1;
    // Stop before the final byte: a character always needs two bytes.
    for ($i = 0; $i < $last; $i++) {
        $code = pinyin_getcode_($str[$i], $str[$i + 1]);
        if ($code === false) {
            continue;
        }
        $i++; // consume the trail byte as well
        $py = pinyin_find_($pingyin, $code);
        if ($py !== false) {
            $pinyin_arr[substr($str, $i - 1, 2)] = $py;
        }
    }
    return $pinyin_arr;
}
/**
 * Internal helper: binary-search the table for the syllable whose code range
 * contains $code.
 *
 * Row k covers the half-open range [code of row k, code of row k + 1); the
 * last row only serves as the upper bound.
 *
 * @param array $pingyin rows of array(0 => syllable, 1 => code), ascending by code
 * @param int   $code    code to look up
 * @return string|false the matching syllable, or false when $code lies outside the table
 */
function pinyin_find_( $pingyin , $code )
{
    $last = count($pingyin) - 1;

    // Outside [first code, last code) there is nothing to find.
    if ($code < $pingyin[0][1] || $pingyin[$last][1] <= $code) {
        return false;
    }

    $lo = 0;
    $hi = $last;
    while ($lo <= $hi) {
        $probe = $lo + (($hi - $lo) >> 1);

        // Not expected to happen after the range check above; kept as a guard.
        if ($probe == $last) {
            return $pingyin[$probe - 1][0];
        }

        if ($code >= $pingyin[$probe + 1][1]) {
            // Target lies in a later row.
            $lo = $probe + 1;
            continue;
        }
        if ($code >= $pingyin[$probe][1]) {
            // Row $probe starts at or below $code and the next row starts above it.
            return $pingyin[$probe][0];
        }
        $hi = $probe - 1;
    }
    return false;
}
/**
 * Internal helper: turn the two bytes of a character into its lookup code.
 *
 * The code is the 16-bit value formed by both bytes, minus 65536, so it is
 * always negative for an accepted lead byte.
 *
 * @param string $ch1 first (lead) byte
 * @param string $ch2 second (trail) byte
 * @return int|false the lookup code, or false when the lead byte is not
 *                   strictly between 159 and 248
 */
function pinyin_getcode_( $ch1 , $ch2 )
{
    $lead = ord($ch1);
    if ($lead <= 159 || $lead >= 248) {
        return false;
    }
    return ($lead << 8) + ord($ch2) - 65536;
}
/**
 * Basic configuration and a small demo: load the table that sits next to this
 * script, look up a UTF-8 sample string and print the result.
 */
// The constant name must match the identifier used below.
if (!defined('CURRENT_PATH')) {
    define('CURRENT_PATH', dirname(__FILE__));
}
$pingin_file = CURRENT_PATH . '/gb-pinyin.table';
$pinyin = pinyin_init($pingin_file);
$py = pingyin_get($pinyin, " 中国银行 ", 'utf-8');
// The keys of $py are GB2312 bytes, so convert the dump back to UTF-8 for display.
echo iconv('gb2312', 'utf-8', print_r($py, true));
echo implode(" ' ", $py);
?>
/**
 * Load the pinyin lookup table from a text file.
 *
 * Each usable line holds a pinyin syllable followed by whitespace and a
 * (negative) integer code, e.g. "zhong<TAB>-10544". Lines that do not match
 * that shape (blank lines, stray text) are skipped instead of being turned
 * into bogus rows, because the lookup in pinyin_find_() relies on the rows
 * being well-formed and ordered by code.
 *
 * A sentinel row with an empty syllable is appended; its code is the
 * exclusive upper bound of the last syllable's range.
 *
 * @param string $file path of the pinyin/code table
 * @return array list of array(0 => syllable, 1 => int code), sentinel last
 * @throws Exception when the file cannot be read or contains no usable row
 */
function pinyin_init( $file )
{
    $lines = file($file);
    if (empty($lines)) {
        throw new Exception ( " load pinyin table error. " );
    }

    $table = array();
    foreach ($lines as $line) {
        // Accept any run of whitespace between the columns, and tolerate a
        // minus sign that has been separated from its digits.
        if (!preg_match('/^(\S+)\s+(-?)\s*(\d+)$/', trim($line), $m)) {
            continue;
        }
        $table[] = array($m[1], (int) ($m[2] . $m[3]));
    }
    if (empty($table)) {
        throw new Exception ( " load pinyin table error. " );
    }

    // The last valid code is -10247. The sentinel is compared with ">=" as an
    // exclusive upper bound, so it must be one past that: -10246.
    $table[] = array('', -10246);
    return $table;
}
/**
 * Look up the pinyin of the Chinese characters contained in a string.
 *
 * The string is converted to GB2312 when it is not already in that encoding,
 * then scanned byte by byte: whenever pinyin_getcode_() recognises a two-byte
 * character, its code is resolved with pinyin_find_() and both bytes are
 * consumed. Bytes that do not start a recognised character are skipped one at
 * a time.
 *
 * @param array  $pingyin   lookup table as returned by pinyin_init()
 * @param string $str       text to analyse
 * @param string $in_encode encoding of $str; case and surrounding whitespace
 *                          are ignored
 * @return array map of two-byte GB2312 character => pinyin syllable, in order
 *               of first appearance. Because the result is keyed by the
 *               character itself, a character that occurs several times
 *               appears only once.
 * @throws Exception when iconv() cannot convert $str to GB2312
 */
function pingyin_get( $pingyin , $str , $in_encode = ' gb2312 ' )
{
    // Normalise the encoding name so padded or mixed-case names still match.
    $encoding = trim($in_encode);
    if (strtolower($encoding) != 'gb2312') {
        $str = iconv($encoding, 'gb2312', $str);
        // Only a real conversion failure is an error; "" and "0" are valid.
        if ($str === false) {
            throw new Exception ( ' invalid input string. ' );
        }
    }

    $pinyin_arr = array();
    $last = strlen($str) - 1;
    // Stop before the final byte: a character always needs two bytes.
    for ($i = 0; $i < $last; $i++) {
        $code = pinyin_getcode_($str[$i], $str[$i + 1]);
        if ($code === false) {
            continue;
        }
        $i++; // consume the trail byte as well
        $py = pinyin_find_($pingyin, $code);
        if ($py !== false) {
            $pinyin_arr[substr($str, $i - 1, 2)] = $py;
        }
    }
    return $pinyin_arr;
}
/**
 * Internal helper: binary-search the table for the syllable whose code range
 * contains $code.
 *
 * Row k covers the half-open range [code of row k, code of row k + 1); the
 * last row only serves as the upper bound.
 *
 * @param array $pingyin rows of array(0 => syllable, 1 => code), ascending by code
 * @param int   $code    code to look up
 * @return string|false the matching syllable, or false when $code lies outside the table
 */
function pinyin_find_( $pingyin , $code )
{
    $last = count($pingyin) - 1;

    // Outside [first code, last code) there is nothing to find.
    if ($code < $pingyin[0][1] || $pingyin[$last][1] <= $code) {
        return false;
    }

    $lo = 0;
    $hi = $last;
    while ($lo <= $hi) {
        $probe = $lo + (($hi - $lo) >> 1);

        // Not expected to happen after the range check above; kept as a guard.
        if ($probe == $last) {
            return $pingyin[$probe - 1][0];
        }

        if ($code >= $pingyin[$probe + 1][1]) {
            // Target lies in a later row.
            $lo = $probe + 1;
            continue;
        }
        if ($code >= $pingyin[$probe][1]) {
            // Row $probe starts at or below $code and the next row starts above it.
            return $pingyin[$probe][0];
        }
        $hi = $probe - 1;
    }
    return false;
}
/**
 * Internal helper: turn the two bytes of a character into its lookup code.
 *
 * The code is the 16-bit value formed by both bytes, minus 65536, so it is
 * always negative for an accepted lead byte.
 *
 * @param string $ch1 first (lead) byte
 * @param string $ch2 second (trail) byte
 * @return int|false the lookup code, or false when the lead byte is not
 *                   strictly between 159 and 248
 */
function pinyin_getcode_( $ch1 , $ch2 )
{
    $lead = ord($ch1);
    if ($lead <= 159 || $lead >= 248) {
        return false;
    }
    return ($lead << 8) + ord($ch2) - 65536;
}
/**
 * Basic configuration and a small demo: load the table that sits next to this
 * script, look up a UTF-8 sample string and print the result.
 */
// The constant name must match the identifier used below.
if (!defined('CURRENT_PATH')) {
    define('CURRENT_PATH', dirname(__FILE__));
}
$pingin_file = CURRENT_PATH . '/gb-pinyin.table';
$pinyin = pinyin_init($pingin_file);
$py = pingyin_get($pinyin, " 中国银行 ", 'utf-8');
// The keys of $py are GB2312 bytes, so convert the dump back to UTF-8 for display.
echo iconv('gb2312', 'utf-8', print_r($py, true));
echo implode(" ' ", $py);
?>
拼音表如下:
![](https://i-blog.csdnimg.cn/blog_migrate/8f900a89c6347c561fdf2122f13be562.gif)
![ExpandedBlockStart.gif](https://i-blog.csdnimg.cn/blog_migrate/961ddebeb323a10fe0623af514929fc1.gif)
a - 20319
ai - 20317
an - 20304
ang - 20295
ao - 20292
ba - 20283
bai - 20265
ban - 20257
bang - 20242
bao - 20230
bei - 20051
ben - 20036
beng - 20032
bi - 20026
bian - 20002
biao - 19990
bie - 19986
bin - 19982
bing - 19976
bo - 19805
bu - 19784
ca - 19775
cai - 19774
can - 19763
cang - 19756
cao - 19751
ce - 19746
ceng - 19741
cha - 19739
chai - 19728
chan - 19725
chang - 19715
chao - 19540
che - 19531
chen - 19525
cheng - 19515
chi - 19500
chong - 19484
chou - 19479
chu - 19467
chuai - 19289
chuan - 19288
chuang - 19281
chui - 19275
chun - 19270
chuo - 19263
ci - 19261
cong - 19249
cou - 19243
cu - 19242
cuan - 19238
cui - 19235
cun - 19227
cuo - 19224
da - 19218
dai - 19212
dan - 19038
dang - 19023
dao - 19018
de - 19006
deng - 19003
di - 18996
dian - 18977
diao - 18961
die - 18952
ding - 18783
diu - 18774
dong - 18773
dou - 18763
du - 18756
duan - 18741
dui - 18735
dun - 18731
duo - 18722
e - 18710
en - 18697
er - 18696
fa - 18526
fan - 18518
fang - 18501
fei - 18490
fen - 18478
feng - 18463
fo - 18448
fou - 18447
fu - 18446
ga - 18239
gai - 18237
gan - 18231
gang - 18220
gao - 18211
ge - 18201
gei - 18184
gen - 18183
geng - 18181
gong - 18012
gou - 17997
gu - 17988
gua - 17970
guai - 17964
guan - 17961
guang - 17950
gui - 17947
gun - 17931
guo - 17928
ha - 17922
hai - 17759
han - 17752
hang - 17733
hao - 17730
he - 17721
hei - 17703
hen - 17701
heng - 17697
hong - 17692
hou - 17683
hu - 17676
hua - 17496
huai - 17487
huan - 17482
huang - 17468
hui - 17454
hun - 17433
huo - 17427
ji - 17417
jia - 17202
jian - 17185
jiang - 16983
jiao - 16970
jie - 16942
jin - 16915
jing - 16733
jiong - 16708
jiu - 16706
ju - 16689
juan - 16664
jue - 16657
jun - 16647
ka - 16474
kai - 16470
kan - 16465
kang - 16459
kao - 16452
ke - 16448
ken - 16433
keng - 16429
kong - 16427
kou - 16423
ku - 16419
kua - 16412
kuai - 16407
kuan - 16403
kuang - 16401
kui - 16393
kun - 16220
kuo - 16216
la - 16212
lai - 16205
lan - 16202
lang - 16187
lao - 16180
le - 16171
lei - 16169
leng - 16158
li - 16155
lia - 15959
lian - 15958
liang - 15944
liao - 15933
lie - 15920
lin - 15915
ling - 15903
liu - 15889
long - 15878
lou - 15707
lu - 15701
lv - 15681
luan - 15667
lue - 15661
lun - 15659
luo - 15652
ma - 15640
mai - 15631
man - 15625
mang - 15454
mao - 15448
me - 15436
mei - 15435
men - 15419
meng - 15416
mi - 15408
mian - 15394
miao - 15385
mie - 15377
min - 15375
ming - 15369
miu - 15363
mo - 15362
mou - 15183
mu - 15180
na - 15165
nai - 15158
nan - 15153
nang - 15150
nao - 15149
ne - 15144
nei - 15143
nen - 15141
neng - 15140
ni - 15139
nian - 15128
niang - 15121
niao - 15119
nie - 15117
nin - 15110
ning - 15109
niu - 14941
nong - 14937
nu - 14933
nv - 14930
nuan - 14929
nue - 14928
nuo - 14926
o - 14922
ou - 14921
pa - 14914
pai - 14908
pan - 14902
pang - 14894
pao - 14889
pei - 14882
pen - 14873
peng - 14871
pi - 14857
pian - 14678
piao - 14674
pie - 14670
pin - 14668
ping - 14663
po - 14654
pu - 14645
qi - 14630
qia - 14594
qian - 14429
qiang - 14407
qiao - 14399
qie - 14384
qin - 14379
qing - 14368
qiong - 14355
qiu - 14353
qu - 14345
quan - 14170
que - 14159
qun - 14151
ran - 14149
rang - 14145
rao - 14140
re - 14137
ren - 14135
reng - 14125
ri - 14123
rong - 14122
rou - 14112
ru - 14109
ruan - 14099
rui - 14097
run - 14094
ruo - 14092
sa - 14090
sai - 14087
san - 14083
sang - 13917
sao - 13914
se - 13910
sen - 13907
seng - 13906
sha - 13905
shai - 13896
shan - 13894
shang - 13878
shao - 13870
she - 13859
shen - 13847
sheng - 13831
shi - 13658
shou - 13611
shu - 13601
shua - 13406
shuai - 13404
shuan - 13400
shuang - 13398
shui - 13395
shun - 13391
shuo - 13387
si - 13383
song - 13367
sou - 13359
su - 13356
suan - 13343
sui - 13340
sun - 13329
suo - 13326
ta - 13318
tai - 13147
tan - 13138
tang - 13120
tao - 13107
te - 13096
teng - 13095
ti - 13091
tian - 13076
tiao - 13068
tie - 13063
ting - 13060
tong - 12888
tou - 12875
tu - 12871
tuan - 12860
tui - 12858
tun - 12852
tuo - 12849
wa - 12838
wai - 12831
wan - 12829
wang - 12812
wei - 12802
wen - 12607
weng - 12597
wo - 12594
wu - 12585
xi - 12556
xia - 12359
xian - 12346
xiang - 12320
xiao - 12300
xie - 12120
xin - 12099
xing - 12089
xiong - 12074
xiu - 12067
xu - 12058
xuan - 12039
xue - 11867
xun - 11861
ya - 11847
yan - 11831
yang - 11798
yao - 11781
ye - 11604
yi - 11589
yin - 11536
ying - 11358
yo - 11340
yong - 11339
you - 11324
yu - 11303
yuan - 11097
yue - 11077
yun - 11067
za - 11055
zai - 11052
zan - 11045
zang - 11041
zao - 11038
ze - 11024
zei - 11020
zen - 11019
zeng - 11018
zha - 11014
zhai - 10838
zhan - 10832
zhang - 10815
zhao - 10800
zhe - 10790
zhen - 10780
zheng - 10764
zhi - 10587
zhong - 10544
zhou - 10533
zhu - 10519
zhua - 10331
zhuai - 10329
zhuan - 10328
zhuang - 10322
zhui - 10315
zhun - 10309
zhuo - 10307
zi - 10296
zong - 10281
zou - 10274
zu - 10270
zuan - 10262
zui - 10260
zun - 10256
zuo - 10254
ai - 20317
an - 20304
ang - 20295
ao - 20292
ba - 20283
bai - 20265
ban - 20257
bang - 20242
bao - 20230
bei - 20051
ben - 20036
beng - 20032
bi - 20026
bian - 20002
biao - 19990
bie - 19986
bin - 19982
bing - 19976
bo - 19805
bu - 19784
ca - 19775
cai - 19774
can - 19763
cang - 19756
cao - 19751
ce - 19746
ceng - 19741
cha - 19739
chai - 19728
chan - 19725
chang - 19715
chao - 19540
che - 19531
chen - 19525
cheng - 19515
chi - 19500
chong - 19484
chou - 19479
chu - 19467
chuai - 19289
chuan - 19288
chuang - 19281
chui - 19275
chun - 19270
chuo - 19263
ci - 19261
cong - 19249
cou - 19243
cu - 19242
cuan - 19238
cui - 19235
cun - 19227
cuo - 19224
da - 19218
dai - 19212
dan - 19038
dang - 19023
dao - 19018
de - 19006
deng - 19003
di - 18996
dian - 18977
diao - 18961
die - 18952
ding - 18783
diu - 18774
dong - 18773
dou - 18763
du - 18756
duan - 18741
dui - 18735
dun - 18731
duo - 18722
e - 18710
en - 18697
er - 18696
fa - 18526
fan - 18518
fang - 18501
fei - 18490
fen - 18478
feng - 18463
fo - 18448
fou - 18447
fu - 18446
ga - 18239
gai - 18237
gan - 18231
gang - 18220
gao - 18211
ge - 18201
gei - 18184
gen - 18183
geng - 18181
gong - 18012
gou - 17997
gu - 17988
gua - 17970
guai - 17964
guan - 17961
guang - 17950
gui - 17947
gun - 17931
guo - 17928
ha - 17922
hai - 17759
han - 17752
hang - 17733
hao - 17730
he - 17721
hei - 17703
hen - 17701
heng - 17697
hong - 17692
hou - 17683
hu - 17676
hua - 17496
huai - 17487
huan - 17482
huang - 17468
hui - 17454
hun - 17433
huo - 17427
ji - 17417
jia - 17202
jian - 17185
jiang - 16983
jiao - 16970
jie - 16942
jin - 16915
jing - 16733
jiong - 16708
jiu - 16706
ju - 16689
juan - 16664
jue - 16657
jun - 16647
ka - 16474
kai - 16470
kan - 16465
kang - 16459
kao - 16452
ke - 16448
ken - 16433
keng - 16429
kong - 16427
kou - 16423
ku - 16419
kua - 16412
kuai - 16407
kuan - 16403
kuang - 16401
kui - 16393
kun - 16220
kuo - 16216
la - 16212
lai - 16205
lan - 16202
lang - 16187
lao - 16180
le - 16171
lei - 16169
leng - 16158
li - 16155
lia - 15959
lian - 15958
liang - 15944
liao - 15933
lie - 15920
lin - 15915
ling - 15903
liu - 15889
long - 15878
lou - 15707
lu - 15701
lv - 15681
luan - 15667
lue - 15661
lun - 15659
luo - 15652
ma - 15640
mai - 15631
man - 15625
mang - 15454
mao - 15448
me - 15436
mei - 15435
men - 15419
meng - 15416
mi - 15408
mian - 15394
miao - 15385
mie - 15377
min - 15375
ming - 15369
miu - 15363
mo - 15362
mou - 15183
mu - 15180
na - 15165
nai - 15158
nan - 15153
nang - 15150
nao - 15149
ne - 15144
nei - 15143
nen - 15141
neng - 15140
ni - 15139
nian - 15128
niang - 15121
niao - 15119
nie - 15117
nin - 15110
ning - 15109
niu - 14941
nong - 14937
nu - 14933
nv - 14930
nuan - 14929
nue - 14928
nuo - 14926
o - 14922
ou - 14921
pa - 14914
pai - 14908
pan - 14902
pang - 14894
pao - 14889
pei - 14882
pen - 14873
peng - 14871
pi - 14857
pian - 14678
piao - 14674
pie - 14670
pin - 14668
ping - 14663
po - 14654
pu - 14645
qi - 14630
qia - 14594
qian - 14429
qiang - 14407
qiao - 14399
qie - 14384
qin - 14379
qing - 14368
qiong - 14355
qiu - 14353
qu - 14345
quan - 14170
que - 14159
qun - 14151
ran - 14149
rang - 14145
rao - 14140
re - 14137
ren - 14135
reng - 14125
ri - 14123
rong - 14122
rou - 14112
ru - 14109
ruan - 14099
rui - 14097
run - 14094
ruo - 14092
sa - 14090
sai - 14087
san - 14083
sang - 13917
sao - 13914
se - 13910
sen - 13907
seng - 13906
sha - 13905
shai - 13896
shan - 13894
shang - 13878
shao - 13870
she - 13859
shen - 13847
sheng - 13831
shi - 13658
shou - 13611
shu - 13601
shua - 13406
shuai - 13404
shuan - 13400
shuang - 13398
shui - 13395
shun - 13391
shuo - 13387
si - 13383
song - 13367
sou - 13359
su - 13356
suan - 13343
sui - 13340
sun - 13329
suo - 13326
ta - 13318
tai - 13147
tan - 13138
tang - 13120
tao - 13107
te - 13096
teng - 13095
ti - 13091
tian - 13076
tiao - 13068
tie - 13063
ting - 13060
tong - 12888
tou - 12875
tu - 12871
tuan - 12860
tui - 12858
tun - 12852
tuo - 12849
wa - 12838
wai - 12831
wan - 12829
wang - 12812
wei - 12802
wen - 12607
weng - 12597
wo - 12594
wu - 12585
xi - 12556
xia - 12359
xian - 12346
xiang - 12320
xiao - 12300
xie - 12120
xin - 12099
xing - 12089
xiong - 12074
xiu - 12067
xu - 12058
xuan - 12039
xue - 11867
xun - 11861
ya - 11847
yan - 11831
yang - 11798
yao - 11781
ye - 11604
yi - 11589
yin - 11536
ying - 11358
yo - 11340
yong - 11339
you - 11324
yu - 11303
yuan - 11097
yue - 11077
yun - 11067
za - 11055
zai - 11052
zan - 11045
zang - 11041
zao - 11038
ze - 11024
zei - 11020
zen - 11019
zeng - 11018
zha - 11014
zhai - 10838
zhan - 10832
zhang - 10815
zhao - 10800
zhe - 10790
zhen - 10780
zheng - 10764
zhi - 10587
zhong - 10544
zhou - 10533
zhu - 10519
zhua - 10331
zhuai - 10329
zhuan - 10328
zhuang - 10322
zhui - 10315
zhun - 10309
zhuo - 10307
zi - 10296
zong - 10281
zou - 10274
zu - 10270
zuan - 10262
zui - 10260
zun - 10256
zuo - 10254