这个类使用查表法获取汉字的汉语拼音,对多音字只取常用的读音。
涵盖 GBK 字库的全部汉字。
对照表按 UCS-2 内码顺序排列,具有地址连续的优点;拼音信息(声母、韵母、调号)被压缩在两个字节内,
因此体积小、检索速度快。
可自动识别传入字符串的字符集(UTF-8 或 GBK)。
/**
 * Table-driven converter from Chinese characters to Hanyu Pinyin with tone marks.
 *
 * The binary dictionary file holds one 16-bit entry per code point, starting at
 * U+4E00 and ordered by UCS-2 code point, so the entry for a character lives at
 * byte offset (codepoint - 0x4E00) * 2. Each entry is the decimal number
 * (initialIndex * 100 + finalIndex) * 10 + tone, where the indexes refer to the
 * two lists built in the constructor. An entry of 0 means "no reading stored".
 *
 * The input charset (UTF-8 or GBK) is detected per call and the output is
 * produced in the same charset.
 */
class Tpinyin {
    /** @var array [0] => list of initials, [1] => list of finals; index 0 of each is ''. */
    protected $dict = array();
    /** @var string Path of the binary lookup table. */
    protected $dictfile = 'py.dict';
    /** @var array vowel => list of its four tone-marked forms, encoded in the current charset. */
    protected $tune = array();
    /** @var string Charset detected for the current input ('UTF-8' or 'GBK'). */
    private $charset = '';
    /** @var int Size of the opened dictionary file in bytes. */
    private $maxlength = 0;
    /** @var resource|null Read handle on the dictionary file, opened lazily and reused. */
    private $fp;

    /**
     * Builds the initial/final lookup lists used to encode and decode entries.
     */
    public function __construct() {
        // 'a', 'o' and 'e' are listed as initials so that zero-initial syllables
        // ("an", "ou", "er", ...) can be split as <first letter> + <rest>.
        $this->dict[] = explode(',', ',b,p,m,f,d,t,n,l,g,k,h,j,q,x,zh,ch,sh,r,z,c,s,y,e,w,a,o,e');
        $this->dict[] = explode(',', ',a,ai,an,ang,ao,e,ei,en,eng,ev,i,ia,ian,iang,iao,ie,in,ing,iong,iou,iu,o,ong,ou,r,u,ua,uai,uan,uang,ue,uei,uen,ueng,ui,un,uo,uong,v,van,vn,ve,n,ng');
    }

    /**
     * Releases the dictionary file handle.
     */
    public function __destruct() {
        $this->closedict();
    }

    /**
     * Compiles a text source ("<character>,<pinyin><tone digit>" per line) into
     * the binary dictionary file.
     *
     * The existing dictionary is only replaced once the source has been opened
     * successfully. Lines that cannot be parsed, or whose character lies below
     * U+4E00, are skipped.
     *
     * @param string $fn Path of the text source file.
     * @return bool True when the dictionary was written, false when the source
     *              or the dictionary file could not be opened.
     */
    public function loaddict($fn='pylib.bmp') {
        $src = (is_file($fn) && is_readable($fn)) ? fopen($fn, 'rb') : false;
        if ($src === false) {
            return false;
        }
        // Any cached read handle would refer to the file that is about to be rewritten.
        $this->closedict();
        $out = fopen($this->dictfile, 'wb');
        if ($out === false) {
            fclose($src);
            return false;
        }
        while (($line = fgets($src)) !== false) {
            $ar = explode(',', $line);
            if (count($ar) < 2 || !preg_match('/(.h?)([a-z]*)(\d)/', $ar[1], $r)) {
                continue;
            }
            // Unknown initials/finals fall back to index 0 (the empty string).
            $initial = (int) array_search($r[1], $this->dict[0], true);
            $final = (int) array_search($r[2], $this->dict[1], true);
            $n = ($initial * 100 + $final) * 10 + (int) $r[3];

            $char = trim($ar[0]);
            $from = mb_check_encoding($char, 'UTF-8') ? 'UTF-8' : 'GBK';
            $os = (string) mb_convert_encoding($char, 'UCS-2BE', $from);
            if (strncmp($os, "\xFE\xFF", 2) === 0) {
                // Drop a byte-order mark that may precede the first entry of the file.
                $os = (string) substr($os, 2);
            }
            if (strlen($os) !== 2) {
                continue;
            }
            $code = unpack('n', $os);
            $o = $code[1] - 0x4e00;
            if ($o < 0) {
                continue;
            }
            fseek($out, $o * 2);
            // 'S' (machine byte order) is kept so existing dictionary files stay readable.
            fwrite($out, pack('S', $n));
        }
        fclose($src);
        fclose($out);
        return true;
    }

    /**
     * Rebuilds the tone-mark table in the charset of the current input.
     *
     * The table maps each plain vowel ('v' stands for u-umlaut) to its four
     * tone-marked forms, given here as big-endian UCS-2 code units.
     */
    public function settune() {
        if ($this->charset === '') {
            $this->charset = 'UTF-8';
        }
        $tune = array(
            'a' => array("\x01\x01", "\x00\xe1", "\x01\xce", "\x00\xe0"),
            'e' => array("\x01\x13", "\x00\xe9", "\x01\x1b", "\x00\xe8"),
            'i' => array("\x01\x2b", "\x00\xed", "\x01\xd0", "\x00\xec"),
            'o' => array("\x01\x4d", "\x00\xf3", "\x01\xd2", "\x00\xf2"),
            'u' => array("\x01\x6b", "\x00\xfa", "\x01\xd4", "\x00\xf9"),
            'v' => array("\x01\xd6", "\x01\xd8", "\x01\xda", "\x01\xdc"),
        );
        $this->tune = array();
        foreach ($tune as $k => $r) {
            foreach ($r as $i => $v) {
                $this->tune[$k][$i] = mb_convert_encoding($v, $this->charset, 'UCS-2BE');
            }
        }
    }

    /**
     * Converts a string to a list with one element per input character.
     *
     * Characters found in the dictionary become their tone-marked pinyin
     * syllable; every other character is returned unchanged. The dictionary is
     * compiled with loaddict() on first use when the dictionary file is missing.
     * When no dictionary can be opened, all characters pass through unchanged.
     *
     * @param string $str Text encoded as UTF-8 or GBK.
     * @return array List of strings in the same charset as the input.
     */
    public function pinyin($str) {
        $str = (string) $str;
        $this->charset = mb_check_encoding($str, 'UTF-8') ? 'UTF-8' : 'GBK';
        $this->settune();
        $this->opendict();
        if ($str === '') {
            return array();
        }
        // Decode with the charset detected above so detection and decoding always agree.
        $ucs = (string) mb_convert_encoding($str, 'UCS-2BE', $this->charset);
        if ($ucs === '') {
            return array();
        }
        return array_map(array($this, 'pinyin_back'), str_split($ucs, 2));
    }

    /**
     * Maps one big-endian UCS-2 code unit to its output string.
     *
     * @param string $ch Two-byte UCS-2BE code unit.
     * @return string The pinyin syllable, or the character itself when it has
     *                no dictionary entry.
     */
    public function pinyin_back($ch) {
        $ch = (string) $ch;
        if (strlen($ch) < 2) {
            return $ch;
        }
        $code = (ord($ch[0]) << 8) | ord($ch[1]);
        if ($code < 0x80) {
            // Plain ASCII is a single identical byte in both UTF-8 and GBK.
            return $ch[1];
        }
        $o = $code - 0x4e00;
        // maxlength is in bytes while $o counts 2-byte entries.
        if ($o < 0 || !is_resource($this->fp) || $o * 2 + 2 > $this->maxlength) {
            return $this->passthrough($ch);
        }
        fseek($this->fp, $o * 2);
        $raw = fread($this->fp, 2);
        if ($raw === false || strlen($raw) < 2) {
            return $this->passthrough($ch);
        }
        $value = unpack('S', $raw);
        if ($value[1] === 0) {
            // Hole in the table: no reading stored for this character.
            return $this->passthrough($ch);
        }
        $x = sprintf('%05d', $value[1]);
        $initial = (int) substr($x, 0, 2);
        $final = (int) substr($x, 2, 2);
        $t = (isset($this->dict[0][$initial]) ? $this->dict[0][$initial] : '')
           . (isset($this->dict[1][$final]) ? $this->dict[1][$final] : '');
        $n = (int) substr($x, -1) - 1;
        // The first vowel (in a, e, i, o, u, v order) present in the syllable carries
        // the tone mark; tones outside 1-4 leave the syllable unmarked.
        foreach ($this->tune as $k => $v) {
            if (isset($v[$n]) && strpos($t, (string) $k) !== false) {
                return str_replace($k, $v[$n], $t);
            }
        }
        return $t;
    }

    /**
     * Opens the dictionary for reading once and records its size in bytes.
     *
     * @return bool True when a readable dictionary handle is available.
     */
    private function opendict() {
        if (is_resource($this->fp)) {
            return true;
        }
        $this->fp = null;
        $this->maxlength = 0;
        if (!file_exists($this->dictfile) && !$this->loaddict()) {
            return false;
        }
        if (!is_readable($this->dictfile)) {
            return false;
        }
        $fp = fopen($this->dictfile, 'rb');
        if ($fp === false) {
            return false;
        }
        // fstat on the open handle avoids a stale stat-cache size after a rebuild.
        $stat = fstat($fp);
        $this->fp = $fp;
        $this->maxlength = (int) $stat['size'];
        return true;
    }

    /**
     * Closes the cached dictionary handle, if any.
     */
    private function closedict() {
        if (is_resource($this->fp)) {
            fclose($this->fp);
        }
        $this->fp = null;
        $this->maxlength = 0;
    }

    /**
     * Returns a UCS-2BE code unit re-encoded in the current output charset.
     *
     * @param string $ch Two-byte UCS-2BE code unit.
     * @return string
     */
    private function passthrough($ch) {
        $target = $this->charset !== '' ? $this->charset : 'UTF-8';
        return mb_convert_encoding($ch, $target, 'UCS-2BE');
    }
}
示例
// Example: convert a mixed string and print the resulting syllables back to back.
$p = new Tpinyin();
$t = $p->pinyin('啊!中国');
echo implode('', $t);
将输出 ā!zhōngguó