汉字转拼音的类

这个类使用查表法获取汉字的汉语拼音。对多音字只取常用的读音

涵盖 GBK 字库的全部汉字

对照表采用 UCS-2 内码顺序排列,有地址连续的优点。拼音信息(声母、韵母、调号)被压缩在两个字节内

所以有体积小,检索速度快的特点

可自动识别传入字符串的字符集(仅区分 UTF-8 与 GBK:能通过 UTF-8 校验的按 UTF-8 处理,否则按 GBK 处理),返回结果使用与输入相同的字符集

/**
 * Converts Chinese characters to Hanyu Pinyin (with tone marks) by table lookup.
 *
 * The lookup table is a binary file indexed by UCS-2 code unit, with entry 0
 * corresponding to U+4E00. Every entry is a 16-bit integer whose five decimal
 * digits read "IIFFT": II is a position in the initials list, FF a position in
 * the finals list and T the tone number (1-4). An entry of 0 means "no reading".
 *
 * Input is treated as UTF-8 when it validates as UTF-8 and as GBK otherwise;
 * results are returned in that same encoding.
 */
class Tpinyin {
    /** @var array[] [0] => initials, [1] => finals; list positions are the codes stored in the dictionary. */
    protected $dict = array();
    /** @var string Path of the compiled binary dictionary. */
    protected $dictfile = 'py.dict';
    /** @var array[] vowel => list of its four tone-marked forms, encoded in the current charset. */
    protected $tune = array();
    /** @var string Encoding of the most recent input ('UTF-8' or 'GBK'). */
    private $charset = '';
    /** @var int Size of the dictionary file in bytes. */
    private $maxlength = 0;
    /** @var resource|null Read handle on the dictionary file. */
    private $fp;

    /**
     * Builds the initials and finals lists. The order of both lists is part of
     * the dictionary file format and must not be changed.
     */
    public function __construct() {
        $this->dict[] = explode(',', ',b,p,m,f,d,t,n,l,g,k,h,j,q,x,zh,ch,sh,r,z,c,s,y,e,w,a,o,e');
        $this->dict[] = explode(',', ',a,ai,an,ang,ao,e,ei,en,eng,ev,i,ia,ian,iang,iao,ie,in,ing,iong,iou,iu,o,ong,ou,r,u,ua,uai,uan,uang,ue,uei,uen,ueng,ui,un,uo,uong,v,van,vn,ve,n,ng');
    }

    /**
     * Compiles a text source into the binary dictionary file.
     *
     * Each source line is "<character>,<pinyin><tone digit>", e.g. "中,zhong1".
     * Lines that do not match that shape, or whose character lies below
     * U+4E00, are skipped.
     *
     * @param string $fn Path of the text source.
     * @throws RuntimeException When the source cannot be read or the dictionary cannot be written.
     */
    public function loaddict($fn='pylib.bmp') {
        // Open the source first so a missing source never truncates an existing dictionary.
        $source = is_readable($fn) ? fopen($fn, 'r') : false;
        if ($source === false) {
            throw new RuntimeException("Tpinyin: cannot read pinyin source file '{$fn}'");
        }
        $target = fopen($this->dictfile, 'wb');
        if ($target === false) {
            fclose($source);
            throw new RuntimeException("Tpinyin: cannot write dictionary file '{$this->dictfile}'");
        }

        // Compare against false explicitly: a line holding just "0" is falsy but valid input.
        while (($line = fgets($source)) !== false) {
            $fields = explode(',', $line);
            if (!isset($fields[1]) || !preg_match('/(.h?)([a-z]*)(\d)/', $fields[1], $m)) {
                continue;
            }
            // Unknown initials/finals make array_search() return false, which encodes as position 0.
            $code = ((int) array_search($m[1], $this->dict[0], true) * 100
                + (int) array_search($m[2], $this->dict[1], true)) * 10 + (int) $m[3];

            $from = mb_check_encoding($fields[0], 'UTF-8') ? 'UTF-8' : 'GBK';
            $ucs = mb_convert_encoding($fields[0], 'UCS-2', $from);
            if (strlen($ucs) !== 2) {
                continue; // not exactly one UCS-2 code unit
            }
            $index = hexdec(bin2hex($ucs)) - 0x4e00;
            if ($index < 0) {
                continue; // below the first table entry; seeking would fail
            }
            fseek($target, $index * 2);
            // 'S' (machine byte order) is kept so previously generated dictionaries stay readable.
            fwrite($target, pack('S', $code));
        }

        fclose($source);
        fclose($target);
        clearstatcache(true, $this->dictfile);
    }

    /**
     * Re-encodes the tone-marked vowel table (stored as UCS-2 byte pairs)
     * into the current charset.
     */
    public function settune() {
        $marks = array(
            'a' => array("\x01\x01","\x00\xe1","\x01\xce","\x00\xe0",),
            'e' => array("\x01\x13","\x00\xe9","\x01\x1b","\x00\xe8",),
            'i' => array("\x01\x2b","\x00\xed","\x01\xd0","\x00\xec",),
            'o' => array("\x01\x4d","\x00\xf3","\x01\xd2","\x00\xf2",),
            'u' => array("\x01\x6b","\x00\xfa","\x01\xd4","\x00\xf9",),
            'v' => array("\x01\xd6","\x01\xd8","\x01\xda","\x01\xdc",),
        );
        foreach ($marks as $vowel => $forms) {
            foreach ($forms as $i => $ucs) {
                $this->tune[$vowel][$i] = mb_convert_encoding($ucs, $this->charset, 'UCS-2');
            }
        }
    }

    /**
     * Converts a string to a list with one element per input character:
     * the pinyin for characters found in the dictionary, the character itself
     * otherwise.
     *
     * @param string $str UTF-8 or GBK text.
     * @return string[] Empty array for empty input.
     * @throws RuntimeException When the dictionary cannot be built or opened.
     */
    public function pinyin($str) {
        $str = (string) $str;
        $this->charset = mb_check_encoding($str, 'UTF-8') ? 'UTF-8' : 'GBK';
        $this->settune();
        if ($str === '') {
            return array();
        }

        if (!file_exists($this->dictfile)) {
            $this->loaddict();
        }
        if (is_resource($this->fp)) {
            fclose($this->fp); // release the handle left over from the previous call
        }
        $this->fp = fopen($this->dictfile, 'rb');
        if ($this->fp === false) {
            throw new RuntimeException("Tpinyin: cannot open dictionary file '{$this->dictfile}'");
        }
        // fstat() on the open handle is not affected by PHP's stat cache.
        $stat = fstat($this->fp);
        $this->maxlength = (int) $stat['size'];

        // Convert with the encoding detected above so input and output always agree.
        $ucs = mb_convert_encoding($str, 'UCS-2', $this->charset);
        return array_map(array($this, 'pinyin_back'), str_split($ucs, 2));
    }

    /**
     * Maps a single UCS-2 code unit to its pinyin. Used as the array_map
     * callback of pinyin(), which opens the dictionary and sets the charset.
     *
     * @param string $ch Two bytes, big-endian UCS-2.
     * @return string Pinyin with tone mark, or the character in the current
     *                charset when the dictionary has no reading for it.
     */
    public function pinyin_back($ch) {
        if (strlen($ch) < 2) {
            return $ch;
        }
        // Only ASCII can be emitted as the bare low byte; U+0080..U+00FF must be re-encoded.
        if ($ch[0] === "\x00" && ord($ch[1]) < 0x80) {
            return $ch[1];
        }

        $original = mb_convert_encoding($ch, $this->charset, 'UCS-2');
        $index = hexdec(bin2hex($ch)) - 0x4e00;
        // maxlength is in bytes and every entry takes two, so the whole entry must fit in the file.
        if ($index < 0 || $index * 2 + 2 > $this->maxlength) {
            return $original;
        }

        fseek($this->fp, $index * 2);
        $raw = fread($this->fp, 2);
        if ($raw === false || strlen($raw) !== 2) {
            return $original;
        }
        $code = current(unpack('S', $raw));
        if ($code === 0) {
            return $original; // hole in the sparse table: no reading stored
        }

        $digits = sprintf('%05d', $code);
        $initial = (int) substr($digits, 0, 2);
        $final = (int) substr($digits, 2, 2);
        if (!isset($this->dict[0][$initial], $this->dict[1][$final])) {
            return $original; // value outside the encoding scheme
        }
        $plain = $this->dict[0][$initial] . $this->dict[1][$final];

        // The first vowel present, in the order a, e, i, o, u, v, carries the tone mark.
        $tone = (int) substr($digits, -1) - 1;
        foreach ($this->tune as $vowel => $forms) {
            if (!isset($forms[$tone])) {
                break; // tone digit outside 1-4: leave the syllable unmarked
            }
            $marked = str_replace($vowel, $forms[$tone], $plain);
            if ($marked !== $plain) {
                return $marked;
            }
        }
        return $plain;
    }
}

示例

// Convert a mixed Chinese/ASCII string; pinyin() yields one element per
// input character, so the pieces are concatenated before printing.
$converter = new Tpinyin();
echo implode('', $converter->pinyin('啊!中国'));

将输出 ā!zhōngguó


评论 14
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值