php gbk 拼音,PHP获取可以用GBK编码的汉字拼音首字母

跳至'gbk','out'=>'utf-8');

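// Fallback handling: the value below is used for any character that has no pinyin initial; set it to false to get the original character back instead.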

protected $other = '!';//other=false;

/**
 * Store the cache flag and load the lookup resources from DATA_PATH.
 *
 * @param bool $flag Stored on the instance; when it is not true, the public
 *                   string/array entry points reset the per-character cache
 *                   at the start of every call.
 */
public function __construct($flag=true){
    $this->flag = $flag;

    // Character tables, read whole as raw text and keyed by table name.
    $tables = array(
        'gk221' => 'word/gk2-2-1.txt',
        'gk31'  => 'word/gk3-1.txt',
        'gk41'  => 'word/gk4-1.txt',
    );
    foreach ($tables as $name => $file) {
        $this->source[$name] = file_get_contents(DATA_PATH.$file);
    }

    // Position index, decoded from JSON into associative arrays.
    $positions = file_get_contents(DATA_PATH.'word/pos.txt');
    $this->pos = json_decode($positions, true);
}

/**
 * Get the pinyin initials for a string or for an array of single characters.
 *
 * @param string|array $data Input text, either a string or an array
 * @param string       $in   Encoding of the input
 * @param string       $out  Encoding of the output
 * @return array|null Result of the matching handler; null when $data is
 *                    neither a string nor an array
 */
public function getInitial($data,$in='gbk',$out='utf-8'){
    // Strings are handed to the string handler untouched.
    if (is_string($data)) {
        return self::getInitialByStr($data, $in, $out);
    }

    // Arrays are handed to the array handler untouched.
    if (is_array($data)) {
        return self::getInitialByArr($data, $in, $out);
    }

    // Any other type is not handled.
    return null;
}

/**
 * Get the pinyin initials for every character of a string.
 *
 * @param string $str String data
 * @param string $in  Encoding of the input ('gbk' or 'utf-8', case-insensitive)
 * @param string $out Encoding of the output
 * @return array|null One entry per character; null when $in is not a supported encoding
 */
public function getInitialByStr($str,$in='gbk',$out='utf-8'){
    $this->charset['in'] = strtolower($in);
    $this->charset['out'] = strtolower($out);

    // With caching disabled, start every call from an empty per-character cache.
    if($this->flag!=true){
        $this->temp['fws'] = array();
    }

    // Collect the result instead of returning from inside the switch, so the
    // cleanup below is actually reached.
    switch ($this->charset['in']) {
        case 'gbk':
            $result = self::_getInitialInGBK($str);
            break;
        case 'utf-8':
            $result = self::_getInitialInUTF8($str);
            break;
        default:
            // Unsupported input encoding: nothing is produced.
            $result = null;
            break;
    }

    // Drop the per-call cache when caching is disabled.
    if($this->flag!=true){
        unset($this->temp['fws']);
    }

    return $result;
}

/**
 * Get the pinyin initials for an array of single characters.
 *
 * @param array  $arr Array data, one character per element
 * @param string $in  Encoding of the input ('gbk' or 'utf-8', case-insensitive)
 * @param string $out Encoding of the output
 * @return array|null One entry per element; null when $in is not a supported encoding
 */
public function getInitialByArr($arr,$in='gbk',$out='utf-8'){
    $this->charset['in'] = strtolower($in);
    $this->charset['out'] = strtolower($out);

    // With caching disabled, start every call from an empty per-character cache.
    if($this->flag!=true){
        $this->temp['fws'] = array();
    }

    // Collect the result instead of returning from inside the switch, so the
    // cleanup below is actually reached.
    switch ($this->charset['in']) {
        case 'gbk':
            $result = self::_getInitialInGBKArr($arr);
            break;
        case 'utf-8':
            $result = self::_getInitialInUTF8Arr($arr);
            break;
        default:
            // Unsupported input encoding: nothing is produced.
            $result = null;
            break;
    }

    // Drop the per-call cache when caching is disabled.
    if($this->flag!=true){
        unset($this->temp['fws']);
    }

    return $result;
}

/**
 * Split a GBK-encoded string into characters and collect each one's initial.
 *
 * NOTE(review): the loop condition and the lead-byte test were lost in the
 * extracted source; they are reconstructed here from the surviving
 * ">= 0x81" comparison and the 2-byte / 1-byte branches — confirm against
 * the original file.
 *
 * @param string $str GBK-encoded string
 * @return array One entry per character
 */
protected function _getInitialInGBK($str){
    // Collected initials
    $w = array();
    $i = 0;
    $str_length = strlen($str); // length in bytes

    while ($i < $str_length) {
        if (ord($str[$i]) >= 0x81) { // GBK lead byte: the character is two bytes long
            $nstr = substr($str, $i, 2);
            $i = $i + 2;
        } else { // single-byte character
            $nstr = substr($str, $i, 1);
            $i = $i + 1;
        }

        // Keep the UTF-8 form of the current character on the instance.
        $this->word = iconv('gbk','utf-8',$nstr);

        // Reuse a cached result when one exists for this GBK byte sequence.
        if(isset($this->temp['fws'][$nstr])){
            $w[] = $this->temp['fws'][$nstr];
        }else{
            $w[] = self::_preGetInitial($nstr);
        }
    }

    return $w;
}

/**
 * Collect the initial of every element of an array of GBK-encoded characters.
 *
 * @param array $arr Array of single characters, GBK-encoded
 * @return array One entry per element, in iteration order
 */
protected function _getInitialInGBKArr($arr){
    $initials = array();

    foreach ($arr as $char) {
        // Keep the UTF-8 form of the current character on the instance.
        $this->word = iconv('gbk','utf-8',$char);

        // Cached value if present, otherwise compute (and cache) it.
        $initials[] = isset($this->temp['fws'][$char])
            ? $this->temp['fws'][$char]
            : self::_preGetInitial($char);
    }

    return $initials;
}

/**
 * Split a UTF-8-encoded string into characters and collect each one's initial.
 *
 * NOTE(review): the loop condition and the assignment of $ascnum were lost in
 * the extracted source; they are reconstructed here from the surviving
 * ">= 252" comparison and the later uses of $ascnum — confirm against the
 * original file.
 *
 * @param string $str UTF-8-encoded string
 * @return array One entry per character
 */
protected function _getInitialInUTF8($str){
    // Collected initials
    $w = array();
    $nstr = '';
    $i = 0;
    $str_length = strlen($str); // length in bytes

    while ($i < $str_length) {
        // Value of the first byte decides how many bytes the character spans.
        $ascnum = ord($str[$i]);

        if ($ascnum >= 252) {          // first byte >= 252: 6-byte sequence
            $nstr = substr($str, $i, 6);
            $i = $i + 6;
        } elseif ($ascnum >= 248) {    // first byte >= 248: 5-byte sequence
            $nstr = substr($str, $i, 5);
            $i = $i + 5;
        } elseif ($ascnum >= 240) {    // first byte >= 240: 4-byte sequence
            $nstr = substr($str, $i, 4);
            $i = $i + 4;
        } elseif ($ascnum >= 224) {    // first byte >= 224: 3-byte sequence
            $nstr = substr($str, $i, 3);
            $i = $i + 3;
        } elseif ($ascnum >= 192) {    // first byte >= 192: 2-byte sequence
            $nstr = substr($str, $i, 2);
            $i = $i + 2;
        } else {                       // everything else is taken as one byte
            $nstr = substr($str, $i, 1);
            $i = $i + 1;
        }

        // Keep the UTF-8 form of the current character on the instance.
        $this->word = $nstr;

        // The cache and the lookup both work on the GBK form.
        $nstr = iconv('utf-8','gbk',$nstr);

        if(isset($this->temp['fws'][$nstr])){
            $w[] = $this->temp['fws'][$nstr];
        }else{
            $w[] = self::_preGetInitial($nstr);
        }
    }

    return $w;
}

/**
 * Collect the initial of every element of an array of UTF-8-encoded characters.
 *
 * @param array $arr Array of single characters, UTF-8-encoded
 * @return array One entry per element, in iteration order
 */
protected function _getInitialInUTF8Arr($arr){
    $initials = array();

    foreach ($arr as $char) {
        // Keep the UTF-8 form of the current character on the instance.
        $this->word = $char;

        // The cache and the lookup both work on the GBK form.
        $gbkChar = iconv('utf-8','GBK',$char);

        // Cached value if present, otherwise compute (and cache) it.
        $initials[] = isset($this->temp['fws'][$gbkChar])
            ? $this->temp['fws'][$gbkChar]
            : self::_preGetInitial($gbkChar);
    }

    return $initials;
}

/**
 * Resolve one character to its output value and remember it in the cache.
 *
 * @param string $word Single character, GBK-encoded
 * @return string The initial, or the character itself when no initial is
 *                available, converted to $this->charset['out']
 */
protected function _preGetInitial($word){
    // The core lookup yields UTF-8 text, or false when there is no initial.
    $initial = self::_getInitial($word);

    if ($initial === false) {
        // No initial: hand back the original character in the output encoding.
        $converted = iconv('gbk',$this->charset['out'],$word);
    } else {
        $converted = iconv('utf-8',$this->charset['out'],$initial);
    }

    // Cache under the GBK form of the character.
    $this->temp['fws'][$word] = $converted;

    return $converted;
}

/**
 * Core lookup: map one GBK-encoded character to its pinyin initial.
 *
 * NOTE(review): the extracted source lost parts of every range condition, the
 * GBK/3 and GBK/4 branches and the final return. The upper bounds, the
 * trail-byte limits on GBK/3 and GBK/4, the table names 'gk31' / 'gk41' and
 * the fallback below are reconstructed from the standard GBK layout, the
 * character counts in the comments and the tables loaded by the constructor
 * — confirm against the original file.
 *
 * @param string $word Single character, GBK-encoded
 * @return string|false The initial (UTF-8), or $this->other when the
 *                      character is outside every supported range
 */
protected function _getInitial($word){
    // Anything shorter than two bytes cannot be a GBK hanzi; this also avoids
    // reading an undefined string offset below.
    if (!is_string($word) || strlen($word) < 2) {
        return $this->other;
    }

    $high = ord($word[0]);
    $low = ord($word[1]);

    // Two-byte code as one integer: lead byte in the high 8 bits.
    $hexc = $high * 256 + $low;

    // GBK/2, GB2312 level-1 hanzi (pinyin order), 3755 characters.
    // NOTE(review): the surviving source tests the low byte with ">= 0xA0";
    // GB2312 trail bytes start at 0xA1 — confirm.
    if($hexc >= 0xB0A1 and $hexc <= 0xD7F9 and $low >= 0xA0){
        return self::_getInGBK21($hexc);
    }

    // GBK/2, GB2312 level-2 hanzi, 3008 characters.
    if($hexc >= 0xD8A1 and $hexc <= 0xF7FE and $low >= 0xA0){
        return self::_getInGBK('gk221');
    }

    // GBK/3, extension hanzi (ordered by UCS code).
    if($hexc >= 0x8140 and $hexc <= 0xA0FE and $low >= 0x40){
        return self::_getInGBK('gk31');
    }

    // GBK/4, extension hanzi; trail bytes run from 0x40 to 0xA0.
    if($hexc >= 0xAA40 and $hexc <= 0xFEA0 and $low >= 0x40 and $low <= 0xA0){
        return self::_getInGBK('gk41');
    }

    // Not a supported hanzi: use the configured fallback.
    return $this->other;
}

/**
 * Get the initial of a GB2312 level-1 hanzi (GBK/2, pinyin order, 3755 characters).
 *
 * The letter is found by counting how many range-start codes are less than or
 * equal to the given code. Earlier versions used the truthiness of
 * array_search(), which treated key 0 as "not found" and mapped every code
 * that is exactly a range start to the previous letter (0xB0A1 gave "" and
 * 0xB0C5 gave "A").
 *
 * @param int $hexc GBK code of the character (lead byte * 256 + trail byte)
 * @return string The initial, or "" when $hexc is below the first range
 */
protected function _getInGBK21($hexc){
    // No pinyin starts with I, U or V, so those letters are absent.
    $char = array("", // index 0: code lies before the first range
        "A","B","C","D","E","F",
        "G","H","J","K","L","M",
        "N","O","P","Q","R","S",
        "T","W","X","Y","Z"
    );

    // First code of each letter's range, ascending; entry k starts $char[k + 1].
    $hcs = array(
        0xB0A1,0xb0c5,0xb2c1,0xb4ee,0xb6ea,0xb7a2,
        0xb8c1,0xb9fe,0xbbf7,0xbfa6,0xc0ac,0xc2e8,
        0xc4c3,0xc5b6,0xc5be,0xc6da,0xc8bb,0xc8f6,
        0xcbfa,0xcdda,0xcef4,0xd1b9,0xd4d1
    );

    // Count the ranges that start at or before $hexc.
    $index = 0;
    foreach ($hcs as $start) {
        if ($hexc < $start) {
            break;
        }
        $index++;
    }

    return $char[$index];
}

/**
 * Get the initial of the current character ($this->word) from a character table.
 *
 * The character's byte offset in the table text is compared with the position
 * list stored for that table to pick the letter.
 *
 * @param string $type Name of the table / GBK area the character belongs to
 * @return string|false The initial (UTF-8), or $this->other when the
 *                      character is not present in the table
 */
protected function _getInGBK($type){
    // No pinyin starts with I, U or V, so those letters are absent.
    $char = array("", // filler so that letters start at index 1
        "A","B","C","D","E","F",
        "G","H","J","K","L","M",
        "N","O","P","Q","R","S",
        "T","W","X","Y","Z"
    );

    $str = str_replace("\r\n",'',$this->source[$type]);

    // An empty or failed conversion cannot be searched for.
    $found = false;
    if (is_string($this->word) && $this->word !== '') {
        $found = stripos($str,$this->word);
    }
    $str = ''; // release the table copy

    // Not in the table: use the configured fallback instead of treating the
    // miss as offset 0, which would yield an arbitrary letter.
    if ($found === false) {
        return $this->other;
    }

    // Right-aligned position: offset just past the 3-byte character.
    $p = $found + 3;

    if($key=array_search($p,$this->pos[$type])){
        // The position is itself one of the stored positions.
        return $char[$key];
    }

    // Otherwise insert it into a sorted copy and use the slot it lands in.
    $pos = $this->pos[$type];
    $pos[] = $p;
    sort($pos);

    return $char[array_search($p,$pos)];
}

}

?>

本文原创发布php中文网,转载请注明出处,感谢您的尊重!

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值