Discuz 高效wap编码转换类,很强的,gbk-utf8 or unicode

原创 2006年06月10日 16:21:00
<?php
/*
    [Discuz!] (C)2001-2006 Comsenz Inc.
    This is NOT a freeware, use is subject to license terms

    $RCSfile: chinese.class.php,v $
    $Revision: 1.7 $
    $Date: 2006/02/23 13:44:02 $
*/

// Refuse to run when this file is requested directly instead of being
// included by the application bootstrap (which defines IN_DISCUZ).
if (!defined('IN_DISCUZ')) {
    exit('Access Denied');
}

// Directory holding the binary charset mapping tables read by Chinese::OpenTable().
define('CODETABLE_DIR', DISCUZ_ROOT.'./include/tables/');

/**
 * Charset converter between GBK/GB2312, BIG5, UTF-8 and "UNICODE".
 *
 * When the iconv extension is available every conversion is delegated to
 * iconv(). Otherwise a binary mapping table is loaded from CODETABLE_DIR and
 * the conversion is done by table lookup; on that path the "UNICODE" target
 * produces HTML hexadecimal entities (&#x4e2d;) for double-byte characters.
 *
 * Characters that cannot be mapped on the table path are replaced by '?'.
 */
class Chinese {

    /** Full path of the mapping table chosen by OpenTable(). */
    var $table = '';

    /** True when iconv() exists and is used instead of the mapping tables. */
    var $iconv_enabled = false;

    /**
     * Lookup table filled by OpenTable(): integer key => hexadecimal string.
     * Values carry a '0x' prefix unless the target is 'UNICODE'.
     */
    var $unicode_table = array();

    /** Normalised source/target charsets and the table file names. */
    var $config = array(
        'SourceLang'            => '',                    //  original charset
        'TargetLang'            => '',                    //  target charset
        'GBtoUnicode_table'     => 'gb-unicode.table',    //  GB2312 to unicode
        'BIG5toUnicode_table'   => 'big5-unicode.table',  //  BIG5 to unicode
    );

    /**
     * Normalise both charset names and pick the conversion backend.
     *
     * Declared as __construct so that the object is initialised on PHP 8,
     * where a method named after the class is no longer a constructor.
     *
     * @param string $SourceLang charset of the text passed to Convert()
     * @param string $TargetLang charset Convert() should produce
     */
    function __construct($SourceLang, $TargetLang) {
        $this->config['SourceLang'] = $this->_lang($SourceLang);
        $this->config['TargetLang'] = $this->_lang($TargetLang);

        $this->iconv_enabled = function_exists('iconv');
        if(!$this->iconv_enabled) {
            // No iconv: fall back to the bundled mapping tables.
            $this->OpenTable();
        }
    }

    /**
     * Legacy PHP 4 style constructor, kept so existing explicit calls still work.
     *
     * @param string $SourceLang
     * @param string $TargetLang
     */
    function Chinese($SourceLang, $TargetLang) {
        $this->__construct($SourceLang, $TargetLang);
    }

    /**
     * Map a user supplied charset name onto one of the supported identifiers.
     *
     * @param string $LangCode e.g. 'gb2312', 'big5', 'utf8', 'unicode'
     * @return string|null 'GBK', 'BIG5', 'UTF-8', 'UNICODE', or null when unrecognised
     */
    function _lang($LangCode) {
        $LangCode = strtoupper((string) $LangCode);

        if(substr($LangCode, 0, 2) == 'GB') {
            return 'GBK';
        }
        $prefix = substr($LangCode, 0, 3);
        if($prefix == 'BIG') {
            return 'BIG5';
        }
        if($prefix == 'UTF') {
            return 'UTF-8';
        }
        if($prefix == 'UNI') {
            return 'UNICODE';
        }
        return null;
    }

    /**
     * Turn a string of hexadecimal digit pairs into the bytes they denote.
     *
     * @param string $hexdata e.g. '4142'
     * @return string e.g. 'AB'; empty string for empty input
     */
    function _hex2bin($hexdata) {
        $bindata = '';
        $len = strlen($hexdata);
        for($i = 0; $i < $len; $i += 2) {
            $bindata .= chr(hexdec(substr($hexdata, $i, 2)));
        }
        return $bindata;
    }

    /**
     * Load the mapping table needed for the configured charset pair.
     *
     * The table file is a sequence of 4-byte records: two big-endian 16-bit
     * integers (key, value). Which one becomes the array index depends on the
     * conversion direction.
     *
     * @return bool true when a table was read, false when no table applies
     *              to the configured charsets or the file cannot be read
     */
    function OpenTable() {
        $this->unicode_table = array();
        $source = $this->config['SourceLang'];
        $target = $this->config['TargetLang'];

        if($source == 'GBK' || $target == 'GBK') {
            $this->table = CODETABLE_DIR.$this->config['GBtoUnicode_table'];
        } elseif($source == 'BIG5' || $target == 'BIG5') {
            $this->table = CODETABLE_DIR.$this->config['BIG5toUnicode_table'];
        } else {
            // Neither side is a double-byte charset: there is nothing to load.
            return false;
        }

        if(!is_readable($this->table)) {
            return false;
        }
        $tabletmp = file_get_contents($this->table);
        if($tabletmp === false) {
            return false;
        }

        // Only whole 4-byte records are unpacked; a truncated tail is ignored.
        $len = strlen($tabletmp);
        for($i = 0; $i + 4 <= $len; $i += 4) {
            $tmp = unpack('nkey/nvalue', substr($tabletmp, $i, 4));
            if($target == 'UTF-8') {
                $this->unicode_table[$tmp['key']] = '0x'.dechex($tmp['value']);
            } elseif($source == 'UTF-8') {
                $this->unicode_table[$tmp['value']] = '0x'.dechex($tmp['key']);
            } elseif($target == 'UNICODE') {
                $this->unicode_table[$tmp['key']] = dechex($tmp['value']);
            }
        }
        return true;
    }

    /**
     * Encode a code point as UTF-8 and return the DECIMAL values of the
     * resulting bytes concatenated into one string (not the raw bytes).
     *
     * Example: 0x4E2D => '228184173' (bytes 228, 184, 173).
     *
     * @param int $c code point
     * @return string concatenated decimal byte values; '' when $c >= 0x200000
     */
    function CHSUtoUTF8($c) {
        return implode('', $this->_utf8ByteValues($c));
    }

    /**
     * Compute the UTF-8 byte values of a code point.
     *
     * @param int $c code point
     * @return array list of integers, one per byte; empty when $c >= 0x200000
     */
    function _utf8ByteValues($c) {
        if($c < 0x80) {
            return array($c);
        }
        if($c < 0x800) {
            return array(0xC0 | ($c >> 6), 0x80 | ($c & 0x3F));
        }
        if($c < 0x10000) {
            return array(
                0xE0 | ($c >> 12),
                0x80 | (($c >> 6) & 0x3F),
                0x80 | ($c & 0x3F),
            );
        }
        if($c < 0x200000) {
            return array(
                0xF0 | ($c >> 18),
                0x80 | (($c >> 12) & 0x3F),
                0x80 | (($c >> 6) & 0x3F),
                0x80 | ($c & 0x3F),
            );
        }
        return array();
    }

    /**
     * Look up a key in the mapping table and return the mapped value as an integer.
     *
     * Table values are hexadecimal strings, with or without a '0x' prefix; the
     * prefix is stripped before parsing so no non-hex character reaches hexdec().
     *
     * @param int $key table index
     * @return int|null mapped value, or null when the key is not in the table
     */
    function _tableValue($key) {
        if(!isset($this->unicode_table[$key])) {
            return null;
        }
        $hex = (string) $this->unicode_table[$key];
        if(strncasecmp($hex, '0x', 2) == 0) {
            $hex = substr($hex, 2);
        }
        return hexdec($hex);
    }

    /**
     * Resolve a double-byte source character to its mapped code point.
     *
     * @param string $pair the lead byte and the byte following it
     * @return int|null code point, or null when the source charset is not
     *                  GBK/BIG5 or the character is not in the table
     */
    function _sourceCodePoint($pair) {
        $code = hexdec(bin2hex($pair));
        if($this->config['SourceLang'] == 'GBK') {
            // The GB table is indexed by the code with both high bits cleared.
            return $this->_tableValue($code - 0x8080);
        }
        if($this->config['SourceLang'] == 'BIG5') {
            return $this->_tableValue($code);
        }
        return null;
    }

    /**
     * Convert text from the configured source charset to the target charset.
     *
     * @param string $SourceText text encoded in the source charset
     * @return string|false converted text; on the iconv path the return value
     *                      of iconv() is passed through unchanged
     */
    function Convert($SourceText) {
        $source = $this->config['SourceLang'];
        $target = $this->config['TargetLang'];

        if($source == $target) {
            return $SourceText;
        }
        if($this->iconv_enabled) {
            // NOTE(review): with iconv available the 'UNICODE' target is handed
            // to iconv as-is, while the table path below emits &#x...; entities.
            // Confirm which form callers expect before unifying the two.
            return iconv($source, $target, $SourceText);
        }

        $SourceText = (string) $SourceText;
        if($target == 'UNICODE') {
            return $this->_toEntities($SourceText);
        }
        if($source == 'UTF-8') {
            return $this->_fromUtf8($SourceText);
        }
        return $this->_toUtf8($SourceText);
    }

    /**
     * Table path: GBK/BIG5 text to ASCII plus &#x...; entities.
     *
     * @param string $text
     * @return string
     */
    function _toEntities($text) {
        $out = '';
        $len = strlen($text);
        $i = 0;
        // Scan by offset: testing the remaining string for truthiness would
        // stop early when the rest of the text is exactly "0".
        while($i < $len) {
            if(ord($text[$i]) > 127) {
                $code = $this->_sourceCodePoint(substr($text, $i, 2));
                $out .= ($code === null) ? '?' : '&#x'.dechex($code).';';
                $i += 2;
            } else {
                $out .= $text[$i];
                $i++;
            }
        }
        return $out;
    }

    /**
     * Table path: UTF-8 text to GBK or BIG5.
     *
     * Only 1-, 2- and 3-byte sequences are handled; stray continuation bytes
     * and 4-byte lead bytes are skipped one byte at a time.
     *
     * @param string $text
     * @return string
     */
    function _fromUtf8($text) {
        $out = '';
        $len = strlen($text);
        $target = $this->config['TargetLang'];
        $i = 0;
        while($i < $len) {
            $c = ord($text[$i++]);
            $high = $c >> 4;

            if($high <= 7) {
                $out .= chr($c);
                continue;
            }

            if($high == 12 || $high == 13) {
                $b2 = ($i < $len) ? ord($text[$i]) : 0;
                $i++;
                $codepoint = (($c & 0x1F) << 6) | ($b2 & 0x3F);
            } elseif($high == 14) {
                $b2 = ($i < $len) ? ord($text[$i]) : 0;
                $i++;
                $b3 = ($i < $len) ? ord($text[$i]) : 0;
                $i++;
                $codepoint = (($c & 0x0F) << 12) | (($b2 & 0x3F) << 6) | ($b3 & 0x3F);
            } else {
                continue;
            }

            if($target != 'GBK' && $target != 'BIG5') {
                // No double-byte target configured: nothing is emitted.
                continue;
            }
            $mapped = $this->_tableValue($codepoint);
            if($mapped === null) {
                $out .= '?';
            } elseif($target == 'GBK') {
                // Restore the high bits that the GB table stores cleared.
                $out .= $this->_hex2bin(sprintf('%04x', $mapped + 0x8080));
            } else {
                $out .= $this->_hex2bin(sprintf('%04x', $mapped));
            }
        }
        return $out;
    }

    /**
     * Table path: GBK/BIG5 text to UTF-8.
     *
     * The mapping table is kept after the call so the instance can be reused.
     *
     * @param string $text
     * @return string
     */
    function _toUtf8($text) {
        $ret = '';
        $len = strlen($text);
        $i = 0;
        while($i < $len) {
            if(ord($text[$i]) > 127) {
                $code = $this->_sourceCodePoint(substr($text, $i, 2));
                if($code === null) {
                    $ret .= '?';
                } else {
                    foreach($this->_utf8ByteValues($code) as $byte) {
                        $ret .= chr($byte);
                    }
                }
                $i += 2;
            } else {
                $ret .= $text[$i];
                $i++;
            }
        }
        return $ret;
    }
}
?>

相关文章推荐

不依赖任何系统API,用c语言实现gbk/utf8/unicode编码转换

汉字'我' Unicode编码是 0x6211       01100010 00010001 UTF8编码是    0xe68891    11100110 10001000 10010001 ...

GBK与UTF8编码转换的QT4源码

  • 2012年03月01日 23:08
  • 186KB
  • 下载

UTF8和gbk编码转换(一)

在项目中,经常遇到将gbk编码与utf8编码进行转换的情况。如在linux系统中对windows下文件进行操作,或是windows下对Linux文件操作。这是因为在Windows下的默认字符编码格式是...

GBK-UTF8编码转换工具

  • 2011年03月11日 00:19
  • 129KB
  • 下载

Qt4.8编码转换之GBK2UTF8

Qt4.8编码转换之GBK2UTF8在项目中难免遇到编码转换的问题,一般我的开发项目中,代码的编码格式都是GBK,但是在其他平台上可能用的编码方式为UTF8,这时候编码转化就成为一个问题了。在之前的开...

Android NDK的C/C++代码中利用JNI回调实现字符编码转换的试验(中文UTF8与GBK)

在NDK下做网络传输时,遇到一个编码转换的问题,就是对方传过来的文件名是中文GBK编码,需要转成UTF8来处理。 平常在C/C++环境下编程时,系统都会提供字符编码转换的API。如Windows下有M...
  • huzgd
  • huzgd
  • 2011年02月14日 00:17
  • 8386

JS中utf8和GBK的字符编码转换

资料链接:http://igogogo9.iteye.com/blog/105669 1、PHP中的 json_encode 函数只限编码UTF-8的数据,当转换GBK或者GB2312等编码的数据...

UTF8和gbk编码转换(二)

Linux下: linux下并没有前面提到的两个函数,需要使用函数mbstowcs和wcstombs。 mbstowcs将多字节编码转为宽字节编码;wcstombs将宽字节编码转换为多字节编码。 ...
内容举报
返回顶部
收藏助手
不良信息举报
您举报文章:Discuz 高效wap编码转换类,很强的,gbk-utf8 or unicode
举报原因:
原因补充:

(最多只允许输入30个字)