PHP截取中文字符串,UTF-8、GBK
因为证书中有中文,所以需要在PHP中进行GB2312与UTF-8的互换。
网上搜索一下这方面相关资料,说是需要php_iconv.dll的支持,可是我在PHP5文件夹中根本找不到这个文件,但是奇怪的是在PHP4中有这个,然后我将PHP4中的php_iconv.dll文件,复制到system32下,却提示出现错误,我想应该也不行,毕竟PHP4和PHP5里面的文件应该不兼容。到这里我就想删除了PHP5,装一个PHP4算了,后来发现一段话:iconv and libxml are compiled into php5ts.dll so you don't need the dll's in version 5.所以只要转换如下:
不管是utf-8编码转换为gb2312,还是将gb2312转换为utf-8,都是一样的道理。php4.3.1以后的iconv函数很好用,只是需要自己写一个utf8到unicode的转换函数;不用iconv而改为查表(gb2312.txt)也行。
- <?php
/**
 * Extracts a byte range from a GB2312-encoded string without splitting
 * double-byte characters.
 *
 * The string is scanned from its beginning so that character boundaries are
 * known; a character is copied only when its first byte lies inside
 * [$start, $start + $len). A double-byte character that begins inside the
 * window is copied whole even if its second byte falls outside it.
 *
 * @param string $str   GB2312-encoded input (lead bytes are > 0xA0).
 * @param int    $start Byte offset at which the slice begins.
 * @param int    $len   Window size in bytes.
 * @return string The extracted slice.
 */
function substring($str, $start, $len) {
    $tmpstr = "";
    // Never scan past the end of the input.
    $end = min($start + $len, strlen($str));
    for ($i = 0; $i < $end; $i++) {
        $width = (ord(substr($str, $i, 1)) > 0xa0) ? 2 : 1;
        // Characters before $start are only walked over to stay aligned.
        if ($i >= $start) {
            $tmpstr .= substr($str, $i, $width);
        }
        // Step over the trail byte of a double-byte character.
        $i += $width - 1;
    }
    return $tmpstr;
}
- ?>
- <?php
/**
 * Character-based substring for double-byte (GB2312/GBK style) strings.
 *
 * The input is split into units of "optional high byte + one byte", and the
 * requested run of units is joined back together.
 *
 * @param string   $string Input string.
 * @param int      $from   Index of the first character (array_slice offset rules).
 * @param int|null $length Number of characters; null means "to the end".
 * @return string The selected characters.
 */
function c_substr($string, $from, $length = null) {
    // The "s" modifier lets "." match "\n"; without it newlines are dropped.
    if (!preg_match_all('/[\x80-\xff]?./s', $string, $match)) {
        return '';
    }
    // array_slice treats a null length as "everything up to the end".
    return implode('', array_slice($match[0], $from, $length));
}
// Demo: take 7 characters starting at character index 3.
list($str, $from, $length) = array("zhon华人min共和guo", 3, 7);
print c_substr($str, $from, $length);
// Output: n华人min共
// A UTF-8 variant follows.
/*
Regarding windix's function to handle UTF-8 strings:
one can use the "u" modifier on the regular expression so that the pattern string is treated as UTF-8
(available from PHP 4.1.0 or greater on Unix and from PHP 4.2.3 on win32).
This way the function works for other encodings too (like Greek for example).
The modified function would read like this:
*/
/**
 * Character-based substring for UTF-8 strings.
 *
 * An optional third argument gives the number of characters to return;
 * when it is omitted the slice runs to the end of the string.
 *
 * @param string $str   UTF-8 input.
 * @param int    $start Index of the first character (array_slice offset rules).
 * @return string The selected characters, or "" when $str is empty or is
 *                not valid UTF-8.
 */
function utf8_substr($str,$start) {
    // "s" lets "." match newlines; a failed match (invalid UTF-8) or an empty
    // input yields a falsy result, in which case there is nothing to slice.
    if (!preg_match_all('/./us', $str, $ar)) {
        return "";
    }
    $length = (func_num_args() >= 3) ? func_get_arg(2) : null;
    return join("", array_slice($ar[0], $start, $length));
}
- ?>
- <?php
/**
 * Returns a prefix of a double-byte encoded string without cutting a
 * double-byte character in half.
 *
 * Bytes above 127 are treated as the lead byte of a two-byte character, so
 * the result may be one byte longer than $sublen when the cut point falls
 * inside such a character.
 *
 * @param string $string Input string.
 * @param int    $sublen Prefix length in bytes.
 * @return string The whole string when $sublen covers it, otherwise the prefix.
 */
function cnSubStr($string,$sublen)
{
    if ($sublen >= strlen($string)) {
        return $string;
    }
    $s = "";
    for ($i = 0; $i < $sublen; $i++) {
        // Square-bracket offsets: the curly-brace form is removed in PHP 8.
        $s .= $string[$i];
        if (ord($string[$i]) > 127) {
            // Copy the trail byte too; it is in range because $sublen < strlen.
            $s .= $string[++$i];
        }
    }
    return $s;
}
// Example: print every prefix length from 1 to strlen + 1, adding an
// ellipsis once the requested length exceeds $len.
echo "<p>__________________________<p>";
$string = "242432反对感是456犯得上广泛大使馆地方7890";
$sublen = strlen($string);
$len = 20;
echo $string . "<p>";
echo "总长为:" . ($sublen + 1) . "<p>";
echo "截取数:" . $len . "<p>";
$i = 1;
while ($i <= $sublen + 1) {
    $suffix = ($i > $len) ? "…<br>" : "<br>";
    echo $i . "<b> →</b> " . cnSubStr($string, $i) . $suffix;
    $i++;
}
- ?>
因为证书中有中文,所以需要在PHP中进行GB2312与UTF-8的互换。
网上搜索一下这方面相关资料,说是需要php_iconv.dll的支持,可是我在PHP5文件夹中根本找不到这个文件,但是奇怪的是在PHP4中有这个,然后我将PHP4中的php_iconv.dll文件,复制到system32下,却提示出现错误,我想应该也不行,毕竟PHP4和PHP5里面的文件应该不兼容。到这里我就想删除了PHP5,装一个PHP4算了,后来发现一段话:iconv and libxml are compiled into php5ts.dll so you don't need the dll's in version 5.所以只要转换如下:
// GB2312 -> UTF-8 (iconv() returns false when the conversion fails)
$utf8Text = iconv("GB2312", "UTF-8", $text);
// UTF-8 -> GB2312 (iconv() returns false when the conversion fails)
$gb2312Text = iconv("UTF-8", "GB2312", $text);
php5內建支援iconv,因此僅有php4的使用者需要安裝
1.首先必須確定你的php資料夾中有extensions,dlls這兩個資料夾
若沒有,請到 http://www.php.net/downloads.php 下載PHP 4.4.x zip package的版本(約7MB)
2.將extensions\php_iconv.dll , dlls\iconv.dll 這兩個檔案複製到windows的目錄下方
3.開啟php.ini(一般是在windows目錄下),將 ;extension=php_iconv.dll 前面的分號去除
4.重新啟動Apache即可
【摘 要】 不管是utf-8编码转换为gb2312,还是将gb2312转换为utf-8,都是一样的道理。php4.3.1以后的iconv函数很好用,只是需要自己写一个utf8到unicode的转换函数;不用iconv而改为查表(gb2312.txt)也行。
- <?
// Split the GB2312 text into characters, convert each one to UTF-8 and
// print it as a decimal numeric character reference.
$text = "电子书库";
preg_match_all("/[\x80-\xff]?./", $text, $ar);
foreach ($ar[0] as $v) {
    printf("&#%s;", utf8_unicode(iconv("GB2312", "UTF-8", $v)));
}
- ?>
- <?
/**
 * utf8 -> unicode: decodes one UTF-8 encoded character into its code point.
 *
 * The byte length of $c selects the decoding; the lead byte is masked with a
 * length-specific mask and each following byte contributes its low six bits.
 *
 * @param string $c A single character of 1 to 4 bytes.
 * @return int|null The code point, or null when strlen($c) is not 1..4.
 */
function utf8_unicode($c) {
    $byteCount = strlen($c);
    if ($byteCount === 1) {
        return ord($c);
    }
    // Lead-byte masks indexed by sequence length.
    $leadMasks = array(2 => 0x3f, 3 => 0x1f, 4 => 0x0f);
    if (!isset($leadMasks[$byteCount])) {
        return null;
    }
    $codePoint = ord($c[0]) & $leadMasks[$byteCount];
    for ($k = 1; $k < $byteCount; $k++) {
        $codePoint = ($codePoint << 6) | (ord($c[$k]) & 0x3f);
    }
    return $codePoint;
}
- ?>
下面的例子是利用php将utf-8这种编码转换为gb2312。
- <?php
/**
 * Converts a Unicode code point to its GB2312 byte sequence.
 *
 * The code point is first encoded as UTF-8 (1 to 4 bytes) and the result is
 * handed to iconv() for the UTF-8 -> GB2312 conversion.
 *
 * @param int $c Unicode code point.
 * @return string|false GB2312 bytes; "" for code points >= 0x200000 (nothing
 *                      is encoded); false when iconv() fails.
 */
function u2utf82gb($c){
    $str = "";
    if ($c < 0x80) {
        // Emit the character itself; appending $c would append its decimal digits.
        $str .= chr($c);
    } else if ($c < 0x800) {
        $str .= chr(0xC0 | ($c >> 6));
        $str .= chr(0x80 | ($c & 0x3F));
    } else if ($c < 0x10000) {
        $str .= chr(0xE0 | ($c >> 12));
        $str .= chr(0x80 | (($c >> 6) & 0x3F));
        $str .= chr(0x80 | ($c & 0x3F));
    } else if ($c < 0x200000) {
        $str .= chr(0xF0 | ($c >> 18));
        $str .= chr(0x80 | (($c >> 12) & 0x3F));
        $str .= chr(0x80 | (($c >> 6) & 0x3F));
        $str .= chr(0x80 | ($c & 0x3F));
    }
    return iconv('UTF-8', 'GB2312', $str);
}
- ?>
或者是
- <?php
/**
 * Decodes JavaScript-style and HTML-style character escapes to GB2312.
 *
 * After rawurldecode(), three escape forms are recognised: "%uXXXX",
 * "&#xXXXX;" (four hex digits) and "&#NNN;" (decimal). Each is packed as a
 * big-endian 16-bit value and converted with iconv(). Everything else is
 * copied through unchanged.
 *
 * @param string $str Text containing escapes.
 * @return string Text with the escapes replaced by GB2312 bytes. An escape
 *                that cannot be converted is left as it was.
 */
function unescape($str) {
    $str = rawurldecode($str);
    // "s" keeps newlines; the final "." passes every other byte through.
    preg_match_all('/%u[0-9a-fA-F]{4}|&#x[0-9a-fA-F]{4};|&#\d+;|./s', $str, $r);
    $ar = $r[0];
    foreach ($ar as $k => $v) {
        if (strlen($v) === 1) {
            continue; // ordinary byte
        }
        if (substr($v, 0, 2) === "%u") {
            $packed = pack("H4", substr($v, -4));
        } elseif (substr($v, 0, 3) === "&#x") {
            $packed = pack("H4", substr($v, 3, -1));
        } else {
            $code = (int) substr($v, 2, -1);
            if ($code > 0xFFFF) {
                continue; // does not fit in a 16-bit unit; leave untouched
            }
            $packed = pack("n", $code);
        }
        // pack() produced big-endian bytes, so name the byte order explicitly;
        // plain "UCS-2" may be read in host byte order.
        $converted = iconv("UCS-2BE", "GB2312", $packed);
        if ($converted !== false) {
            $ar[$k] = $converted;
        }
    }
    return join("", $ar);
}
// The sample must contain escape sequences for unescape() to have anything
// to decode; these %uXXXX escapes spell "全天候自动聚焦".
$str = "TTL%u5168%u5929%u5019%u81EA%u52A8%u805A%u7126";
echo unescape($str);
- ?>
- <?php
// Sample text; print it from byte offset 1 using the plain substr().
$string = "2006年4月我又长大了一岁!";
print substr($string, 1) . "...";
/**
 * Truncates a double-byte encoded string.
 *
 * Characters are copied one at a time from the start of $in. Copying stops
 * at the first "\n", at the end of the input, or as soon as the byte
 * position reached is >= $num. The limit is checked only after a character
 * has been copied, so at least one character is returned for non-empty input
 * that does not start with a newline.
 *
 * @param string $in  Input string; bytes above 128 start a two-byte character.
 * @param int    $num Byte limit.
 * @return string The truncated text.
 */
function SubstrGB($in,$num){
    $total = strlen($in);
    $result = "";
    $offset = 0;
    while ($offset < $total) {
        $byte = substr($in, $offset, 1);
        if ($byte === "\n") {
            break;
        }
        // Lead bytes pull their trail byte along with them.
        $width = (ord($byte) > 128) ? 2 : 1;
        $result .= substr($in, $offset, $width);
        $offset += $width;
        if ($offset >= $num) {
            break;
        }
    }
    return $result;
}
// Print the sample text truncated at a byte limit of 8.
print SubstrGB($string, 8);
- ?>
- <?php
/**
 * Cuts $sublen characters out of a string, starting at character $start,
 * and appends "..." when text remains after the cut.
 *
 * With $code == 'UTF-8' the string is split into UTF-8 sequences and
 * $start / $sublen count characters. Any other $code selects double-byte
 * handling, where $start and $sublen are doubled and applied as byte
 * offsets (one unit = one two-byte character = two single-byte characters).
 *
 * @param string $string Input text.
 * @param int    $sublen Number of characters to keep.
 * @param int    $start  Index of the first character to keep.
 * @param string $code   'UTF-8' or any other value for double-byte text.
 * @return string The cut text, with "..." appended if it was truncated.
 */
function cut_str($string, $sublen, $start = 0, $code = 'UTF-8')
{
    if ($code == 'UTF-8') {
        // Single-quoted so the \xNN escapes reach PCRE intact.
        $pa = '/[\x01-\x7f]|[\xc2-\xdf][\x80-\xbf]|\xe0[\xa0-\xbf][\x80-\xbf]|[\xe1-\xef][\x80-\xbf][\x80-\xbf]|\xf0[\x90-\xbf][\x80-\xbf][\x80-\xbf]|[\xf1-\xf7][\x80-\xbf][\x80-\xbf][\x80-\xbf]/';
        preg_match_all($pa, $string, $t_string);
        $piece = join('', array_slice($t_string[0], $start, $sublen));
        if (count($t_string[0]) - $start > $sublen) {
            return $piece . "...";
        }
        return $piece;
    }

    $start = $start * 2;
    $end = $start + $sublen * 2;
    $strlen = strlen($string);
    $tmpstr = '';
    $i = 0;
    while ($i < $strlen && $i < $end) {
        // 0x81 is the first lead byte of a two-byte character.
        $width = (ord(substr($string, $i, 1)) >= 0x81) ? 2 : 1;
        if ($i >= $start) {
            $tmpstr .= substr($string, $i, $width);
        }
        $i += $width;
    }
    // Mark truncation only when text remains after the window, matching the
    // UTF-8 branch.
    if ($i < $strlen) {
        $tmpstr .= "...";
    }
    return $tmpstr;
}
// Any $code other than 'UTF-8' selects the double-byte branch. Arguments are
// passed positionally: writing "$start=0" inside a call assigns a global
// variable, it does not name a parameter.
echo "<br>" . cut_str($string, 8, 0, 'GB2312');
- ?>
以下代码适用于GB2312编码。截取中文字符串是PHP中一个头疼的问题,解决方法是根据字节值是否大于等于128来判断是否是双字节字符,以避免出现乱码的情况。但中英文混合、特殊符号等问题总是存在,现在写一个比较全面的,仅供参考:
程序说明:
1. len 参数以中文字符为标准,1len等于2个英文字符,为了形式上好看些
2. 如果将magic参数设为false,则中文和英文同等看待,取绝对的字符数
3. 特别适用于用htmlspecialchars()进行过编码的字符串
4. 能正确处理GB2312中的实体字符模式(即 &#NNNNN; 形式的数字字符引用)
程序代码:
- <?php
/**
 * Measures the token that starts at byte $pos of $title.
 *
 * @param string $title Text being scanned.
 * @param int    $pos   Byte offset of the token.
 * @return array array(byte length, is double-width): the five
 *               htmlspecialchars() entities and other single bytes are
 *               narrow; numeric references (&#NNN;) and two-byte characters
 *               are double-width.
 */
function _FSubstr_token($title, $pos)
{
    $cur = substr($title, $pos, 1);
    if ($cur === "&") {
        foreach (array("&lt;", "&gt;", "&amp;", "&quot;", "&#039;") as $entity) {
            if (substr($title, $pos, strlen($entity)) === $entity) {
                return array(strlen($entity), false);
            }
        }
        // Anchored so that only a reference starting exactly here matches.
        if (preg_match('/^&#\d+;/', substr($title, $pos, 8), $match)) {
            return array(strlen($match[0]), true);
        }
        // A bare ampersand is an ordinary one-byte character.
        return array(1, false);
    }
    if (ord($cur) >= 128) {
        return array(2, true);
    }
    return array(1, false);
}

/**
 * Entity-aware substring for GB2312 text (powered by Smartpig,
 * mailto:d.einstein@263.net).
 *
 * Returns the part of $title that starts at byte $start and spans $len
 * characters, never splitting a two-byte character or an HTML entity.
 *
 * @param string     $title Input text, possibly htmlspecialchars()-encoded.
 * @param int        $start Byte offset; moved back one byte when it would
 *                          fall inside a two-byte character.
 * @param int|string $len   Length. With $magic it is measured in Chinese
 *                          character widths (1 = two ASCII characters);
 *                          without $magic it is a plain character count.
 *                          "", null, 0 or false means the rest of the string.
 * @param bool       $magic Whether to use width-based counting.
 * @return string The extracted text.
 */
function FSubstr($title,$start,$len="",$magic=true)
{
    $total = strlen($title);
    // Spelled out so the "no limit" values do not depend on how the running
    // PHP version compares numbers with "".
    if (in_array($len, array("", null, 0, false), true)) {
        $len = $total;
    }

    // An odd number of high bytes before $start means $start points at the
    // second byte of a two-byte character.
    if ($start > 0) {
        $highBytes = 0;
        for ($i = 0; $i < $start; $i++) {
            if (ord(substr($title, $i, 1)) >= 128) {
                $highBytes++;
            }
        }
        if ($highBytes % 2 != 0) {
            $start--;
        }
    }

    if ($total <= $len) {
        return substr($title, $start, $len);
    }

    $length = 0;  // bytes to return
    $narrow = 0;  // single-width tokens seen
    $wide = 0;    // double-width tokens seen
    $count = 0;   // tokens seen, used when $magic is off
    for ($i = $start; $i < $total; ) {
        list($step, $isWide) = _FSubstr_token($title, $i);
        $i += $step;
        $length += $step;
        $count++;

        if (!$magic) {
            if ($count == $len) {
                break;
            }
            continue;
        }

        if ($isWide) {
            $wide++;
        } else {
            $narrow++;
        }
        $width = $wide * 2 + $narrow;
        if ($width == $len * 2) {
            break;
        }
        if ($width == $len * 2 + 1) {
            // A double-width token overshot the limit by half a unit: drop it.
            if ($isWide) {
                $length -= $step;
            }
            break;
        }
    }
    return substr($title, $start, $length);
}
- ?>