utf8字符串截取

UTF-8 字符串是变长编码的字符串,日常截取字符串时如果处理不当就会出现乱码。针对这个问题,下面给出一个通用的 UTF-8 字符串截取示例,分别用 Python 和 PHP 实现;根据其原理,也可以用任何其他语言来实现。
# python
# utf8 string length
def safestrlength_utf8(sourcestr): # {{{
    """Return the number of characters in a UTF-8 encoded string.

    The string is walked lead byte by lead byte; each sequence (1 to 6
    bytes long, as announced by its lead byte) counts as one character.

    Args:
        sourcestr: UTF-8 encoded ``bytes`` or ``bytearray``. Any other
            value is expected to be text and is encoded to UTF-8 first.

    Returns:
        The character count; ``0`` for empty input.
    """
    if not isinstance(sourcestr, (bytes, bytearray)):
        sourcestr = sourcestr.encode("utf-8")

    # Indexing a bytearray yields integers, so no ord() call is needed.
    data = bytearray(sourcestr)
    total = len(data)
    pos = 0
    count = 0

    while pos < total:
        lead = data[pos]
        if lead >= 252:
            pos += 6
        elif lead >= 248:
            pos += 5
        elif lead >= 240:
            pos += 4
        elif lead >= 224:
            pos += 3
        elif lead >= 192:
            pos += 2
        else:
            # ASCII, or a stray continuation byte: one unit on its own.
            pos += 1
        count += 1

    # The loop stops exactly at the end of the data, so no correction
    # of the counter is needed afterwards.
    return count

# utf8 substr
def safesubstr_utf8(sourcestr, cutlength):
    """Return the first ``cutlength`` characters of a UTF-8 string.

    The cut always falls on a sequence boundary chosen from the lead
    bytes, so a multi-byte character is never split in the middle.

    Args:
        sourcestr: UTF-8 encoded ``bytes`` or ``bytearray``. Any other
            value is expected to be text; it is encoded to UTF-8 for the
            scan and the result is decoded back to text.
        cutlength: Maximum number of characters to keep. Values of zero
            or less yield an empty result; values larger than the number
            of characters yield the whole string.

    Returns:
        The truncated string, of the same kind (bytes or text) as the
        input.
    """
    is_text = not isinstance(sourcestr, (bytes, bytearray))
    raw = sourcestr.encode("utf-8") if is_text else sourcestr

    # Indexing a bytearray yields integers, so no ord() call is needed.
    data = bytearray(raw)
    total = len(data)
    pos = 0
    taken = 0

    # Stop strictly before the end: there is no lead byte at index
    # ``total`` to inspect.
    while taken < cutlength and pos < total:
        lead = data[pos]
        if lead >= 252:
            pos += 6
        elif lead >= 248:
            pos += 5
        elif lead >= 240:
            pos += 4
        elif lead >= 224:
            pos += 3
        elif lead >= 192:
            pos += 2
        else:
            # ASCII, or a stray continuation byte: one unit on its own.
            pos += 1
        taken += 1

    # Slicing clamps to the data length, so a truncated final sequence
    # simply contributes whatever bytes are present.
    head = raw[:pos]
    return head.decode("utf-8") if is_text else head

// php
// substr for a UTF-8 string, where each UTF-8 character counts as length 1
/**
 * Return the first $cutlength characters of a UTF-8 encoded string.
 *
 * The string is scanned lead byte by lead byte; each sequence (1 to 6
 * bytes, as announced by its lead byte) counts as one character, so a
 * multi-byte character is never cut in the middle.
 *
 * @param string $sourcestr UTF-8 encoded input.
 * @param int    $cutlength Maximum number of characters to keep.
 *
 * @return string The truncated string; the whole input when it holds
 *                fewer than $cutlength characters.
 */
public static function safesubstr_utf8($sourcestr, $cutlength) // {{{
{
    $returnstr = '';
    $i = 0;
    $n = 0;
    $str_length = strlen($sourcestr);

    // Stop strictly before the end: there is no lead byte to inspect
    // at offset $str_length.
    while (($n < $cutlength) && ($i < $str_length))
    {
        $ascnum = ord(substr($sourcestr, $i, 1));

        // Byte length of the sequence introduced by this lead byte.
        if ($ascnum >= 252)
        {
            $step = 6;
        }
        elseif ($ascnum >= 248)
        {
            $step = 5;
        }
        elseif ($ascnum >= 240)
        {
            $step = 4;
        }
        elseif ($ascnum >= 224)
        {
            $step = 3;
        }
        elseif ($ascnum >= 192)
        {
            $step = 2;
        }
        else
        {
            // ASCII, or a stray continuation byte: one unit on its own.
            $step = 1;
        }

        $returnstr .= substr($sourcestr, $i, $step);
        $i += $step;
        $n++;
    }

    return $returnstr;
} // }}}

// get the length of a UTF-8 string, where each UTF-8 character counts as length 1
/**
 * Return the number of characters in a UTF-8 encoded string.
 *
 * The string is scanned lead byte by lead byte; each sequence (1 to 6
 * bytes, as announced by its lead byte) counts as one character.
 *
 * @param string $sourcestr UTF-8 encoded input.
 *
 * @return int The character count; 0 for an empty string.
 */
public static function safestrlength_utf8($sourcestr) // {{{
{
    $i = 0;
    $n = 0;
    $str_length = strlen($sourcestr);

    // Stop strictly before the end so every iteration corresponds to a
    // real lead byte and the counter needs no correction afterwards.
    while ($i < $str_length)
    {
        $ascnum = ord(substr($sourcestr, $i, 1));

        if ($ascnum >= 252)
        {
            $i += 6;
        }
        elseif ($ascnum >= 248)
        {
            $i += 5;
        }
        elseif ($ascnum >= 240)
        {
            $i += 4;
        }
        elseif ($ascnum >= 224)
        {
            $i += 3;
        }
        elseif ($ascnum >= 192)
        {
            $i += 2;
        }
        else
        {
            // ASCII, or a stray continuation byte: one unit on its own.
            $i += 1;
        }
        $n++;
    }

    return $n;
} // }}}

U-00000000 - U-0000007F:

0xxxxxxx

U-00000080 - U-000007FF:

110xxxxx 10xxxxxx

U-00000800 - U-0000FFFF:

1110xxxx 10xxxxxx 10xxxxxx

U-00010000 - U-001FFFFF:

11110xxx 10xxxxxx 10xxxxxx 10xxxxxx

U-00200000 - U-03FFFFFF:

111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

U-04000000 - U-7FFFFFFF:

1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值