UTF-8全攻略

参考文献:UTF-8 and Unicode FAQ

Unicode和UTF-8的对应关系
U-00000000 - U-0000007F: 0xxxxxxx
U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

1、本地字符集编码的字符串转UTF-8字节串(字节数组)。之所以要把字符串转成字节串,是因为UTF-8是多字节编码,一个字符最多占6个字节;而ASP中的String是BSTR类型,每个字符占两个字节,无法把一个超过2个字节的UTF-8字符装进一个BSTR字符中。所以手工转换时只能用字节串,当然最好转成字节数组,即Byte()。第2节将讲解如何把本地字符集编码的字符串转成UTF-8字节数组。

' Shift lValue left by iBit bit positions.
' The shift is done arithmetically: lValue is multiplied by 2 raised to iBit.
Public Function LShift(ByVal lValue, ByVal iBit)
    Dim factor
    factor = 2 ^ iBit
    LShift = lValue * factor
End Function

' Shift lValue right by iBit bit positions and return a whole number.
'
' The quotient is floored with Int. Plain "/" division returns a fractional
' value, and a later bitwise And/Or/Xor would round that fraction to the
' nearest integer instead of discarding it (255 / 64 = 3.98 would be treated
' as 4, not 3), which corrupts the extracted bits.
Public Function RShift(ByVal lValue, ByVal iBit)
    RShift = Int(lValue / (2 ^ iBit))
End Function

' Encode a VBScript string as a UTF-8 byte string (built with ChrB).
'
' strData : the string to encode.
' Returns : a byte string holding the UTF-8 encoding of strData; a zero-length
'           string when strData is empty.
'
' Each UTF-16 code unit is read with AscW. A high surrogate followed by a low
' surrogate is combined into one code point and written as a 4-byte sequence;
' an unpaired surrogate is written as a 3-byte sequence, as before.
' Bits are extracted with integer division (\) so that no fractional value is
' ever rounded up by the bitwise operators.
Public Function ANSIToUTF8(ByVal strData)
    Dim ret
    Dim i, n, u, lo
    ret = ""
    n = Len(strData)
    i = 1
    Do While i <= n
        ' AscW returns a signed 16-bit value; the mask maps it to 0..65535.
        u = AscW(Mid(strData, i, 1)) And &HFFFF&

        ' 55296..56319 = high surrogate (D800..DBFF),
        ' 56320..57343 = low surrogate  (DC00..DFFF).
        If u >= 55296 And u <= 56319 And i < n Then
            lo = AscW(Mid(strData, i + 1, 1)) And &HFFFF&
            If lo >= 56320 And lo <= 57343 Then
                u = 65536 + (u - 55296) * 1024 + (lo - 56320)
                i = i + 1 ' the low surrogate has been consumed
            End If
        End If

        If u < 128 Then
            ' 0xxxxxxx : identical to ASCII
            ret = ret & ChrB(u)
        ElseIf u < 2048 Then
            ' 110xxxxx 10xxxxxx
            ret = ret & ChrB(&HC0 Or (u \ 64)) _
                      & ChrB(&H80 Or (u And &H3F))
        ElseIf u < 65536 Then
            ' 1110xxxx 10xxxxxx 10xxxxxx
            ret = ret & ChrB(&HE0 Or (u \ 4096)) _
                      & ChrB(&H80 Or ((u \ 64) And &H3F)) _
                      & ChrB(&H80 Or (u And &H3F))
        Else
            ' 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            ret = ret & ChrB(&HF0 Or (u \ 262144)) _
                      & ChrB(&H80 Or ((u \ 4096) And &H3F)) _
                      & ChrB(&H80 Or ((u \ 64) And &H3F)) _
                      & ChrB(&H80 Or (u And &H3F))
        End If
        i = i + 1
    Loop
    ANSIToUTF8 = ret
End Function

' Usage example: send the encoded byte string to the client with Response.BinaryWrite
Response.BinaryWrite ANSIToUTF8("大家好啊")

2、本地字符集编码的字符串转字节数组,ASP里面字符串转数组只能依赖组件,因为ASP的变量类型都是Variant的,其他类型都是Variant的子类型,一般转编码,我们都用ADODB.Stream,这个跟流有关的组件非常好用,我现在ASP中的output基本摒弃了Response.Write方法,直接用ADODB.Stream,它既能转编码又能存储字符,可以当Response.Buffer = True的时候缓存字符串用,基本是这样的
' Inline-if helper: returns vtTrue when blnExp evaluates as true, otherwise vtFalse.
' Both arguments are evaluated by the caller before the function is entered.
Public Function IIf(ByVal blnExp, vtTrue, vtFalse)
    If blnExp Then
        IIf = vtTrue
        Exit Function
    End If
    IIf = vtFalse
End Function

' Buffered response writer backed by an ADODB.Stream.
'
' Text passed to Write is accumulated in a text-mode stream whose charset is
' chosen from CodePage (65001 -> "UTF-8", anything else -> "GBK"). The buffered
' content is sent to the response when the object is destroyed.
Class ImplIO
    Private objStream   ' the ADODB.Stream buffer; Nothing until the first Write
    Private lngCodePage ' 65001 selects UTF-8, any other value selects GBK

    ' Start with no stream and code page 936.
    Private Sub Class_Initialize()
        Set objStream = Nothing
        lngCodePage = 936
    End Sub

    ' Select the output code page and set Response.CharSet to match.
    ' Raises vbObjectError + 1 if Write has already been called. The check is
    ' done BEFORE the new value is stored, so a rejected assignment leaves the
    ' object on the code page its stream was created with.
    Public Property Let CodePage(ByVal lngData)
        If Not objStream Is Nothing Then
            Err.Raise vbObjectError + 1, "ImplIO.CodePage", "不能在调用Write方法后再设置CodePage"
        End If
        lngCodePage = lngData
        If lngCodePage = 65001 Then
            Response.CharSet = "UTF-8"
        Else
            Response.CharSet = "GBK"
        End If
    End Property

    ' Flush the buffered text to the response and release the stream.
    Private Sub Class_Terminate()
        If objStream Is Nothing Then Exit Sub
        If objStream.state = &H00000001 Then ' adStateOpen
            objStream.Position = 0
            If lngCodePage = 65001 Then
                ' Re-read the buffer as raw bytes so the UTF-8 encoding can be
                ' written to the response unchanged.
                objStream.Type = 1 ' adTypeBinary
                objStream.Read 3   ' discard the 3-byte UTF-8 BOM signature
                Response.BinaryWrite objStream.Read(-1)
            Else
                Response.Write objStream.ReadText(-1)
            End If
            objStream.Close
        End If
        Set objStream = Nothing
    End Sub

    ' Append vtData to the buffer, creating and opening the stream on first use.
    Public Sub Write(vtData)
        If objStream Is Nothing Then
            Set objStream = Server.CreateObject("ADODB.Stream")
            objStream.Type = 2 ' adTypeText
            objStream.Mode = 3 ' adModeReadWrite
            objStream.CharSet = IIf(lngCodePage = 65001, "UTF-8", "GBK")
            objStream.Open
        End If
        objStream.WriteText vtData
    End Sub
End Class

' Usage example: buffer text in an ImplIO object configured for UTF-8 output.
Dim IO
Set IO = New ImplIO
IO.CodePage = 65001
IO.Write "大家好啊"
Set IO = Nothing' Releasing the object runs Class_Terminate, which outputs the text passed to Write

3、接下来就是UTF-8的URL编码和解码
URL编码跟字符串转字节串是一个道理,将ANSIToUTF8稍做更改即可
' Return True when iValue is the byte value of an ASCII digit (48 through 57).
Public Function isdigit(ByVal iValue)
    Dim outsideRange
    outsideRange = (iValue < 48 Or iValue > 57)
    isdigit = CBool(Not outsideRange)
End Function

' Return True when iValue is the byte value of an ASCII letter:
' 65 through 90 (upper case) or 97 through 122 (lower case).
Public Function isalpha(ByVal iValue)
    Dim inUpper, inLower
    inUpper = (iValue >= 65 And iValue <= 90)
    inLower = (iValue >= 97 And iValue <= 122)
    isalpha = CBool(inUpper Or inLower)
End Function

' Return True when iValue is the byte value of an ASCII digit or letter.
Public Function isalnum(ByVal iValue)
    Dim digitHit, letterHit
    digitHit = isdigit(iValue)
    letterHit = isalpha(iValue)
    isalnum = CBool(digitHit Or letterHit)
End Function

' Format iValue as hexadecimal, left-padded with "0" to at least two digits.
' Values below 16 get a leading zero (10 -> "0A"); larger values are returned
' exactly as Hex produces them.
Public Function MyHex(ByVal iValue)
    If iValue < 16 Then
        MyHex = "0" & Hex(iValue)
    Else
        MyHex = Hex(iValue)
    End If
End Function

' URL-encode strData using the UTF-8 encoding of each character.
'
' strData : the string to encode.
' Returns : the encoded string. ASCII letters and digits are copied as-is, a
'           space becomes "+", every other ASCII character becomes %XX, and
'           every non-ASCII character becomes the %XX form of each of its
'           UTF-8 bytes.
'
' A high surrogate followed by a low surrogate is combined into one code point
' and written as four %XX groups; an unpaired surrogate is written as three.
' Bits are extracted with integer division (\) so that no fractional value is
' ever rounded up by the bitwise operators.
Public Function URLEncode8(ByVal strData)
    Dim ret
    Dim i, k, u, l, lo
    l = Len(strData)
    k = 0
    ' Each UTF-16 code unit yields at most three output slots (a surrogate
    ' pair uses four slots for two code units), so l * 3 is always enough.
    ReDim ret(l * 3)
    i = 1
    Do While i <= l
        ' AscW returns a signed 16-bit value; the mask maps it to 0..65535.
        u = AscW(Mid(strData, i, 1)) And &HFFFF&

        ' 55296..56319 = high surrogate (D800..DBFF),
        ' 56320..57343 = low surrogate  (DC00..DFFF).
        If u >= 55296 And u <= 56319 And i < l Then
            lo = AscW(Mid(strData, i + 1, 1)) And &HFFFF&
            If lo >= 56320 And lo <= 57343 Then
                u = 65536 + (u - 55296) * 1024 + (lo - 56320)
                i = i + 1 ' the low surrogate has been consumed
            End If
        End If

        If u < 128 Then
            If isalnum(u) Then
                ' letters and digits are kept verbatim
                ret(k) = Chr(u)
            ElseIf u = 32 Then
                ret(k) = "+"
            Else
                ret(k) = "%" & MyHex(u)
            End If
            k = k + 1
        ElseIf u < 2048 Then
            ' 110xxxxx 10xxxxxx
            ret(k) = "%" & MyHex(&HC0 Or (u \ 64))
            ret(k + 1) = "%" & MyHex(&H80 Or (u And &H3F))
            k = k + 2
        ElseIf u < 65536 Then
            ' 1110xxxx 10xxxxxx 10xxxxxx
            ret(k) = "%" & MyHex(&HE0 Or (u \ 4096))
            ret(k + 1) = "%" & MyHex(&H80 Or ((u \ 64) And &H3F))
            ret(k + 2) = "%" & MyHex(&H80 Or (u And &H3F))
            k = k + 3
        Else
            ' 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            ret(k) = "%" & MyHex(&HF0 Or (u \ 262144))
            ret(k + 1) = "%" & MyHex(&H80 Or ((u \ 4096) And &H3F))
            ret(k + 2) = "%" & MyHex(&H80 Or ((u \ 64) And &H3F))
            ret(k + 3) = "%" & MyHex(&H80 Or (u And &H3F))
            k = k + 4
        End If
        i = i + 1
    Loop
    ' Unused trailing slots are Empty and contribute nothing to the result.
    URLEncode8 = Join(ret, "")
End Function

'输出URLEncode8("大家好啊")的结果是%E5%A4%A7%E5%AE%B6%E5%A5%BD%E5%95%8A
'UTF-8的URL解码就将函数倒过来即可

' Decode a UTF-8 URL-encoded string into a VBScript string.
'
' strData : the encoded text; "+" is treated as a space and each %XX group as
'           one byte of a UTF-8 sequence.
' Returns : the decoded string; a zero-length string when strData is empty.
' Raises  : vbObjectError + 1 when a lead byte announces a sequence length this
'           function does not handle (anything other than 1 to 4 bytes) or a
'           4-byte sequence decodes outside U+10000..U+10FFFF;
'           vbObjectError + 2 when the input ends in the middle of a sequence.
'
' NOTE(review): continuation bytes are not checked for the 10xxxxxx pattern,
' and a %-group that is not two hex digits still fails inside CInt.
Public Function URLDecode8(ByVal strData)
    Dim arr, ret
    Dim i, k, l, u, n, b1, b2, b3, b4, ch

    ' Split on an empty string yields an array with no elements, so arr(0)
    ' below would fail; handle the empty input up front.
    If Len(strData) = 0 Then
        URLDecode8 = ""
        Exit Function
    End If

    arr = Split(Replace(strData, "+", " "), "%")
    l = UBound(arr)
    k = 1
    ReDim ret(l)
    ret(0) = arr(0) ' text before the first "%" is literal
    i = 1
    Do While i <= l
        ' The first two characters of each segment are the hex byte value.
        u = CInt("&H" & Mid(arr(i), 1, 2))
        If u > 127 Then
            b1 = u
            n = UTF8Bytes(b1)
            If n < 2 Or n > 4 Then
                Err.Raise vbObjectError + 1, "URLDecode8", "不支持的UTF-8编码"
            End If
            ' The remaining n - 1 bytes must exist as following segments.
            If i + n - 1 > l Then
                Err.Raise vbObjectError + 2, "URLDecode8", "UTF-8编码不完整"
            End If
            Select Case n
            Case 2
                b2 = CInt("&H" & Mid(arr(i + 1), 1, 2))
                i = i + 1
                u = LShift(b1 And &H1F, 6) Or (b2 And &H3F)
                ch = ChrW(u)
            Case 3
                b2 = CInt("&H" & Mid(arr(i + 1), 1, 2))
                b3 = CInt("&H" & Mid(arr(i + 2), 1, 2))
                i = i + 2
                u = LShift(b1 And &H0F, 12) Or LShift(b2 And &H3F, 6) Or (b3 And &H3F)
                ch = ChrW(u)
            Case 4
                b2 = CInt("&H" & Mid(arr(i + 1), 1, 2))
                b3 = CInt("&H" & Mid(arr(i + 2), 1, 2))
                b4 = CInt("&H" & Mid(arr(i + 3), 1, 2))
                i = i + 3
                u = LShift(b1 And &H07, 18) Or LShift(b2 And &H3F, 12) Or LShift(b3 And &H3F, 6) Or (b4 And &H3F)
                If u < 65536 Or u > 1114111 Then
                    Err.Raise vbObjectError + 1, "URLDecode8", "不支持的UTF-8编码"
                End If
                ' Code points above U+FFFF are stored as a surrogate pair.
                u = u - 65536
                ch = ChrW(55296 + (u \ 1024)) & ChrW(56320 + (u And 1023))
            End Select
        Else
            ch = ChrW(u)
        End If
        ' Whatever follows the two hex digits in the last consumed segment is literal text.
        ret(k) = ch & Mid(arr(i), 3)
        k = k + 1
        i = i + 1
    Loop
    ' Unused trailing slots are Empty and contribute nothing to the result.
    URLDecode8 = Join(ret, "")
End Function

' Return the length in bytes of the UTF-8 sequence that starts with lead byte u.
'
' u       : the first byte of a sequence (0..255).
' Returns : 1 to 6 according to the lead-byte pattern, or 0 when u cannot start
'           a sequence (continuation bytes &H80..&HBF, and &HFE / &HFF).
'
' Byte &H00 is a valid one-byte sequence (U+0000), so the first range starts
' at zero inclusive.
Private Function UTF8Bytes(ByVal u)
    If u >= &H00 And u <= &H7F Then
        UTF8Bytes = 1 ' 0xxxxxxx
    ElseIf u >= &HC0 And u <= &HDF Then
        UTF8Bytes = 2 ' 110xxxxx
    ElseIf u >= &HE0 And u <= &HEF Then
        UTF8Bytes = 3 ' 1110xxxx
    ElseIf u >= &HF0 And u <= &HF7 Then
        UTF8Bytes = 4 ' 11110xxx
    ElseIf u >= &HF8 And u <= &HFB Then
        UTF8Bytes = 5 ' 111110xx
    ElseIf u >= &HFC And u <= &HFD Then
        UTF8Bytes = 6 ' 1111110x
    Else
        UTF8Bytes = 0
    End If
End Function

这个URLDecode8可以将UTF-8 URL编码的字符串转为本地字符集的字符串
-----------------------------------
补遗:

有人有疑问,说<%@language="vbscript" codepage="65001"%>的时候,怎么可以用Response.Write输出UTF-8编码的字符串,可以告诉大家的是, codepage=65001时,所有BSTR都是Unicode的,可以用
Response.Write Asc(Mid(一个UTF-8编码的中文, 1, 1))
看看,它的ANSI码是0或1,只有
Response.Write AscW(Mid(一个UTF-8编码的中文, 1, 1))
才可以看到中文的Unicode编码,UTF-8字符其实用Unicode编码存储了,输出的时候才转UTF-8编码

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值