参考文献:UTF-8 and Unicode FAQ
Unicode和UTF-8的对应关系
U-00000000 - U-0000007F: 0xxxxxxx
U-00000080 - U-000007FF: 110xxxxx 10xxxxxx
U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx
U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
1、本地字符集编码的字符串转UTF-8字节串(字节数组)。之所以把字符串转成字节串,是因为UTF-8是变长的多字节编码:按上表最多可达6个字节(RFC 3629之后的标准UTF-8最多4个字节),而ASP中的String是BSTR类型,每个字符占两个字节,无法把一个超过2个字节的UTF-8字符装入一个BSTR字符中。所以手工转换时只能使用字节串;当然最好是转成字节数组,即Byte(),第2节将讲解如何把本地字符集编码的字符串转成UTF-8字节数组。
' Shifts lValue left by iBit bit positions.
' The shift is done arithmetically (multiplication by 2 ^ iBit), so no bits
' are discarded and the result is not wrapped to a fixed width.
Public Function LShift(ByVal lValue, ByVal iBit)
    Dim factor
    factor = 2 ^ iBit
    LShift = lValue * factor
End Function
' Shifts lValue right by iBit bit positions and returns a whole number.
'
' The quotient is floored with Int(). Plain "/" is floating-point division in
' VBScript, and a fractional result fed into And/Or/Xor is rounded to the
' nearest integer rather than truncated (233 / 64 = 3.64 would be masked as 4,
' not 3), which corrupts every byte derived from the shifted value.
' Flooring also makes negative inputs behave like an arithmetic right shift.
Public Function RShift(ByVal lValue, ByVal iBit)
    RShift = Int(lValue / (2 ^ iBit))
End Function
' Converts a VBScript string to a UTF-8 byte string (a BSTR built with ChrB,
' one byte per element), suitable for Response.BinaryWrite.
'
' strData: the text to encode.
' Returns: the UTF-8 bytes; an empty string for empty input.
'
' Notes:
' - All shifts use integer division ("\"). Floating-point division followed
'   by a bit mask rounds to nearest and yields wrong bytes.
' - A high/low surrogate pair is combined into one code point and written as
'   a single 4-byte sequence. An unpaired surrogate is still written as a
'   3-byte sequence.
Public Function ANSIToUTF8(ByVal strData)
    Dim ret
    Dim i, n, u, lo

    ret = ""
    n = Len(strData)
    i = 1
    Do While i <= n
        ' AscW returns a signed 16-bit value; mask it to 0..65535.
        u = AscW(Mid(strData, i, 1)) And &HFFFF&

        ' The trailing "&" on the hex literals forces Long; without it
        ' &HD800 would be read as a negative 16-bit Integer.
        If u >= &HD800& And u <= &HDBFF& And i < n Then
            lo = AscW(Mid(strData, i + 1, 1)) And &HFFFF&
            If lo >= &HDC00& And lo <= &HDFFF& Then
                u = &H10000 + (u - &HD800&) * &H400 + (lo - &HDC00&)
                i = i + 1 ' the low surrogate has been consumed
            End If
        End If

        If u < 128 Then
            ' 0xxxxxxx: identical to ASCII
            ret = ret & ChrB(u)
        ElseIf u < 2048 Then
            ' 110xxxxx 10xxxxxx
            ret = ret & ChrB(&HC0 Or (u \ 64)) _
                      & ChrB(&H80 Or (u And &H3F))
        ElseIf u < &H10000 Then
            ' 1110xxxx 10xxxxxx 10xxxxxx
            ret = ret & ChrB(&HE0 Or (u \ 4096)) _
                      & ChrB(&H80 Or ((u \ 64) And &H3F)) _
                      & ChrB(&H80 Or (u And &H3F))
        Else
            ' 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            ret = ret & ChrB(&HF0 Or (u \ 262144)) _
                      & ChrB(&H80 Or ((u \ 4096) And &H3F)) _
                      & ChrB(&H80 Or ((u \ 64) And &H3F)) _
                      & ChrB(&H80 Or (u And &H3F))
        End If
        i = i + 1
    Loop

    ANSIToUTF8 = ret
End Function
' Usage: send the UTF-8 bytes produced by ANSIToUTF8 to the client as binary.
Response.BinaryWrite ANSIToUTF8("大家好啊")
2、本地字符集编码的字符串转字节数组。ASP里把字符串转成字节数组只能依赖组件,因为ASP的变量类型都是Variant,其他类型都只是Variant的子类型。转换编码时一般使用ADODB.Stream,这个与流有关的组件非常好用。我现在在ASP中输出内容时基本摒弃了Response.Write方法,直接使用ADODB.Stream:它既能转换编码,又能存储字符,在Response.Buffer = True时还可以用来缓存字符串。基本用法如下:
' Inline-if helper: yields vtTrue when blnExp is true, otherwise vtFalse.
' Both candidate values are evaluated by the caller before the call; only the
' selected one is assigned to the result.
Public Function IIf(ByVal blnExp, vtTrue, vtFalse)
    If blnExp Then IIf = vtTrue Else IIf = vtFalse
End Function
' Buffered response writer backed by an ADODB.Stream.
'
' Text passed to Write is accumulated in a text stream whose charset follows
' the configured code page (65001 -> "UTF-8", anything else -> "GBK"). The
' buffered content is sent to Response when the object is destroyed
' (Class_Terminate), e.g. by "Set obj = Nothing".
Class ImplIO
    Private objStream   ' ADODB.Stream buffer; Nothing until the first Write
    Private lngCodePage ' 65001 selects UTF-8; defaults to 936

    Private Sub Class_Initialize()
        Set objStream = Nothing
        lngCodePage = 936
    End Sub

    ' Sets the output code page and the matching Response.CharSet.
    ' Raises vbObjectError + 1 if Write has already been called.
    Public Property Let CodePage(ByVal lngData)
        ' Validate before touching any state. If the code page were stored
        ' first, a rejected call would still change lngCodePage, and
        ' Class_Terminate would flush the already-open stream using the wrong
        ' branch (for example stripping a BOM from GBK data).
        If Not objStream Is Nothing Then
            Err.Raise vbObjectError + 1, "ImplIO.CodePage", "不能在调用Write方法后再设置CodePage"
        End If
        lngCodePage = lngData
        If lngCodePage = 65001 Then
            Response.CharSet = "UTF-8"
        Else
            Response.CharSet = "GBK"
        End If
    End Property

    ' Flushes the buffered content to Response and releases the stream.
    Private Sub Class_Terminate()
        If objStream Is Nothing Then Exit Sub
        If objStream.State = &H00000001 Then ' adStateOpen
            objStream.Position = 0
            If lngCodePage = 65001 Then
                ' Re-read the text stream as raw bytes so the UTF-8 encoding
                ' reaches the client unchanged.
                objStream.Type = 1 ' adTypeBinary
                objStream.Read 3 ' skip the UTF-8 BOM signature
                ' Read returns Null at end of stream; do not pass that to
                ' BinaryWrite.
                If Not objStream.EOS Then
                    Response.BinaryWrite objStream.Read(-1)
                End If
            Else
                Response.Write objStream.ReadText(-1)
            End If
            objStream.Close
        End If
        Set objStream = Nothing
    End Sub

    ' Appends vtData to the buffer, creating and opening the stream on the
    ' first call with the charset that matches the current code page.
    Public Sub Write(vtData)
        If objStream Is Nothing Then
            Set objStream = Server.CreateObject("ADODB.Stream")
            objStream.Type = 2 ' adTypeText
            objStream.Mode = 3 ' adModeReadWrite
            objStream.CharSet = IIf(lngCodePage = 65001, "UTF-8", "GBK")
            objStream.Open
        End If
        objStream.WriteText vtData
    End Sub
End Class
' Usage example: buffer text through ImplIO and emit it as UTF-8.
Dim IO
Set IO = New ImplIO
' The code page must be chosen before the first Write call.
IO.CodePage = 65001
IO.Write "大家好啊"
Set IO = Nothing' Destroying the object flushes the text buffered by Write to the response
3、接下来就是UTF-8的URL编码和解码
URL编码跟字符串转字节串是一个道理,将ANSIToUTF8稍做更改即可
' Reports whether iValue is the character code of an ASCII digit ("0".."9").
Public Function isdigit(ByVal iValue)
    Dim notBelowZero, notAboveNine
    notBelowZero = (iValue >= 48)
    notAboveNine = (iValue <= 57)
    isdigit = CBool(notBelowZero And notAboveNine)
End Function
' Reports whether iValue is the character code of an ASCII letter
' ("A".."Z" or "a".."z").
Public Function isalpha(ByVal iValue)
    Dim isUpper, isLower
    isUpper = (iValue >= 65 And iValue <= 90)
    isLower = (iValue >= 97 And iValue <= 122)
    isalpha = CBool(isUpper Or isLower)
End Function
' Reports whether iValue is the character code of an ASCII digit or letter.
Public Function isalnum(ByVal iValue)
    Dim digitHit, letterHit
    digitHit = isdigit(iValue)
    letterHit = isalpha(iValue)
    isalnum = CBool(digitHit Or letterHit)
End Function
' Formats iValue as upper-case hexadecimal, left-padded with "0" to at least
' two digits (10 -> "0A", 255 -> "FF").
Public Function MyHex(ByVal iValue)
    If iValue < 16 Then
        MyHex = "0" & Hex(iValue)
    Else
        MyHex = Hex(iValue)
    End If ' was a bare "End", which does not close the If block
End Function
' Percent-encodes strData as UTF-8 for use in a URL.
'
' ASCII letters and digits are copied unchanged, a space becomes "+", and
' every other character is written as the %XX form of each of its UTF-8 bytes.
'
' strData: the text to encode.
' Returns: the encoded string; "" for empty input.
'
' Notes:
' - All shifts use integer division ("\"). Floating-point division followed
'   by a bit mask rounds to nearest and yields wrong bytes.
' - A high/low surrogate pair is combined into one code point and written as
'   a single 4-byte sequence. An unpaired surrogate is still written as a
'   3-byte sequence.
Public Function URLEncode8(ByVal strData)
    Dim ret
    Dim i, k, u, l, lo

    l = Len(strData)
    k = 0
    ' One slot per input character is enough; unused slots stay Empty and
    ' contribute nothing to Join.
    ReDim ret(l)
    i = 1
    Do While i <= l
        ' AscW returns a signed 16-bit value; mask it to 0..65535.
        u = AscW(Mid(strData, i, 1)) And &HFFFF&

        ' The trailing "&" on the hex literals forces Long; without it
        ' &HD800 would be read as a negative 16-bit Integer.
        If u >= &HD800& And u <= &HDBFF& And i < l Then
            lo = AscW(Mid(strData, i + 1, 1)) And &HFFFF&
            If lo >= &HDC00& And lo <= &HDFFF& Then
                u = &H10000 + (u - &HD800&) * &H400 + (lo - &HDC00&)
                i = i + 1 ' the low surrogate has been consumed
            End If
        End If

        If u < 128 Then
            If isalnum(u) Then
                ret(k) = Chr(u)
            ElseIf u = 32 Then
                ret(k) = "+"
            Else
                ret(k) = "%" & MyHex(u)
            End If
        ElseIf u < 2048 Then
            ' 110xxxxx 10xxxxxx
            ret(k) = "%" & MyHex(&HC0 Or (u \ 64)) _
                   & "%" & MyHex(&H80 Or (u And &H3F))
        ElseIf u < &H10000 Then
            ' 1110xxxx 10xxxxxx 10xxxxxx
            ret(k) = "%" & MyHex(&HE0 Or (u \ 4096)) _
                   & "%" & MyHex(&H80 Or ((u \ 64) And &H3F)) _
                   & "%" & MyHex(&H80 Or (u And &H3F))
        Else
            ' 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            ret(k) = "%" & MyHex(&HF0 Or (u \ 262144)) _
                   & "%" & MyHex(&H80 Or ((u \ 4096) And &H3F)) _
                   & "%" & MyHex(&H80 Or ((u \ 64) And &H3F)) _
                   & "%" & MyHex(&H80 Or (u And &H3F))
        End If
        k = k + 1
        i = i + 1
    Loop

    URLEncode8 = Join(ret, "")
End Function
' URLEncode8("大家好啊") yields %E5%A4%A7%E5%AE%B6%E5%A5%BD%E5%95%8A
' UTF-8 URL decoding: the reverse of the encoder.
'
' Decodes a string in which "+" stands for a space and %XX escapes carry
' UTF-8 bytes, returning the resulting text.
'
' strData: the URL-encoded text.
' Returns: the decoded string; "" for empty input.
' Raises:  vbObjectError + 1 when a lead byte is not a valid start of a 2-, 3-
'          or 4-byte sequence, when a sequence is cut off at the end of the
'          input, or when the decoded code point exceeds U+10FFFF.
Public Function URLDecode8(ByVal strData)
    Dim arr, ret
    Dim i, j, k, l, u, n, b, ch

    ' After the split, arr(0) is the literal text before the first "%".
    ' Every later element starts with two hex digits and may carry literal
    ' text after them.
    arr = Split(Replace(strData, "+", " "), "%")
    l = UBound(arr)
    If l < 0 Then
        ' Split returns an empty array for empty input; arr(0) would fail.
        URLDecode8 = ""
        Exit Function
    End If

    ReDim ret(l)
    ret(0) = arr(0)
    k = 1
    i = 1
    Do While i <= l
        u = CLng("&H" & Mid(arr(i), 1, 2))
        If u > 127 Then
            n = UTF8Bytes(u)
            If n < 2 Or n > 4 Then
                Err.Raise vbObjectError + 1, "URLDecode8", "不支持的UTF-8编码"
            End If
            If i + n - 1 > l Then
                ' The continuation bytes would lie beyond the end of arr.
                Err.Raise vbObjectError + 1, "URLDecode8", "UTF-8编码序列不完整"
            End If

            ' Keep only the payload bits of the lead byte.
            Select Case n
                Case 2
                    u = u And &H1F
                Case 3
                    u = u And &H0F
                Case 4
                    u = u And &H07
            End Select
            ' Each continuation byte contributes its low six bits.
            For j = 1 To n - 1
                b = CLng("&H" & Mid(arr(i + j), 1, 2))
                u = u * 64 + (b And &H3F)
            Next
            i = i + n - 1

            If u > &H10FFFF Then
                Err.Raise vbObjectError + 1, "URLDecode8", "不支持的UTF-8编码"
            End If
        End If

        If u >= &H10000 Then
            ' Outside the BMP: emit a UTF-16 surrogate pair. The trailing "&"
            ' keeps the hex literals from being read as negative Integers.
            u = u - &H10000
            ch = ChrW(&HD800& + (u \ &H400)) & ChrW(&HDC00& + (u And &H3FF))
        Else
            ch = ChrW(u)
        End If

        ' Append any literal text that followed the last escape of this
        ' sequence.
        ret(k) = ch & Mid(arr(i), 3)
        k = k + 1
        i = i + 1
    Loop

    URLDecode8 = Join(ret, "")
End Function
' Returns the length in bytes of the UTF-8 sequence that begins with lead
' byte u, following the 1- to 6-byte layout of the original UTF-8 definition.
' Returns 0 when u cannot start a sequence (a continuation byte &H80..&HBF,
' or &HFE/&HFF).
Private Function UTF8Bytes(ByVal u)
    ' Byte 0 is included: U+0000 is encoded as the single byte &H00.
    If u >= &H00 And u <= &H7F Then
        UTF8Bytes = 1
    ElseIf u >= &HC0 And u <= &HDF Then
        UTF8Bytes = 2
    ElseIf u >= &HE0 And u <= &HEF Then
        UTF8Bytes = 3
    ElseIf u >= &HF0 And u <= &HF7 Then
        UTF8Bytes = 4
    ElseIf u >= &HF8 And u <= &HFB Then
        UTF8Bytes = 5
    ElseIf u >= &HFC And u <= &HFD Then
        UTF8Bytes = 6
    Else
        UTF8Bytes = 0
    End If
End Function
这个URLDecode8可以将UTF-8 URL编码的字符串转为本地字符集的字符串
-----------------------------------
补遗:
有人有疑问:在使用<%@language="vbscript" codepage="65001"%>的时候,为什么可以用Response.Write直接输出UTF-8编码的字符串?可以告诉大家的是,codepage=65001时,所有BSTR都是Unicode的,可以用
Response.Write Asc(Mid(一个UTF-8编码的中文, 1, 1))
看看,它的ANSI码是0或1,只有
Response.Write AscW(Mid(一个UTF-8编码的中文, 1, 1))
才可以看到中文的Unicode编码,UTF-8字符其实用Unicode编码存储了,输出的时候才转UTF-8编码
UTF-8全攻略
最新推荐文章于 2020-12-02 10:59:13 发布