VB分析超过64K的网页内容(基于XMLHTTP和字节数组处理)

Visual Basic Code
'****************************************************************************************************
'
'Name.......... WEB Page Read Program
'File.......... WEBRead.frm
'Version....... 1.0.0
'Dependencies.. XMLHTTP Object
'Description... Dynamic read URL html data
'Author........ Zhou Wen Xing 
'Date.......... Nov, 5nd 2010
'CSDN Accounts..SupermanKing
'
'Copyright (c) 2008 by www.rljy.com
'LiuZhou city, China
'
'****************************************************************************************************
'====================================================================================================
' API function defining ( API 函数定义 )
'====================================================================================================
Private Declare Sub CopyMemory Lib "kernel32" Alias "RtlMoveMemory" ( _
                                         Destination As Any, _
                                         Source As Any, _
                                         ByVal Length As Long)

'====================================================================================================
'  Form event dispose process ( 窗体基本的事件处理过程 )
'====================================================================================================
'==================== 点击按钮1产生的事件 ====================
Private Sub Command1_Click()
    '==================== 变量定义 ====================
     Dim strTemp         As String                               ' 临时字符串变量
     Dim strUserList     As String                               ' 最终拼合用户列表的变量
     Dim strSearch       As String                               ' 搜索关键内容的字符串变量
     Dim lngSearchSize   As Long                                 ' 搜索关键内容的字符串大小
     Dim lngStart        As Long                                 ' 搜索用户字符串时存储开始位置的变量
     Dim lngEnd          As Long                                 ' 搜索用户字符串时存储结束位置的变量
     Dim ComXMLHTTP      As Object                               ' 访问网页的 XMLHTTP 对象
     Dim byteHTML()      As Byte                                 ' 存储网页内容的字节流数组变量

     On Error Resume Next                                        ' 开始设置错误陷阱,防止程序发生意外错误而崩溃
    '==================== 初始化变量 ====================
     strUserList = ""
     strSearch = "class=""dropmenu"" οnmοuseοver=""showMenu(this.id)"">"
     lngSearchSize = LenB(StrConv(strSearch, vbFromUnicode))

    '==================== 开始下载指定 URL 的数据内容 ====================
     Set ComXMLHTTP = CreateObject("Microsoft.XMLHTTP")                              '初始化 XMLHTTP 对象
     If Err.Number <> 0 Then
         MsgBox "错误:" & Err.Number & "," & Err.Description
         Err.Clear
         Exit Sub
     End If
     ComXMLHTTP.Open "GET", "http://bbs.duowan.com/thread-17408898-2-1.html", False  '设置访问方式和URL地址
     ComXMLHTTP.setRequestHeader "CONTENT-TYPE", "application/x-www-form-urlencoded" '向HTTP头加入参数
     ComXMLHTTP.Send                                                                 '提交HTTP请求
     If Err.Number <> 0 Then
         MsgBox "错误:" & Err.Number & "," & Err.Description
         Err.Clear
         Exit Sub
     End If
    '---------- 判断下载是否成功 ----------
     If ComXMLHTTP.Status <> 200 Then
         MsgBox "访问URL失败,请您确定。", 64, "提示"
         Exit Sub
     End If
    '==================== 下载 URL 的数据完成后将数据读入字节数组中 ====================
    '---------- 将数据读入 byteHTML 这个字节数组中 ----------
    ' 因为该网页原来是 UTF-8 编码,所以取得的数据也就是 UTF-8 的编码数据
     byteHTML = ComXMLHTTP.ResponseBody
     Call SaveTextFile("c:/UTF-8.txt", byteHTML, "UTF-8")        ' 保存原始数据到磁盘,可以验证数据的完整性

    '---------- 将 UTF-8 编码的字节数组转换成 Unicode 编码的字节数组 ----------
     byteHTML = UTF8ToUnicode(byteHTML)
     Call SaveTextFile("c:/Unicode.txt", byteHTML, "Unicode")    ' 保存转换 Unicode 后的数据到磁盘,可以验证数据的完整性

    '---------- 将 Unicode 编码的字节数组转换成 GB2312 编码的字节数组 ----------
    ' 其转换目的是方便用 GB2312 的字符串查找数据,当然直接用 Unicode 也是可以实现的
     byteHTML = UnicodeToGB2312(byteHTML)
     Call SaveTextFile("c:/GB2312.txt", byteHTML)                ' 保存转换 GB2312 后的数据到磁盘,可以验证数据的完整性


    '==================== 得到完整的 GB2312 编码数组数据后,开始分析网页内容 ====================
    ' 第一个找到的被忽略,因为这个不是所需的内容
     lngStart = InStr_Array(0, byteHTML, strSearch)
    ' 如果一个都找不到,就没必要继续下去了
     If lngStart >= 0 Then
         lngStart = lngStart + lngSearchSize
        '---------- 开始循环查找所有用户内容 ----------
         Do
            ' 这里开始才是要找的东西位置
             lngStart = InStr_Array(lngStart, byteHTML, strSearch)
             If lngStart >= 0 Then
                 lngStart = lngStart + lngSearchSize
                 lngEnd = InStr_Array(lngStart, byteHTML, "")
                 strTemp = Mid_Array(byteHTML, lngStart, lngEnd - lngStart)
                 lngStart = lngEnd
                 strUserList = strUserList & strTemp & vbCrLf
             End If
         Loop While lngStart >= 0
     End If
    '==================== 完成工作将用户信息合并内容输出到文本框 ====================
     Text1.Text = strUserList
End Sub

'====================================================================================================
' User in the class custom's funtion dispose process ( 自定义函数及处理过程 )
'====================================================================================================
'----------------------------------------------------------------------------------------------------
'  Function   Name:  UTF8ToUnicode
'  Input Parameter:  funUTF8(Byte Array)        - The UTF-8's byte array
'  Return    Value:  (Byte Array)               - Return Unicode's byte array
'  Description    :  Visual Basic compile's conversion the UTF-8 to Unicode dispose process
'  Author         :  SupermanKing
'----------------------------------------------------------------------------------------------------
Function UTF8ToUnicode(ByRef funUTF8() As Byte) As Byte()
    '==================== 变量定义 ====================
     Dim lngLength       As Long
     Dim lngLengthB      As Long
     Dim lngUTF8Char     As Long
     Dim intWChar        As Integer
     Dim byteTemp        As Byte
     Dim byteBit         As Byte
     Dim byteUnicode()   As Byte
     Dim lngUTF8Count    As Long
     Dim i               As Long
     Dim j               As Long

     On Error Resume Next                                        ' 开始设置错误陷阱,防止程序发生意外错误而崩溃
    '==================== 初始化变量 ====================
     lngLengthB = 0

    '==================== 校验输入参数 ====================
     lngLength = UBound(funUTF8) + 1
     If Err.Number <> 0 Then
         Err.Clear
         Exit Function
     End If

    '==================== 开始循环处理编码转换过程 ====================
     For i = 0 To lngLength - 1
        '-------------------- 根据 UTF-8 编码规则数 UTF-8 字符的存储个数 --------------------
         lngUTF8Count = 0
         byteTemp = funUTF8(i)
         For j = 1 To 7
             byteBit = Int(byteTemp / (2 ^ (8 - j)))     '二进制位向右偏移 (8 - j) 个二进制位
             byteBit = byteBit And 1                     '取最后一个二进制位值
             If byteBit = 1 Then
                 lngUTF8Count = lngUTF8Count + 1
             Else
                '碰到0就结束数字符数操作
                 Exit For
             End If
         Next j

        '-------------------- 判断编码内存储的内容是否是经过编码的 --------------------
         If lngUTF8Count < 2 Or lngUTF8Count > 3 Then
            '---------- 没有经过 UTF-8 格式编码,直接转换成 Unicode 编码 ----------
             If lngLengthB = 0 Then
                 lngLengthB = 2
                 ReDim byteUnicode(lngLengthB - 1)
             Else
                 lngLengthB = lngLengthB + 2
                 ReDim Preserve byteUnicode(lngLengthB - 1)
             End If
             byteUnicode(lngLengthB - 2) = byteTemp
         Else
            '---------- 经过 UTF-8 格式编码,先读出内容后再转换成 Unicode 编码 ----------
            ' 读出这几个UTF-8字节内容
             For j = 0 To lngUTF8Count - 1
                 byteTemp = funUTF8(i + j)
                 If j = 0 Then
                    '第一个UTF-8编码含编码字节信息,所以取存储信息特别点
                     byteTemp = byteTemp And ((2 ^ (8 - (lngUTF8Count + 1))) - 1)
                     lngUTF8Char = byteTemp
                 Else
                    '后面的只取6个二进制位
                     byteTemp = byteTemp And &H3F
                     lngUTF8Char = lngUTF8Char * &H40        '向左偏移6位好存储后面的6位数据
                     lngUTF8Char = lngUTF8Char Or byteTemp   '将低6位的数据补充到编码中
                 End If
             Next j
            ' 已经取出Unicode编码到 lngUTF8Char 里
             If lngLengthB = 0 Then
                 lngLengthB = 2
                 ReDim byteUnicode(lngLengthB - 1)
             Else
                 lngLengthB = lngLengthB + 2
                 ReDim Preserve byteUnicode(lngLengthB - 1)
             End If
             byteUnicode(lngLengthB - 2) = lngUTF8Char And 255
             byteUnicode(lngLengthB - 1) = Int(lngUTF8Char / (2 ^ 8)) And 255
             i = i + (lngUTF8Count - 1)
         End If
         If i > (lngLength - 1) Then
             Exit For
         End If
     Next i

    '==================== 完成编码转换过程,返回数据 ====================
     UTF8ToUnicode = byteUnicode
End Function

'----------------------------------------------------------------------------------------------------
'  Function   Name:  UnicodeToGB2312
'  Input Parameter:  funUnicode(Byte Array)     - The Unicode's byte array
'  Return    Value:  (Byte Array)               - Return GB2312's byte array
'  Description    :  Visual Basic compile's conversion the Unicode to GB2312 dispose process
'  Author         :  SupermanKing
'----------------------------------------------------------------------------------------------------
Function UnicodeToGB2312(ByRef funUnicode() As Byte) As Byte()
    '==================== 变量定义 ====================
     Dim lngLength       As Long
     Dim lngLengthB      As Long
     Dim byteGB2312()    As Byte
     Dim i               As Long
     Dim intWChar        As Integer
     Dim intChar         As Integer

     On Error Resume Next                                        ' 开始设置错误陷阱,防止程序发生意外错误而崩溃
    '==================== 初始化变量 ====================
     lngLengthB = 0

    '==================== 校验输入参数 ====================
     lngLength = UBound(funUnicode) + 1
     If Err.Number <> 0 Then
         Err.Clear
         Exit Function
     End If
     lngLength = lngLength / 2

    '==================== 开始循环处理编码转换过程 ====================
     For i = 0 To lngLength - 1
         CopyMemory intWChar, funUnicode(i * 2), 2
         intChar = Asc(StrConv(ChrW(intWChar), vbNarrow))
         If intChar < 0 Or intChar > 255 Then
             If lngLengthB = 0 Then
                 lngLengthB = 2
                 ReDim byteGB2312(lngLengthB - 1)
                 byteGB2312(lngLengthB - 1) = intChar And 255
                 byteGB2312(lngLengthB - 2) = Int(CLng("&H" & Hex(intChar)) / (2 ^ 8)) And 255
             Else
                 lngLengthB = lngLengthB + 2
                 ReDim Preserve byteGB2312(lngLengthB - 1)
                 byteGB2312(lngLengthB - 1) = intChar And 255
                 byteGB2312(lngLengthB - 2) = Int(CLng("&H" & Hex(intChar)) / (2 ^ 8)) And 255
             End If
         Else
             If lngLengthB = 0 Then
                 lngLengthB = 1
                 ReDim byteGB2312(lngLengthB - 1)
                 byteGB2312(lngLengthB - 1) = CByte(intChar)
             Else
                 lngLengthB = lngLengthB + 1
                 ReDim Preserve byteGB2312(lngLengthB - 1)
                 byteGB2312(lngLengthB - 1) = CByte(intChar)
             End If
         End If
     Next i

    '==================== 完成编码转换过程,返回数据 ====================
     UnicodeToGB2312 = byteGB2312
End Function

'----------------------------------------------------------------------------------------------------
'  Function   Name:  InStr_Array
'  Input Parameter:  funStart(Long)             - Search the byte array start's address
'                 :  funBytes(Byte Array)       - Want search data's byte array
'                 :  funFind(String)            - Search's qualification
'  Return    Value:  (Long)                     - Find qualification's address
'  Description    :  Imitate InStr function's dispose process
'  Author         :  SupermanKing
'----------------------------------------------------------------------------------------------------
Function InStr_Array(ByVal funStart As Long, _
                      ByRef funBytes() As Byte, _
                      ByVal funFind As String) As Long
    '==================== 变量定义 ====================
     Dim byteFindArray()     As Byte
     Dim lngBytesCount       As Long
     Dim lngFindCount        As Long
     Dim lngIsFind           As Long
     Dim i                   As Long
     Dim j                   As Long

     On Error Resume Next                                        ' 开始设置错误陷阱,防止程序发生意外错误而崩溃
    '==================== 初始化变量 ====================
     InStr_Array = -1

    '==================== 校验输入参数 ====================
    '---------- 校验搜索条件参数 ----------
     If Len(funFind) = 0 Then
         Exit Function
     End If
    '---------- 校验搜索内容参数 ----------
     lngBytesCount = UBound(funBytes)
     If Err.Number <> 0 Then
         Err.Clear
         Exit Function
     End If
     byteFindArray = StrConv(funFind, vbFromUnicode)
     lngFindCount = UBound(byteFindArray)
    '---------- 校验搜索位置参数 ----------
     If funStart + lngFindCount > lngBytesCount Then
         Exit Function
     End If

    '==================== 开始搜索数据 ====================
     For i = funStart To lngBytesCount
         lngIsFind = 1
         For j = 0 To lngFindCount
             If funBytes(i + j) < &HA0 And byteFindArray(j) < &HA0 Then
                 If UCase(Chr(funBytes(i + j))) <> UCase(Chr(byteFindArray(j))) Then
                     lngIsFind = 0
                     Exit For
                 End If
             Else
                 If funBytes(i + j) <> byteFindArray(j) Then
                     lngIsFind = 0
                     Exit For
                 End If
             End If
         Next j
         If lngIsFind = 1 Then
             InStr_Array = i
             Exit For
         End If
     Next i
End Function

'----------------------------------------------------------------------------------------------------
'  Function   Name:  Mid_Array
'  Input Parameter:  funBytes(Byte Array)       - Want get data's byte array
'                 :  funStart(Long)             - Want get data's start address
'                 :  funCount(Long)             - Want get data's size
'  Return    Value:  (String)                   - Return want get string
'  Description    :  Imitate Mid function's dispose process
'  Author         :  SupermanKing
'----------------------------------------------------------------------------------------------------
Function Mid_Array(ByRef funBytes() As Byte, _
                    ByVal funStart As Long, _
                    ByVal funCount As Long) As String
    '==================== 变量定义 ====================
     Dim byteRead()      As Byte
     Dim lngBytesCount   As Long

     On Error Resume Next                                        ' 开始设置错误陷阱,防止程序发生意外错误而崩溃
    '==================== 初始化变量 ====================
     Mid_Array = ""

    '==================== 校验输入参数 ====================
     lngBytesCount = UBound(funBytes)
     If Err.Number <> 0 Then
         Err.Clear
         Exit Function
     End If
     If funStart + funCount > lngBytesCount Then
         Exit Function
     End If

    '==================== 开始取指定数据内容 ====================
     ReDim byteRead(funCount - 1)
     CopyMemory byteRead(0), funBytes(funStart), funCount
     Mid_Array = StrConv(byteRead, vbUnicode)
End Function

'----------------------------------------------------------------------------------------------------
'  Function   Name:  SaveTextFile
'  Input Parameter:  funFileName(String)        - Save file's path
'                 :  funBytes(Byte Array)       - Save file's data
'                 :  funMode(String)            - Data codeing mode
'  Return    Value:  (void)
'  Description    :  Save .txt file dispose process
'  Author         :  SupermanKing
'----------------------------------------------------------------------------------------------------
Sub SaveTextFile(ByVal funFileName As String, _
                  ByRef funBytes() As Byte, _
                  Optional ByVal funMode As String = "GB2312")
    '==================== 变量定义 ====================
     Dim fs      As Integer

     On Error Resume Next                                        ' 开始设置错误陷阱,防止程序发生意外错误而崩溃
    '==================== 校验输入参数 ====================
    ' 判断给定文件地址是否可读写,同时也可进行文件数据初始化操作
     fs = FreeFile
     Open funFileName For Output As #fs
     If Err.Number <> 0 Then
         MsgBox "错误:" & Err.Number & "," & Err.Description, 16, "错误"
         Err.Clear
         Exit Sub
     End If
     Close #fs

    '==================== 开始写文件数据 ====================
     fs = FreeFile
     Open funFileName For Binary As #fs
    '根据编码模式来写 TXT 文件头,这样可让 Windows 记事本识别该文件的编码方式
     Select Case UCase(funMode)
     Case "GB2312":  '输出 GB2312 编码的文本文件
                     Put #1, 1, funBytes

     Case "UNICODE"'输出 Unicode 编码的文本文件
                     Put #1, 1, CByte(&HFF)
                     Put #1, 2, CByte(&HFE)
                     Put #1, 3, funBytes

     Case "UTF-8":   '输出 UTF-8 编码的文本文件
                     Put #1, 1, CByte(&HEF)
                     Put #1, 2, CByte(&HBB)
                     Put #1, 3, CByte(&HBF)
                     Put #1, 4, funBytes
     End Select
     Close #fs
End Sub

评论 12
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值