vba爬静态网页

28 篇文章 2 订阅

原贴地址:http://club.excelhome.net/thread-1466658-1-1.html?tdsourcetag=s_pcqq_aiomsg

  1. 日文环境下BytesToBstr和IsUTF8可能需要修改编码类型。
  2. HTMLFILE对象无法使用execScript和querySelector,原因不明。
  3. HTMLFILE对象用来解析html,也可以直接用正则表达式解析BytesToBstr。
  4. IsUTF8里:UTF-8文本文件与Unicode文本文件类似,在文件的头部也有标记字节,Unicode文件的标记是2个字节:&HFF 和 &HFE,UTF-8文件的标记是3个字节:&HEF、&HBB 和 &HBF。
Public Function GetWebTxt(ByVal url As String) As String
    Dim xmlHttp As Object
    Application.DisplayAlerts = False
    Set xmlHttp = CreateObject("MSXML2.XMLHTTP")
    xmlHttp.Open "GET", url, False
    'xmlHttp.setRequestHeader "Content-Type", "text/xml"
    xmlHttp.setRequestHeader "User-Agent", "Mozilla/5.0 (Windows NT 6.1; rv:25.0) Gecko/20100101 Firefox/25.0"
    xmlHttp.Send
    While xmlHttp.readystate <> 4
        DoEvents            '''let system operator do other thing '''转让控制权,以便让操作系统处理其它事件。
    Wend
    GetWebTxt = BytesToBstr(xmlHttp.responsebody)
    'GetWebTxt = xmlHttp.responsetext			'''这个也能拿到文本,也许可能出现编码问题
End Function

'将字节转换为字符串
Public Function BytesToBstr(Bytes, Optional ByVal encode = "")
    If encode = Empty Then
        If IsUTF8(Bytes) Then '如果不是UTF-8编码则按照GB2312来处理
            encode = "UTF-8"
        Else
            encode = "GB2312"
        End If
    End If

    'Dim objstream As ADODB.Stream			'sometimes error,so....
    Set objstream = CreateObject("ADODB.Stream")
    With objstream
        .Type = 1
        .Mode = 3
        .Open
        .Write Bytes
        .Position = 0
        .Type = 2
        .Charset = encode
        BytesToBstr = .ReadText
       .Close
    End With
    Debug.Print BytesToBstr
End Function

'判断网页编码函数
Public Function IsUTF8(Bytes) As Boolean
    Dim i As Long, AscN As Long, Length As Long
    Length = UBound(Bytes) + 1

    If Length < 3 Then
        IsUTF8 = False
        Exit Function
    ElseIf Bytes(0) = &HEF And Bytes(1) = &HBB And Bytes(2) = &HBF Then
        IsUTF8 = True
        Exit Function
    End If

    Do While i <= Length - 1
        If Bytes(i) < 128 Then
            i = i + 1
            AscN = AscN + 1
        ElseIf (Bytes(i) And &HE0) = &HC0 And (Bytes(i + 1) And &HC0) = &H80 Then
            i = i + 2

        ElseIf i + 2 < Length Then
            If (Bytes(i) And &HF0) = &HE0 And (Bytes(i + 1) And &HC0) = &H80 And (Bytes(i + 2) And &HC0) = &H80 Then
                 i = i + 3
            Else
                IsUTF8 = False
                Exit Function
            End If
        Else
            IsUTF8 = False
            Exit Function
        End If
    Loop

    If AscN = Length Then
        IsUTF8 = False
    Else
        IsUTF8 = True
    End If
End Function

Sub aa()
    url = "http://club.excelhome.net/thread-1466658-1-1.html?tdsourcetag=s_pcqq_aiomsg"
    'Dim html As HTMLDocument                   '''if do not zhushi will error
    Set html = CreateObject("HTMLFILE")         '''tool->references-> add Microsoft HTML Object Library
    html.designMode = "on"                      '''open edit mode,else security warning window alert

    strHtml = GetWebTxt(url)
    html.Write strHtml ' 写入数据
    Set newspecial = html.getElementById("newspecial")
    
    'Set oWindow = html.parentWindow
    'oWindow.execScript "var t=3"                '''jujue fangwen
    't = oWindow.t                               '''get value from javaScript

    'Set ele = html.querySelector(".fastlg_l")   '''do not support this method,why??
    'ele.innerhtml = "fa xin tie innerhtml"
    'ele.Click
End Sub

POST请求

Public Sub test()
    Dim objxml As Object
    Set objxml = CreateObject("MSXML2.XMLHTTP")
    objxml.Open "POST", "http://fanyi.youdao.com/translate", False
    objxml.setrequestheader "Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"
    objxml.Send "i=good&doctype=json"
    Do While objxml.readyState <> 4
        DoEvents
    Loop
    Dim strresponse As String
    strresponse = objxml.responsetext
    Debug.Print strresponse
    MsgBox Split(Split(strresponse, "tgt"":""")(1), """")(0)
End Sub

带COOKIE的请求(可模拟用户登陆后的操作)

在浏览器登录后,把cookie拷进来就可以了

Public Sub test()
    Set objxml = CreateObject("MSXML2.XMLHTTP")
    objxml.Open "GET", "https://www.mouCGwangzhan.com/u/vXyzJqqWk/post", False
    objxml.SetRequestHeader "Content-Type", "application/x-www-form-urlencoded; charset=UTF-8"
    objxml.SetRequestHeader "Cookie", "PHPSESSID=08p6agdv3r5n9cjuufo2m9fpbf; Hm_lvt_49cc02c62fdb09c867c9340508d5af34=1580715815; Hm_lpvt_49cc02c62fdb09c867c9340508d5af34=1581524573; Hm_lvt_42a3dbb921624d3ca0fdb89f6b127f00=1586962700; wordpress_logged_in_9fd0deecc164ee14cba65d9679ee6e01=0b39b362bd%7C1589284459%7CdAW7iCxwNEz08AYzqDHPQT8yciBd1AUJAgeHtKIlhpv%7Cd7e6810e09d8c85fc187d0db485239e87809376fc5e93b550fcf6360eb68e0af; Hm_lpvt_42a3dbb921624d3ca0fdb89f6b127f00=1588317461"
    objxml.Send
    Do While objxml.readyState <> 4
        DoEvents
    Loop
    Dim strresponse As String
    strresponse = objxml.responsetext
    Debug.Print strresponse
    Call write2TextFile(strresponse, "C:\Users\Administrator\Desktop\1.txt")
End Sub
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值