VBA抓取规划局规划公示信息


这次抓取涉及到动态参数的获取,代码有点多。具有动态参数的网页大多是aspx网页
Sub 下载天津市规划局规划()
    Dim strurl$, i%, n%, arr(), b() As Byte
    For i = 1 To 110  '定义提取的页码
        strurl = "http://gh.tj.gov.cn/newslist.aspx?id=CK0401"
        With CreateObject("MSXML2.XMLHTTP")
            '第一次GET,获取动态参数VIEWSTATE和EVENTVALIDATION
            .Open "GET", strurl, False
            .send
            strText = .responseText
            VIEWSTATE = encodeURI(CStr(Split(Split(strText, "__VIEWSTATE"" value=""")(1), """ />")(0)))
            EVENTVALIDATION = encodeURI(CStr(Split(Split(strText, "__EVENTVALIDATION"" value=""")(1), """ />")(0)))
            strText = .responseText
            VIEWSTATE = encodeURI(CStr(Split(Split(strText, "__VIEWSTATE"" value=""")(1), """ />")(0)))
            EVENTVALIDATION = encodeURI(CStr(Split(Split(strText, "__EVENTVALIDATION"" value=""")(1), """ />")(0)))
            '这里的翻页动作是POST提交类型,将取得的动态参数写入需要send发送的参数中。
            .Open "POST", strurl, False
            .setRequestHeader "Content-Type", "application/x-www-form-urlencoded"
            .send "&__EVENTARGUMENT=" & i _
                & "&__EVENTTARGET=AspNetPager1" _
                & "&__EVENTVALIDATION=" & EVENTVALIDATION _
                & "&__VIEWSTATE=" & VIEWSTATE _
                & "&__VIEWSTATEGENERATOR=14DD91A0" _
                & "&AspNetPager1_input=" & i & "-1" _
                & "&HiddenFieldPageFinished=1" _
                & "&pkid=CK0401" _
                & "&pkid2=3" _
                & "&newskindid=CK0401" _
                & "&Left1$ddl_cname=CK" _
                & "&Left1$tb_search=" _
                & "&Left1$rbl_site=title"
            strText = .responseText
            '正则获取单个规划的网址信息
            Open ThisWorkbook.Path & "\图片\1.txt" For Output As #1
            Print #1, strText
            Close
            Set reg = CreateObject("vbscript.regexp")
            reg.Global = True
            reg.IgnoreCase = True
            reg.MultiLine = True
            reg.Pattern = "<a href='(news.aspx\?id=\d+)'>(.*?)<\/a><\/td>\s*<td align=""right"" >(\d+-\d+-\d+)</td>"
            n = 0
            For Each mat In reg.Execute(strText)
                n = n + 1
                ReDim Preserve arr(1 To 3, 1 To n)
                arr(1, n) = "http://gh.tj.gov.cn/" & mat.SubMatches(0) '正则取出的网址
                arr(2, n) = mat.SubMatches(1)   '正则取出的单项规划
                arr(3, n) = mat.SubMatches(2) '正则取出的规划公示时间
            Next mat
            brr = Application.Transpose(arr)
            rrow = ActiveSheet.Cells(Rows.Count, "a").End(3).Row + 1
            ActiveSheet.Range("a" & rrow).Resize(UBound(brr), 3) = brr
            '循环打开单个规划网址,保存图形文件
            Set xml = CreateObject("MSXML2.XMLHTTP")
            For r = 1 To UBound(brr)
                xml.Open "GET", brr(r, 1), False
                xml.send
                Do While xml.ReadyState <> 4
                    DoEvents
                Loop
                strr = xml.responseText
                reg.Pattern = "\/Files\/image\/\d+\.jpg"
                If reg.Test(strr) Then   '保存网页图片
                k = 0
                For Each mat In reg.Execute(strr)
                    Set xmlhttp = CreateObject("MSXML2.XMLHTTP")
                    k = k + 1
                    xmlhttp.Open "GET", "http://gh.tj.gov.cn" & mat, False
                    xmlhttp.send
                    Do While xmlhttp.ReadyState <> 4
                        DoEvents
                    Loop
                    b = xmlhttp.responseBody
                    On Error Resume Next  '排除文件名过长的图片
                    Open "C:\图片\" & brr(r, 2) & k & ".jpg" For Binary As #1
                    Put #1, , b
                    Close
                Next
            Else
            End If
        Next
    End With
Next
MsgBox "完成"
End Sub

Function encodeURI(strText As String) As String
With CreateObject("msscriptcontrol.scriptcontrol")
    .Language = "JavaScript"
    encodeURI = .Eval("encodeURIComponent('" & strText & "');")
End With
End Function




说几个知识点:① encodeURI函数,是我们自己定义的转码函数。②匹配汉字和数字结合的正则表达式写法为:.*?

  • 0
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值