因为进不去数据库今天终于需要实战VBA网页爬虫了。370条记录,用时三分钟。想说其实挺慢的。以后慢慢改进吧。抓下来之后采用Text to Columns 用着刚刚好。
2015-11-13: 刚刚瞥到一眼,同学你那个object能不能在循环外面创建啊?!智商捉急啊!
Sub Crawler()
Dim xmlhttp As Object
Dim strURL As String
Dim i As Integer
Dim rowNum As Integer
Dim Content As String
Dim key As String
rowNum = Sheet1.UsedRange.Rows.Count
For i = 2 To rowNum
strURL = "http://www.fake.com/Id=" & Sheet1.Cells(i, 2).Value & "&effectiveDate=" & Sheet1.Cells(i, 4)
Set xmlhttp = CreateObject("Microsoft.XMLHTTP")
xmlhttp.Open "GET", strURL, False
xmlhttp.send
Content = xmlhttp.responsetext
arr1 = Split(Content, "<Holding>") 'You cannot dim arr1() at the beginning
arr2 = Split(arr1(1), "</Holding>")
Sheet3.Cells(i, 5) = arr2(0)
Set xmlhttp = Nothing
Next i
End Sub
2016-10-13: 发现这篇点击率还有点高哇。再写两笔。其实爬虫本身更多结合的是前端的东西,然后就是要了解那些对象。
上一小段代码:它的功能是IE打开网页,然后自动登录,然后一个单句关于抓东西的。
Sub JiraMonthlyRun()
Dim IE As Object
Dim Tsht, Osht As Worksheet
Set Tsht = ActiveWorkbook.Worksheets.Add(after:=Sheets(Sheets.Count))
Tsht.Name = "ComponentByStatus"
Dim PUrl, ele, sk1, sk2, skey As String
Dim tStrings(0 To 14) As String
Dim position, pos, kk, pos1, pos2, flag, mark As Integer
flag = 0
mark = 0
PUrl = "https://word.com/secure/RapidBoard.jspa?rapidView=1371&projectKey=MRDPM"
JiraRun.Show
Application.ScreenUpdating = False
Do While True
Set IE = CreateObject("InternetExplorer.Application")
If flag = 0 Then
With IE
.Visible = False
.navigate PUrl
Do Until .ReadyState = 4 'Complete
DoEvents
Loop
If (.document.querySelectorAll("div.aui-page-panel-inner") = "") Then
Do Until (.document.querySelectorAll("div.ghx-qty").Length > 0)
WaitToIEReady IE
Loop
GoTo Content
End If
'登陆部分
.document.getElementById("login-form-username").Value = JiraRun.User.Text
.document.getElementById("login-form-password").Value = JiraRun.Pw.Text
.document.getElementById("login-form-submit").Click
Do Until .ReadyState = 4 'Complete
DoEvents
Loop
Do Until (.document.querySelectorAll("div.ghx-qty").Length > 0)
WaitToIEReady IE
Loop
End With
ele = IE.document.getElementById("ghx-column-headers").innerHTML '抓东西
IE.Quit