'目前存在的BUG
'图片补丁存在多个URL
'题目中间存在小数的问题在正则表达式里加上\d+\D
Private Declare Function URLDownloadToFile Lib "urlmon" Alias "URLDownloadToFileA" (ByVal pCaller As Long, ByVal szURL As String, ByVal szFileName As String, ByVal dwReserved As Long, ByVal lpfnCB As Long) As Long
Private Declare Function DeleteUrlCacheEntry Lib "wininet" Alias "DeleteUrlCacheEntryA" (ByVal lpszUrlName As String) As Long
Sub DownloadImageName(ByVal ImageURL As String, ByVal ImagePath As String)
Dim lngRetVal As Long
lngRetVal = URLDownloadToFile(0, ImageURL, ImagePath, 0, 0)
If lngRetVal = 0 Then
DeleteUrlCacheEntry ImageURL '清除缓存
'MsgBox "成功"
Else
'MsgBox "失败"
End If
End Sub
Sub LoopGetSubject()
Dim StartTime As Variant
Dim UsedTime As Variant
StartTime = VBA.Timer
Dim msg As Variant
msg = MsgBox("Choose 'Yes' to Continue,Choose 'No' to Exit !", vbYesNo, "AuthorQQ 84857038")
If msg = vbNo Then Exit Sub
Dim Sht As Worksheet
Set Sht = ThisWorkbook.ActiveSheet
With Sht
EndRow = .Cells(.Cells.Rows.Count, 1).End(xlUp).Row
For i = 2 To EndRow
SetFontRed .Cells(i, 1).Resize(1, 3)
FindText = Mid(.Cells(i, 3).Text, 4, Len(.Cells(i, 3).Text) - 8)
ExamUrl = .Cells(i, 2).Text
Source = .Cells(i, 1).Text
Source = Replace(Source, "【", "")
Source = Replace(Source, "】", "")
Source = Replace(Source, "解析", "")
Call GetExamTextByUrl(ExamUrl, FindText, Source)
Next i
End With
Set Sht = Nothing
UsedTime = VBA.Timer - StartTime
Debug.Print "UsedTime :" & Format(UsedTime, "#0.0000 Seconds")
MsgBox "UsedTime :" & Format(UsedTime, "#0.0000 Seconds")
End Sub
Sub GetSubject()
SetFontRed Application.ActiveCell
FindText = Mid(Application.ActiveCell.Text, 4, Len(Application.ActiveCell.Text) - 8)
ExamUrl = Application.ActiveCell.Offset(0, -1).Text
Source = Application.ActiveCell.Offset(0, -2).Text
Source = Replace(Source, "【", "")
Source = Replace(Source, "】", "")
Source = Replace(Source, "解析", "")
Call GetExamTextByUrl(ExamUrl, FindText, Source)
End Sub
Sub GetExamTextByUrl(ByVal ExamUrl As String, ByVal FindText As String, ByVal Source As String)
Dim Subject As String
Dim HasImageText As String
Dim Question As String
Dim ImageURL As String
Dim Answer As String
Dim HasGetContent As Boolean
Dim docName As String
Dim docPath As String
Dim Independent As Boolean
Dim IsQuestion As Boolean
Dim IsAnswer As Boolean
Dim oneP As Object
Dim nextTag As Object
'send request
With CreateObject("MSXML2.XMLHTTP")
.Open "GET", ExamUrl, False
.Send
WebText = .responsetext
'Debug.Print WebText
' Stop
End With
With CreateObject("htmlfile")
.write WebText
Set examdiv = .getElementById("sina_keyword_ad_area2")
'获取试卷文本内容
ExamText = examdiv.innerText
'判断试卷是否含有独立答案
Independent = ExamText Like "*参考答案*"
'Debug.Print " Independent "; Independent
'设定搜集题目Word文档名称和路径
docName = Application.ActiveSheet.Name & "_题目搜集.doc"
docPath = ThisWorkbook.Path & "\" & docName
'判断某个段落是否为题目/答案的开始
IsQuestion = False
IsAnswer = False
'判断是否已经提取到内容
HasGetContent = False
'循环所有段落
For Each oneP In .getElementsByTagName("p")
If HasGetContent = False Then
'判断某段内容是否为题号行
'If oneP.innerText Like "##.*" Or oneP.innerText Like "##.*" Then
If RegTest(oneP.innerText, "(\d{1,2})[\..]\D.*") Then
'sep = Right(oneP.innerHTML, 100)
' Debug.Print "''''''''"; oneP.innerHTML
' sep = RealInnerHtml(oneP.innerHTML)
' Debug.Print sep
'Debug.Print InStr(WebText, sep)
' Stop
sep = RegGetLast(oneP.innerHTML, "([\u4e00-\u9fa5]{5,})")
HasImageText = Split(WebText, FindText)(0)
pos = InStrRev(HasImageText, sep)
HasImageText = Mid(HasImageText, pos)
Debug.Print ">>>>>汉字分隔符>>>"; sep
Debug.Print HasImageText
' Debug.Print WebText
'Stop
'Debug.Print ">>>>>>>>"; partText
'Debug.Print "Sep》》》》"; UCase(sep)
'Stop
Subject = ""
Question = ""
ImageURL = ""
Answer = ""
'开始记录题干内容
Subject = oneP.innerText
'Debug.Print OneP.innerText
Else
If InStr(oneP.innerText, FindText) = 0 Then
'过滤不相干的问题,仅保留符合条件的问题
If Not RegTest(oneP.innerText, "([\((]\d[\))]).*") Then
'继续记录问题内容
Subject = Subject & oneP.innerText
End If
End If
End If
'提取题目图片的地址
'直接获取innerhtml
' Set nextTag = oneP.NextSibling
' If Not nextTag Is Nothing Then
' If UCase(nextTag.tagName) = "A" Then
' If nextTag.HasChildNodes Then
' If nextTag.href Like "http://photo.blog.sina.com.cn/showpic.html*" Then
' ImageURL = ImageURL & "|" & nextTag.FirstChild.getAttribute("real_src")
' Debug.Print ImageURL
' End If
' End If
' End If
'End If
'Stop
'提取题目的序号和问题的序号
If InStr(oneP.innerText, FindText) > 0 Then
SubjectIndex = RegGet(Subject, "(\d{1,2})[..]\D.*")
Question = oneP.innerText
questionIndex = RegGet(Question, "[\((](\d)[\))].*")
'Debug.Print "题序:"; SubjectIndex; " 问序: "; questionIndex
HasGetContent = True
End If
Else
'提取内容后 开始找答案
'试卷不含独立答案,答案就附在每道题后面
If Independent = False Then
If IsAnswer = False Then
If RegTest(oneP.innerText, "[\((](" & questionIndex & ")[\))].*") Then
Answer = oneP.innerText
IsAnswer = True
'Exit For
End If
Else
Debug.Print oneP.innerText
If RegTest(oneP.innerText, "[\((](\d)[\))].*") Or RegTest(oneP.innerText, "(\d{1,2})[..]\D.*") Then
Exit For
Else
Answer = Answer & oneP.innerText
End If
End If
Else
'试卷还有独立参考答案
'判断某段内容的题号是否符合条件
If RegTest(oneP.innerText, "(" & SubjectIndex & ")[\..].\D*") Then
IsQuestion = True
'Debug.Print isQuestion
End If
If IsQuestion = True Then
'判断某段内容的问题序号是否符合条件
If IsAnswer = False Then
If RegTest(oneP.innerText, "([\((]" & questionIndex & "[\))]).*") Then
'记录问题答案
Answer = oneP.innerText
IsAnswer = True
'Exit For
End If
Else
Debug.Print oneP.innerText
If RegTest(oneP.innerText, "[\((](\d)[\))].*") Or RegTest(oneP.innerText, "(\d{1,2})[..]\D.*") Then
Exit For
Else
Answer = Answer & oneP.innerText
End If
End If
End If
End If
End If
Next oneP
'图片地址处理
' ImageURL = Mid(ImageURL, 2)
'测试
'Debug.Print ImageURL
Debug.Print Question
Debug.Print Answer
End With
'<span style="font-family:">43.</span>
'【补丁,有待改进】2017年下半年部分图片提取不到的问题修正
ImageURL = ""
If Len(ImageURL) = 0 Then
Debug.Print "补丁"
'Debug.Print "____________________"; hasimagetext
'Stop
'Debug.Print InStr(HasImageText, sep)
' HasImageText = Split(HasImageText, sep)(1)
Debug.Print ">>>>>>>>>>>>>>>>>>"; HasImageText
'Debug.Print InStr(HasImageText, "real_src")
'HasImageText = UCase(HasImageText)
'Debug.Print RegTest(HasImageText, "real_src =""(http.*?)""")
imgs = RegGetArray(HasImageText, "real_src =""(http.*?)""")
For n = LBound(imgs) To UBound(imgs) Step 1
'Debug.Print imgs(n)
ImageURL = ImageURL & "|" & imgs(n)
Next n
'Stop
ImageURL = Mid(ImageURL, 2)
Debug.Print "所有图片地址:"; ImageURL
'hasimagetext = Split(hasimagetext, "real_src")(UBound(Split(hasimagetext, "real_src")))
'ImageURL = Split(hasimagetext, """")(1)
End If
'输出题目内容到Word文档
Dim wdApp As Object
Dim Doc As Object
On Error Resume Next
Set wdApp = GetObject(, "Word.Application")
On Error GoTo 0
If Not wdApp Is Nothing Then
wdApp.Visible = True
On Error Resume Next
Set Doc = wdApp.documents(docName)
On Error GoTo 0
If Doc Is Nothing Then
Set Doc = wdApp.documents.Add()
Doc.SaveAs docPath
End If
Else
Set wdApp = CreateObject("Word.Application")
wdApp.Visible = True
If Dir(docPath) <> "" Then
Set Doc = wdApp.documents.Open(docPath)
Else
Set Doc = wdApp.documents.Add()
Doc.SaveAs docPath
End If
End If
Doc.Activate
wdApp.Selection.EndKey 6
wdApp.Selection.TypeParagraph
wdApp.Selection.InsertBreak 7
'输出题干内容
Debug.Print Subject
Subject = RegReplace(Subject, "(" & SubjectIndex & "[\..])") & "."
Debug.Print Subject
'Stop
wdApp.Selection.TypeText Text:=Subject
wdApp.Selection.TypeParagraph
'下载图片并插入WORD文档
If ImageURL <> "" Then
If InStr(ImageURL, "|") = 0 Then
ImagePath = ThisWorkbook.Path & Application.PathSeparator & "tmp.jpg"
DownloadImageName ImageURL, ImagePath
wdApp.Selection.InlineShapes.AddPicture Filename:=ImagePath, LinkToFile:=False, SaveWithDocument:=True
wdApp.Selection.TypeParagraph
Kill ImagePath
'Stop'
Else
ImageURLs = Split(ImageURL, "|")
For n = LBound(ImageURLs) To UBound(ImageURLs) Step 1
ImagePath = ThisWorkbook.Path & Application.PathSeparator & n & "tmp.jpg"
DownloadImageName ImageURLs(n), ImagePath
Debug.Print ImagePath
wdApp.Selection.InlineShapes.AddPicture Filename:=ImagePath, LinkToFile:=False, SaveWithDocument:=True
wdApp.Selection.TypeParagraph
Kill ImagePath
Next n
End If
End If
'输出问题内容
wdApp.Selection.TypeText Text:=RegReplace(Question, "([\((]\d[\))])")
wdApp.Selection.TypeParagraph
'输出答案内容
sp = RegGet(Answer, "([\((]" & questionIndex & "[\))]).*")
'Debug.Print Sp
If Len(sp) > 0 Then
Answer = Split(Answer, sp)(1)
sp = RegGet(Answer, "([\((]" & questionIndex + 1 & "[\))]).*")
If Len(sp) > 0 Then
Answer = Split(Answer, sp)(0)
End If
End If
Debug.Print Answer
Answer = RegReplace(Answer, "(【来源】.*)")
Answer = RegReplace(Answer, "(【解析】.*)")
Debug.Print Answer
'Stop
wdApp.Selection.TypeText Text:="【答案】" & Answer
wdApp.Selection.TypeParagraph
wdApp.Selection.TypeText Text:="[ 来源:" & Source & " 第" & SubjectIndex & "题 第(" & questionIndex & ")问 ]"
wdApp.Selection.TypeParagraph
Set wdApp = Nothing
Set Doc = Nothing
Set oneP = Nothing
End Sub
Private Function RegTest(ByVal OrgText As String, ByVal Pattern As String) As Boolean
'传递参数 :原字符串, 匹配模式
Dim Regex As Object
Set Regex = CreateObject("VBScript.RegExp")
With Regex
.Global = True
.Pattern = Pattern
End With
RegTest = Regex.test(OrgText)
Set Regex = Nothing
End Function
Public Function RegGet(ByVal OrgText As String, ByVal Pattern As String) As String
'传递参数 :原字符串, 匹配模式
Dim Regex As Object
Dim Mh As Object
Set Regex = CreateObject("VBScript.RegExp")
With Regex
.Global = True
.Pattern = Pattern
End With
If Regex.test(OrgText) Then
Set Mh = Regex.Execute(OrgText)
RegGet = Mh.Item(0).submatches(0)
Else
RegGet = ""
End If
Set Regex = Nothing
End Function
Sub SetFontRed(ByVal Rng As Range)
With Rng.Font
.Color = -16776961
.TintAndShade = 0
End With
End Sub
Public Function RegReplace(ByVal OrgText As String, ByVal Pattern As String, Optional RepStr As String = "") As String
'传递参数 :原字符串, 匹配模式 ,替换字符
Dim Regex As Object
Dim newText As String
Set Regex = CreateObject("VBScript.RegExp")
With Regex
.Global = True
.Pattern = Pattern
End With
newText = Regex.Replace(OrgText, RepStr)
RegReplace = newText
Set Regex = Nothing
End Function
Public Function RegGetArray(ByVal OrgText As String, ByVal Pattern As String) As String()
Dim Reg As Object, Mh As Object, OneMh As Object
Dim Arr() As String, Index As Long
Dim Elm As String
Set Reg = CreateObject("Vbscript.Regexp")
With Reg
.MultiLine = True
.Global = True
.Ignorecase = False
.Pattern = Pattern
Set Mh = .Execute(OrgText)
Index = 0
ReDim Arr(1 To 1)
For Each OneMh In Mh
Index = Index + 1
ReDim Preserve Arr(1 To Index)
'If OneMh.submatches(0) <> "" Then Elm = OneMh.submatches(0)
Arr(Index) = OneMh.submatches(0)
'Debug.Print OneMh.submatches(0)
Next OneMh
End With
RegGetArray = Arr
Set Reg = Nothing
Set Mh = Nothing
End Function
Function RealInnerHtml(ByVal OrgInnerHtml) As String
Dim x As String
x = OrgInnerHtml
x = Replace(x, "SPAN", "span")
x = Replace(x, "FONT-SIZE", "font-size")
x = Replace(x, "FONT-FAMILY", "font-family")
x = Replace(x, "FONT", "font")
x = Replace(x, "WBR", "wbr")
x = Replace(x, "COLOR", "color")
RealInnerHtml = x
End Function
Public Function RegGetLast(ByVal OrgText As String, ByVal Pattern As String) As String
'传递参数 :原字符串, 匹配模式
Dim Regex As Object
Dim Mh As Object
Set Regex = CreateObject("VBScript.RegExp")
With Regex
.Global = True
.Pattern = Pattern
End With
If Regex.test(OrgText) Then
Set Mh = Regex.Execute(OrgText)
'RegGetLast = Mh.Item(0).submatches(0)
For Each OneMh In Mh
RegGetLast = OneMh.submatches(0)
Next OneMh
Else
RegGetLast = ""
End If
Set Regex = Nothing
End Function