做RPA时遇到了这个问题,记录下
参考:Extract Data from PDF and Add to Worksheet(stackoverflow.com)。主要有3种方法:
- Adobe Library ByteScout PDF Extractor SDK 需要付费
- 用Word打开
- 通过命令行工具转换
对应代码
Adobe Library:
Function getTextFromPDF(ByVal strFilename As String) As String
    ' Returns the text of every page of a PDF, concatenated into one string,
    ' using Adobe Acrobat's own automation library.
    '
    ' strFilename: path of the PDF file to read.
    ' Returns:     the extracted text, or "" if the document could not be opened.
    '
    ' NOTE(review): the early-bound Acro* types presumably require a project
    ' reference to the Adobe Acrobat type library - confirm in Tools > References.
    Dim objAVDoc As New AcroAVDoc
    Dim objPDDoc As AcroPDDoc
    Dim objPage As AcroPDPage
    Dim objSelection As AcroPDTextSelect
    Dim objHighlight As AcroHiliteList
    Dim pageNum As Long
    Dim tCount As Long
    Dim strText As String

    strText = ""
    If (objAVDoc.Open(strFilename, "")) Then
        Set objPDDoc = objAVDoc.GetPDDoc
        ' Page indexes are zero-based.
        For pageNum = 0 To objPDDoc.GetNumPages() - 1
            Set objPage = objPDDoc.AcquirePage(pageNum)
            Set objHighlight = New AcroHiliteList
            ' Increase the second argument if not all text on a page is returned.
            objHighlight.Add 0, 10000
            Set objSelection = objPage.CreatePageHilite(objHighlight)
            ' A page without selectable text yields Nothing; skip it.
            If Not objSelection Is Nothing Then
                For tCount = 0 To objSelection.GetNumText - 1
                    strText = strText & objSelection.GetText(tCount)
                Next tCount
            End If
        Next pageNum
        ' NOTE(review): the argument 1 presumably means "close without saving" - confirm.
        objAVDoc.Close 1
    End If
    getTextFromPDF = strText
End Function
Word打开
- 将.pdf文件的默认打开程序设置为Microsoft Word。第一次使用Word打开.pdf文件时,会弹出一个对话框,提示Word将把该PDF转换为可编辑的Word文档(.docx)。
- 单击左下方指出“不再显示此消息”的复选框,然后单击“确定”。
- 创建一个宏,以从.docx文件中提取数据。
工具转换
Download Xpdf and XpdfReader(www.xpdfreader.com)— Download the Xpdf command line tools(下载Xpdf命令行工具):
Windows 32/64-bit: download
Sub ReadIntoExcel(PDFName As String)
    ' Converts a PDF to plain text with the Xpdf "pdftotext" command-line tool
    ' and writes the result into column A of the first worksheet of this
    ' workbook, one text line per row, starting at row 1.
    '
    ' PDFName: path of the PDF file to convert (with or without surrounding quotes).
    ' Raises a run-time error if pdftotext exits with a non-zero code.
    Const PDFTOTEXT_EXE As String = "C:\Utils\pdftotext.exe"
    Const TEMP_FILE As String = "tempfile.txt"

    Dim QuotedName As String
    Dim ExitCode As Long
    Dim TextLine As String
    Dim RowNumber As Long   ' Long: an Integer would overflow after 32767 lines
    Dim F1 As Integer

    ' Quote the path so file names containing spaces survive the command line;
    ' leave it untouched if the caller already quoted it.
    If Left$(PDFName, 1) = """" Then
        QuotedName = PDFName
    Else
        QuotedName = """" & PDFName & """"
    End If

    ' Convert PDF to text. VBA's Shell returns immediately, so the text file
    ' might not exist yet when it is opened below; WScript.Shell.Run with
    ' bWaitOnReturn = True blocks until pdftotext has finished.
    ExitCode = CreateObject("WScript.Shell").Run( _
        """" & PDFTOTEXT_EXE & """ -layout " & QuotedName & " " & TEMP_FILE, _
        0, True)
    If ExitCode <> 0 Then
        Err.Raise vbObjectError + 513, "ReadIntoExcel", _
            "pdftotext failed with exit code " & ExitCode & " for " & PDFName
    End If

    ' Read in the text file and write to Excel
    RowNumber = 1
    F1 = FreeFile()
    Open TEMP_FILE For Input As #F1
    Do While Not EOF(F1)   ' EOF takes the bare file number, without "#"
        Line Input #F1, TextLine
        ThisWorkbook.Worksheets(1).Cells(RowNumber, 1).Value = TextLine
        RowNumber = RowNumber + 1
    Loop
    Close #F1
End Sub