1.Spire.PDF地址
Java 提取或读取 PDF 文本内容
2.实现步骤
1. 在项目的 pom.xml 中引入 Maven 依赖
<!-- Maven coordinates of the Spire.PDF library (group e-iceblue, artifact spire.pdf), pinned to version 10.3.4. -->
<dependency>
<groupId>e-iceblue</groupId>
<artifactId>spire.pdf</artifactId>
<version>10.3.4</version>
</dependency>
<!-- Extra Maven repository (id com.e-iceblue); the spire.pdf artifact is presumably resolved from here - confirm it is not available from Maven Central before removing. -->
<repositories>
<repository>
<id>com.e-iceblue</id>
<name>e-iceblue</name>
<url>https://repo.e-iceblue.cn/repository/maven-public/</url>
</repository>
</repositories>
2. 从本地 PDF 文件中提取文本
package com.XXX;
import com.spire.pdf.PdfDocument;
import com.spire.pdf.PdfPageBase;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
/**
 * Demo entry point that prints the text of every page of a local PDF file using Spire.PDF.
 */
public class PdfTestPoi {

    /**
     * Loads the PDF at the hard-coded local path, extracts the text of each page in page
     * order and prints the concatenated result to standard output.
     *
     * @param args command-line arguments (unused)
     * @throws IOException kept from the original signature for backward compatibility
     */
    public static void main(String[] args) throws IOException {
        // Create the PdfDocument instance.
        PdfDocument doc = new PdfDocument();
        try {
            // Load the PDF file.
            doc.loadFromFile("本地文件地址");
            System.out.println(extractAllText(doc));
        } finally {
            // Always release the document, even when loading or extraction throws.
            doc.close();
        }
    }

    /**
     * Concatenates the extracted text of every page of the given document, in page order.
     *
     * @param doc an already loaded document
     * @return the text of all pages appended one after another, without extra separators
     */
    private static StringBuilder extractAllText(PdfDocument doc) {
        StringBuilder text = new StringBuilder();
        int pageCount = doc.getPages().getCount();
        // Walk the pages and append each page's text to the builder.
        for (int i = 0; i < pageCount; i++) {
            PdfPageBase page = doc.getPages().get(i);
            text.append(page.extractText(true));
        }
        return text;
    }
}
3. 从网络 PDF 资源中提取文本
package com.vts;
import com.spire.pdf.PdfDocument;
import com.spire.pdf.PdfPageBase;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.net.URLConnection;
/**
 * Demo entry point that downloads a PDF from a URL and prints the text of every page
 * using Spire.PDF.
 */
public class PdfTestPoi {

    /**
     * Opens the hard-coded URL, loads the PDF from the response stream, extracts the text
     * of each page in page order and prints the concatenated result to standard output.
     *
     * @param args command-line arguments (unused)
     * @throws IOException if the URL is malformed or the connection/stream cannot be opened or read
     */
    public static void main(String[] args) throws IOException {
        URL url = new URL("网络资源地址");
        URLConnection urlConnection = url.openConnection();
        // try-with-resources closes the network stream on every exit path.
        try (InputStream inputStream = urlConnection.getInputStream()) {
            // Create the PdfDocument instance.
            PdfDocument doc = new PdfDocument();
            try {
                // Load the PDF from the network stream.
                doc.loadFromStream(inputStream);
                System.out.println(extractAllText(doc));
            } finally {
                // Release the document while the stream is still open, matching the
                // original close order, and do so even when loading or extraction throws.
                doc.close();
            }
        }
    }

    /**
     * Concatenates the extracted text of every page of the given document, in page order.
     *
     * @param doc an already loaded document
     * @return the text of all pages appended one after another, without extra separators
     */
    private static StringBuilder extractAllText(PdfDocument doc) {
        StringBuilder text = new StringBuilder();
        int pageCount = doc.getPages().getCount();
        // Walk the pages and append each page's text to the builder.
        for (int i = 0; i < pageCount; i++) {
            PdfPageBase page = doc.getPages().get(i);
            text.append(page.extractText(true));
        }
        return text;
    }
}