文件依赖
<!--
  tabula: the PDF table extraction library used by the example code.
  The slf4j-simple artifact it would otherwise bring in is excluded
  (presumably to avoid clashing with the project's own SLF4J binding; confirm).
-->
<dependency>
<groupId>technology.tabula</groupId>
<artifactId>tabula</artifactId>
<version>1.0.3</version>
<exclusions>
<exclusion>
<artifactId>slf4j-simple</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>
此外还需要引入 fastjson,用于把 tabula 返回的字符串解析为 JSON;示例代码还用到了 commons-lang3 的 StringUtils。
表格每一行数据对应的实体对象(Entry)
/**
 * Plain mutable holder for five string values: {@code num} and
 * {@code data1} through {@code data4}.
 *
 * <p>Values are stored and returned unchanged; unset values are {@code null}.
 */
public class Entry {
    String num;
    String data1;
    String data2;
    String data3;
    String data4;

    /** @return the current {@code num} value, or {@code null} if never set */
    public String getNum() {
        return this.num;
    }

    /** @return the current {@code data1} value, or {@code null} if never set */
    public String getData1() {
        return this.data1;
    }

    /** @return the current {@code data2} value, or {@code null} if never set */
    public String getData2() {
        return this.data2;
    }

    /** @return the current {@code data3} value, or {@code null} if never set */
    public String getData3() {
        return this.data3;
    }

    /** @return the current {@code data4} value, or {@code null} if never set */
    public String getData4() {
        return this.data4;
    }

    /** @param num the new {@code num} value */
    public void setNum(String num) {
        this.num = num;
    }

    /** @param data1 the new {@code data1} value */
    public void setData1(String data1) {
        this.data1 = data1;
    }

    /** @param data2 the new {@code data2} value */
    public void setData2(String data2) {
        this.data2 = data2;
    }

    /** @param data3 the new {@code data3} value */
    public void setData3(String data3) {
        this.data3 = data3;
    }

    /** @param data4 the new {@code data4} value */
    public void setData4(String data4) {
        this.data4 = data4;
    }

    /**
     * Renders all five values in the form
     * {@code Entry{num='..', data1='..', data2='..', data3='..', data4='..'}}.
     *
     * @return the textual representation; unset values appear as {@code null}
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Entry{");
        text.append("num='").append(num).append('\'');
        text.append(", data1='").append(data1).append('\'');
        text.append(", data2='").append(data2).append('\'');
        text.append(", data3='").append(data3).append('\'');
        text.append(", data4='").append(data4).append('\'');
        text.append('}');
        return text.toString();
    }
}
实际操作方法
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.ParseException;
import org.apache.commons.lang3.StringUtils;
import technology.tabula.CommandLineApp;
import java.util.ArrayList;
import java.util.List;
/**
 * Example program: runs tabula on a PDF, captures the JSON it produces and
 * converts the table rows into {@link Entry} objects.
 */
public class PdfTableParser {

    /** Cells 0 through 5 of every row are read, so a usable row needs at least this many cells. */
    private static final int REQUIRED_CELLS = 6;

    /**
     * Extracts the tables with a fixed set of tabula arguments, prints the raw
     * JSON and then prints one line per resulting {@link Entry}.
     *
     * @param args not used; the tabula arguments are hard-coded in this example
     * @throws ParseException if the tabula arguments cannot be parsed or tabula
     *                        reports a problem while extracting
     */
    public static void main(String[] args) throws ParseException {
        // -f  output format; defaults to CSV (the value must be upper case)
        // -p  which page(s) to extract; "all" means every page
        // -l  force lattice-mode extraction (this is the important option here)
        // The remaining argument is the path of the PDF file.
        String[] argsa = new String[]{"-f=JSON", "-p=2", "F:\\文字文稿1_20241113090441.pdf", "-l"};

        CommandLineParser parser = new DefaultParser();
        CommandLine cmd = parser.parse(CommandLineApp.buildOptions(), argsa);

        // tabula appends its output to the supplied StringBuilder instead of writing a file.
        StringBuilder stringBuilder = new StringBuilder();
        new CommandLineApp(stringBuilder, cmd).extractTables(cmd);
        String dataString = stringBuilder.toString();
        System.out.println("打印返回数据: "+ dataString);

        List<Entry> listInfo = parseEntries(dataString);
        listInfo.forEach(System.out::println);
    }

    /**
     * Converts tabula's JSON output into entries.
     *
     * <p>The JSON is an array of tables; each table has a {@code "data"} array
     * of rows and each row is an array of cell objects carrying a
     * {@code "text"} value. Header rows (first cell contains "序号"), rows with
     * fewer than {@link #REQUIRED_CELLS} cells and rows whose {@code num} value
     * is blank are skipped.
     *
     * @param json the JSON text produced by tabula; may be empty
     * @return the entries in document order, never {@code null}
     */
    private static List<Entry> parseEntries(String json) {
        List<Entry> entries = new ArrayList<>();
        JSONArray tables = JSONArray.parseArray(json);
        if (tables == null) {
            // Nothing was extracted (empty output).
            return entries;
        }
        for (int t = 0; t < tables.size(); t++) {
            JSONArray rows = tables.getJSONObject(t).getJSONArray("data");
            if (rows == null) {
                continue;
            }
            for (int r = 0; r < rows.size(); r++) {
                JSONArray cells = rows.getJSONArray(r);
                // Guard against empty or short rows instead of failing on a fixed index.
                if (cells == null || cells.size() < REQUIRED_CELLS) {
                    continue;
                }
                // A first cell containing "序号" marks the header row.
                if (cellText(cells, 0).contains("序号")) {
                    continue;
                }
                // NOTE(review): num is read from cell 1, not from the "序号" cell 0.
                // This matches the published sample output; confirm it is intended.
                String num = cellText(cells, 1);
                // Filter blank rows here rather than removing from the list while
                // iterating over it by index, which skips the element after each removal.
                if (StringUtils.isBlank(num)) {
                    continue;
                }
                Entry info = new Entry();
                info.setNum(num);
                info.setData1(cellText(cells, 2));
                info.setData2(cellText(cells, 3));
                info.setData3(cellText(cells, 4));
                info.setData4(cellText(cells, 5));
                entries.add(info);
            }
        }
        return entries;
    }

    /**
     * Returns the text of one cell with carriage returns removed, since
     * multi-line cell text arrives joined by {@code \r}.
     *
     * @param row   the row's cell array
     * @param index zero-based cell position within the row
     * @return the cleaned text, or an empty string when the cell has no text value
     */
    private static String cellText(JSONArray row, int index) {
        Object text = row.getJSONObject(index).get("text");
        return text == null ? "" : text.toString().replace("\r", "");
    }
}
文件内容
输出结果
[main] INFO org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB - To get higher rendering speed on JDK8 or later,
[main] INFO org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB - use the option -Dsun.java2d.cmm=sun.java2d.cmm.kcms.KcmsServiceProvider
[main] INFO org.apache.pdfbox.pdmodel.graphics.color.PDDeviceRGB - or call System.setProperty("sun.java2d.cmm", "sun.java2d.cmm.kcms.KcmsServiceProvider")
打印返回数据: [{"extraction_method":"lattice","top":87.67198,"left":84.46285,"width":426.4116516113281,"height":110.67039489746094,"right":510.8745,"bottom":198.34238,"data":[[{"top":87.67198,"left":84.46285,"width":71.13715362548828,"height":16.268020629882812,"text":"序号"},{"top":87.67198,"left":155.6,"width":71.00001525878906,"height":16.268020629882812,"text":"测试"},{"top":87.67198,"left":226.60002,"width":70.99998474121094,"height":16.268020629882812,"text":"内容 1"},{"top":87.67198,"left":297.6,"width":71.0,"height":16.268020629882812,"text":"内容 2"},{"top":87.67198,"left":368.6,"width":71.04998779296875,"height":16.268020629882812,"text":"内容三"},{"top":87.67198,"left":439.65,"width":71.22451782226562,"height":16.268020629882812,"text":"内容 4"}],[{"top":103.94,"left":84.46285,"width":71.13715362548828,"height":47.29998779296875,"text":"1"},{"top":103.94,"left":155.6,"width":71.00001525878906,"height":47.29998779296875,"text":"123\rasas\r789"},{"top":103.94,"left":226.60002,"width":70.99998474121094,"height":47.29998779296875,"text":"qwe\rqwe\rqwe"},{"top":103.94,"left":297.6,"width":71.0,"height":47.29998779296875,"text":"asd\rasd\rasd"},{"top":103.94,"left":368.6,"width":71.04998779296875,"height":47.29998779296875,"text":"dfg\rdfg\rdfg"},{"top":103.94,"left":439.65,"width":71.22451782226562,"height":47.29998779296875,"text":"123\r456\r789"}],[{"top":151.23999,"left":84.46285,"width":71.13715362548828,"height":47.102386474609375,"text":"2"},{"top":151.23999,"left":155.6,"width":71.00001525878906,"height":47.102386474609375,"text":"qwfqfw\rqwfq\rqwqw"},{"top":151.23999,"left":226.60002,"width":70.99998474121094,"height":47.102386474609375,"text":"sassaas\rsaas\rsasas"},{"top":151.23999,"left":297.6,"width":71.0,"height":47.102386474609375,"text":"zxc\rzxc\rzxc"},{"top":151.23999,"left":368.6,"width":71.04998779296875,"height":47.102386474609375,"text":"cvb\rcvb\rcvb"},{"top":151.23999,"left":439.65,"width":71.22451782226562,"height":47.102386474609375,"text":"rty\rrty\rrty
"}]]}]
Entry{num='123asas789', data1='qweqweqwe', data2='asdasdasd', data3='dfgdfgdfg', data4='123456789'}
Entry{num='qwfqfwqwfqqwqw', data1='sassaassaassasas', data2='zxczxczxc', data3='cvbcvbcvb', data4='rtyrtyrty'}
Process finished with exit code 0