1.表格类的PDF文件,可以通过tabula解析
maven 依赖
<dependency>
<groupId>technology.tabula</groupId>
<artifactId>tabula</artifactId>
<version>1.0.3</version>
<exclusions>
<exclusion>
<artifactId>slf4j-simple</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>
片段代码
/**
 * Runs tabula over an in-memory PDF and returns whatever the extractor wrote as text.
 *
 * <p>The PDF bytes are Base64-encoded and passed as the single positional argument of a
 * tabula command line; {@code CommandLineAppExt} decodes that argument again before loading
 * the document.
 *
 * @param bytes raw content of the PDF file
 * @return the text accumulated by the extractor; if parsing or extraction raises a
 *         {@code ParseException}, the stack trace is printed and whatever was written so far
 *         (possibly nothing) is returned
 */
public String getContent(byte[] bytes) {
    StringBuilder output = new StringBuilder();
    // NOTE(review): option meanings are defined by tabula's CLI (presumably JSON output,
    // all pages, lattice mode) — confirm against CommandLineApp.buildOptions().
    String[] cliArgs = {"-f=JSON", "-p=all", Base64.encodeBase64String(bytes), "-l"};
    try {
        CommandLine commandLine = new DefaultParser().parse(CommandLineApp.buildOptions(), cliArgs);
        new CommandLineAppExt(output, commandLine).extractTables(commandLine);
    } catch (ParseException e) {
        e.printStackTrace();
    }
    return output.toString();
}
/**
 * Variant of tabula's {@code CommandLineApp} that reads the PDF from a byte array instead of
 * a file.
 *
 * <p>The single positional command-line argument is expected to hold the Base64-encoded PDF
 * content rather than a file name.
 */
public class CommandLineAppExt extends CommandLineApp {

    /**
     * Creates the extractor. Callers construct this class as
     * {@code new CommandLineAppExt(output, commandLine)}, so a two-argument constructor is
     * required.
     *
     * <p>NOTE(review): assumes the parent class exposes a matching
     * {@code (Appendable, CommandLine)} constructor — confirm against the tabula version in use.
     *
     * @param defaultOutput sink that receives the extracted tables
     * @param line          parsed tabula command line
     * @throws ParseException if the parent rejects the command line
     */
    public CommandLineAppExt(Appendable defaultOutput, CommandLine line) throws ParseException {
        super(defaultOutput, line);
    }

    /**
     * Decodes the single positional argument from Base64 and extracts tables from those bytes.
     *
     * @param line parsed command line; must carry exactly one positional argument
     * @throws ParseException if the argument count is not one, or extraction fails
     */
    @Override
    public void extractTables(CommandLine line) throws ParseException {
        if (line.getArgs().length != 1) {
            throw new ParseException("Need exactly one filename\nTry --help for help");
        }
        byte[] decode = Base64.decodeBase64(line.getArgs()[0]);
        this.extractFileTables(decode);
    }

    /**
     * Extracts tables from the given PDF content and writes them to the default output.
     *
     * @param bytes raw content of the PDF file
     * @throws ParseException if the document cannot be read or the tables cannot be written
     */
    public void extractFileTables(byte[] bytes) throws ParseException {
        this.extractFile(bytes, this.defaultOutput);
    }

    /**
     * Loads the PDF from {@code bytes}, collects the tables of every page returned by the page
     * iterator and writes them to {@code outFile}. The document is closed in all cases.
     *
     * @param bytes   raw content of the PDF file
     * @param outFile sink that receives the extracted tables
     * @throws ParseException wrapping any {@code IOException} raised while loading, iterating
     *                        or writing; the original exception is attached as the cause
     */
    // Raw ArrayList is kept on purpose: the element type is not among the names this snippet
    // has in scope, and the collected list is handed straight to writeTables.
    @SuppressWarnings({"rawtypes", "unchecked"})
    private void extractFile(byte[] bytes, Appendable outFile) throws ParseException {
        PDDocument pdfDocument = null;
        try {
            pdfDocument = this.password == null
                    ? PDDocument.load(bytes)
                    : PDDocument.load(bytes, this.password);
            PageIterator pageIterator = this.getPageIterator(pdfDocument);
            ArrayList tables = new ArrayList();
            while (pageIterator.hasNext()) {
                Page page = pageIterator.next();
                if (this.pageAreas == null) {
                    // No areas configured: extract from the whole page.
                    tables.addAll(this.tableExtractor.extractTables(page));
                    continue;
                }
                for (Pair<Integer, Rectangle> areaPair : this.pageAreas) {
                    Rectangle area = areaPair.getRight();
                    // NOTE(review): mode 0 appears to mean the area is given in percent of the
                    // page size, since it is rescaled by page height/width here — confirm.
                    if (areaPair.getLeft() == 0) {
                        area = new Rectangle(
                                (float) ((double) (area.getTop() / 100.0F) * page.getHeight()),
                                (float) ((double) (area.getLeft() / 100.0F) * page.getWidth()),
                                (float) (area.getWidth() / 100.0D * page.getWidth()),
                                (float) (area.getHeight() / 100.0D * page.getHeight()));
                    }
                    tables.addAll(this.tableExtractor.extractTables(page.getArea(area)));
                }
            }
            this.writeTables(tables, outFile);
        } catch (IOException e) {
            // ParseException only offers a message constructor, so attach the cause explicitly
            // to keep the original stack trace.
            ParseException wrapped = new ParseException(e.getMessage());
            wrapped.initCause(e);
            throw wrapped;
        } finally {
            try {
                if (pdfDocument != null) {
                    pdfDocument.close();
                }
            } catch (IOException closeFailure) {
                // A failure to close must not hide the extraction result or error.
                System.out.println("Error in closing pdf document" + closeFailure);
            }
        }
    }
}
说明:CommandLineApp类本身只支持传入File文件类型,不支持传入byte[]类型数据;因此这里让CommandLineAppExt类继承CommandLineApp类并做了改造,使其可以接收byte[]类型参数(byte[]先经Base64编码后作为命令行参数传入,再在extractTables方法中解码)。
2.如果PDF文件不是表格,而是平铺的文档,可以用spire解析
maven 依赖
<dependency>
<groupId>e-iceblue</groupId>
<artifactId>spire.pdf.free</artifactId>
<version>3.9.0</version>
</dependency>
<dependency>
<groupId>commons-cli</groupId>
<artifactId>commons-cli</artifactId>
<version>1.4</version>
</dependency>
片段代码
/**
 * Extracts the text of every page of an in-memory PDF using Spire.PDF and concatenates it
 * in page order.
 *
 * @param bytes raw content of the PDF file
 * @return the text of all pages appended one after another, with no separator added
 */
public String getContent(byte[] bytes) {
    PdfDocument doc = new PdfDocument();
    try {
        doc.loadFromBytes(bytes);
        StringBuilder text = new StringBuilder();
        int pageCount = doc.getPages().getCount();
        for (int i = 0; i < pageCount; i++) {
            // Fetch the page object for each page.
            PdfPageBase page = doc.getPages().get(i);
            // NOTE(review): the meaning of the boolean flag is defined by Spire.PDF — confirm.
            text.append(page.extractText(true));
        }
        return text.toString();
    } finally {
        // Release the document even when loading or text extraction throws.
        doc.close();
    }
}