序
本文主要研究一下Spring AI Alibaba的PdfTablesParser
PdfTablesParser
community/document-parsers/spring-ai-alibaba-starter-document-parser-pdf-tables/src/main/java/com/alibaba/cloud/ai/parser/pdf/tables/PdfTablesParser.java
/**
 * {@link DocumentParser} that extracts the tables found on one page of a PDF file
 * (using tabula's {@link SpreadsheetExtractionAlgorithm}) and converts every table row
 * into a {@link Document}.
 *
 * <p>
 * Each row is rendered as a single line of text in which every cell is followed by a
 * {@code |} separator, e.g. {@code name|age|sex|}. The configured metadata entries are
 * copied into every produced document.
 */
public class PdfTablesParser implements DocumentParser {

    /**
     * The page number of the PDF file to be parsed. Default value is 1.
     */
    private final Integer page;

    /**
     * The metadata of the PDF file to be parsed. Never {@code null}.
     */
    private final Map<String, String> metadata;

    /**
     * Creates a parser for page 1 without additional metadata.
     */
    public PdfTablesParser() {
        this(1);
    }

    /**
     * Creates a parser for the given page without additional metadata.
     * @param pageNumber the page to extract tables from (the first page is 1)
     */
    public PdfTablesParser(Integer pageNumber) {
        this(pageNumber, Map.of());
    }

    /**
     * Creates a parser for the given page that adds {@code metadata} to every document.
     * @param pageNumber the page to extract tables from (the first page is 1)
     * @param metadata entries copied into each produced document; {@code null} is
     * treated as "no metadata"
     */
    public PdfTablesParser(Integer pageNumber, Map<String, String> metadata) {
        this.page = pageNumber;
        // Normalise null so addMetadata() cannot fail long after construction.
        this.metadata = (metadata != null) ? metadata : Map.of();
    }

    /**
     * Parses the tables of the configured page into documents, one per table row.
     *
     * <p>
     * The stream itself is not closed by this method; it remains owned by the caller.
     * @param inputStream the PDF content
     * @return one document per table row; an empty list when the page has no tables
     * @throws RuntimeException wrapping any failure raised while loading or parsing
     */
    @Override
    public List<Document> parse(InputStream inputStream) {
        try {
            return data2Document(parseTables(extraTableData(inputStream)));
        }
        catch (Exception e) {
            throw new RuntimeException(e);
        }
    }

    /**
     * Loads the PDF and extracts the tables of the configured page.
     * @param in the PDF content
     * @return the tables detected on the page
     * @throws Exception if the PDF cannot be loaded, has no pages, or the configured
     * page number is missing, smaller than 1 or beyond the last page
     */
    protected List<Table> extraTableData(InputStream in) throws Exception {
        // Reject an unusable page number up front instead of failing with an
        // unboxing NullPointerException or a library-specific error later on.
        if (page == null || page < 1) {
            throw new RuntimeException("The page number must be 1 or greater, but was: " + page);
        }

        // try-with-resources: the loaded document must be released on every path.
        // NOTE(review): the returned tables are expected to carry already-extracted
        // text, so closing the document here should be safe - confirm against the
        // tabula version in use.
        try (PDDocument document = PDDocument.load(in)) {
            // check pdf files
            int numberOfPages = document.getNumberOfPages();
            if (numberOfPages <= 0) {
                throw new RuntimeException("No page found in the PDF file.");
            }
            if (page > numberOfPages) {
                throw new RuntimeException("The page number is greater than the number of pages in the PDF file.");
            }

            // extract page by page numbers.
            Page extract = new ObjectExtractor(document).extract(this.page);
            return new SpreadsheetExtractionAlgorithm().extract(extract);
        }
    }

    /**
     * Flattens the tables into one text line per row.
     *
     * <p>
     * Within a cell, carriage returns are removed and line feeds become spaces; every
     * cell is followed by {@code |}, so an empty row yields just {@code "|"}.
     * @param data the extracted tables
     * @return the rendered rows in table order; empty when there are no tables
     */
    protected List<String> parseTables(List<Table> data) {
        if (data == null || data.isEmpty()) {
            return Collections.emptyList();
        }
        return data.stream()
            .flatMap(table -> table.getRows()
                .stream()
                .map(cells -> cells.stream()
                    .map(content -> content.getText().replace("\r", "").replace("\n", " "))
                    .collect(Collectors.joining("|")) + "|"))
            .collect(Collectors.toList());
    }

    /**
     * Wraps every rendered row in a {@link Document} and attaches the metadata.
     * @param data the rendered rows
     * @return one document per row; an empty list (never {@code null}) for no rows
     */
    private List<Document> data2Document(List<String> data) {
        List<Document> documents = new ArrayList<>(data.size());
        for (String datum : data) {
            documents.add(addMetadata(new Document(datum)));
        }
        return documents;
    }

    /**
     * Copies the configured metadata entries into the given document.
     * @param document the document to enrich
     * @return the same document instance
     */
    private Document addMetadata(Document document) {
        // Skip the call entirely when there is nothing to add, as the original did.
        if (!metadata.isEmpty()) {
            document.getMetadata().putAll(metadata);
        }
        return document;
    }

}
PdfTablesParser使用tabula来解析pdf,它先执行extraTableData、再执行parseTables,最后执行data2Document;extraTableData方法使用SpreadsheetExtractionAlgorithm将指定页解析为`List<Table>`,parseTables则将`List<Table>`转换为`List<String>`,data2Document方法则将`List<String>`转换为`List<Document>`。
示例
/**
 * Tests for {@link PdfTablesParser}, plus a sample showing direct use of tabula-java.
 */
class PdfTablesParserTests {

    private Resource resource;

    private Resource resource2;

    /**
     * Resolves both sample PDFs from the classpath and fails fast if either is missing,
     * so a missing fixture is reported clearly instead of as an I/O error inside a test.
     */
    @BeforeEach
    void setUp() {
        DefaultResourceLoader loader = new DefaultResourceLoader();
        resource = loader.getResource("classpath:/pdf-tables.pdf");
        resource2 = loader.getResource("classpath:/sample1.pdf");
        if (!resource.exists()) {
            throw new RuntimeException("Resource not found: " + resource);
        }
        if (!resource2.exists()) {
            throw new RuntimeException("Resource not found: " + resource2);
        }
    }

    /**
     * tabula-java use: prints every table cell of every page, one table row per line.
     */
    @Test
    void PdfTableTest() throws IOException {
        // Both the file stream and the loaded document are closed when the block exits.
        try (InputStream in = new FileInputStream(resource.getFile());
                PDDocument document = PDDocument.load(in)) {
            SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
            PageIterator pi = new ObjectExtractor(document).extract();
            while (pi.hasNext()) {
                // iterate over the pages of the document
                Page page = pi.next();
                List<Table> tables = sea.extract(page);
                // iterate over the tables of the page
                for (Table table : tables) {
                    List<List<RectangularTextContainer>> rows = table.getRows();
                    // iterate over the rows of the table
                    for (List<RectangularTextContainer> cells : rows) {
                        // print all column-cells of the row plus linefeed
                        for (RectangularTextContainer content : cells) {
                            // Note: Cell.getText() uses \r to concat text chunk
                            String text = content.getText().replace("\r", " ");
                            System.out.print(text + "|");
                        }
                        System.out.println();
                    }
                }
            }
        }
    }

    /**
     * Parses page 1 of the first sample; the first produced document is skipped before
     * comparing the remaining rows with the expected table.
     */
    @Test
    void PdfTablesParseTest() throws IOException {
        String res = """
                |name|age|sex|
                |zhangsan|20|m|
                |lisi|21|w|
                |wangwu|22|m|
                |zhangliu|23|w|
                |songqi|24|w|
                """;
        try (InputStream in = new FileInputStream(resource.getFile())) {
            PdfTablesParser pdfTablesParser = new PdfTablesParser();
            List<Document> docs = pdfTablesParser.parse(in);
            StringBuilder sb = new StringBuilder();
            docs.subList(1, docs.size()).forEach(doc -> sb.append(doc.getText()).append("\n"));
            Assert.equals(res, sb.toString());
        }
    }

    /**
     * Parses page 1 (the default) of the second sample and compares all rows.
     */
    @Test
    void PdfTablesParseTest2() throws IOException {
        String res = """
                Sample Date:|May 2001|
                Prepared by:|Accelio Present Applied Technology|
                Created and Tested Using:|•Accelio Present Central 5.4•Accelio Present Output Designer 5.4|
                Features Demonstrated:|•Primary bookmarks in a PDF file.•Secondary bookmarks in a PDF file.|
                """;
        try (InputStream in = new FileInputStream(resource2.getFile())) {
            PdfTablesParser pdfTablesParser = new PdfTablesParser();
            List<Document> docs = pdfTablesParser.parse(in);
            StringBuilder sb = new StringBuilder();
            docs.forEach(doc -> sb.append(doc.getText()).append("\n"));
            Assert.equals(res, sb.toString());
        }
    }

    /**
     * Parses page 3 of the second sample and compares all rows.
     */
    @Test
    void PdfTablesParseTest3() throws IOException {
        String res = """
                |Filename|||escription|escription||
                |||||||
                ap_bookmark.IFD|The template design.||||||
                ap_bookmark.mdf|The template targeted for PDF output.||||||
                ap_bookmark.dat|A sample data file in DAT format.||||||
                ap_bookmark.bmk|A sample bookmark file.||||||
                ap_bookmark.pdf|Sample PDF output.||||||
                ap_bookmark_doc.pdf|A document describing the sample.||||||
                |To bookmark by|Use the command line parameter||
                |Invoices|-abmkap_bookmark.bmk -abmsinvoices||
                |Type|-abmkap_bookmark.bmk -abmstype||
                |Amount|-abmkap_bookmark.bmk -abmsamount||
                """;
        try (InputStream in = new FileInputStream(resource2.getFile())) {
            PdfTablesParser pdfTablesParser = new PdfTablesParser(3);
            List<Document> docs = pdfTablesParser.parse(in);
            StringBuilder sb = new StringBuilder();
            docs.forEach(doc -> sb.append(doc.getText()).append("\n"));
            Assert.equals(res, sb.toString());
        }
    }

}
小结
Spring AI Alibaba的spring-ai-alibaba-starter-document-parser-pdf-tables模块提供了PdfTablesParser,用于将pdf文件中的表格数据解析为Document。