聊聊Spring AI Alibaba的PdfTablesParser

本文主要研究一下Spring AI Alibaba的PdfTablesParser

PdfTablesParser

community/document-parsers/spring-ai-alibaba-starter-document-parser-pdf-tables/src/main/java/com/alibaba/cloud/ai/parser/pdf/tables/PdfTablesParser.java

/**
 * {@link DocumentParser} that extracts the tables found on one page of a PDF and turns
 * every table row into a {@link Document}.
 * <p>
 * Processing happens in three steps: {@link #extraTableData(InputStream)} loads the PDF
 * and extracts the tables of the configured page, {@link #parseTables(List)} renders each
 * row as a {@code |}-separated string, and the rows are finally wrapped in
 * {@link Document} instances that carry the configured metadata.
 */
public class PdfTablesParser implements DocumentParser {

	/**
	 * The page number of the PDF file to be parsed (1-based). Default value is 1.
	 */
	private final Integer page;

	/**
	 * The metadata copied onto every produced document. Never {@code null}.
	 */
	private final Map<String, String> metadata;

	/**
	 * Creates a parser for the first page with no extra metadata.
	 */
	public PdfTablesParser() {

		this(1);
	}

	/**
	 * Creates a parser for the given page with no extra metadata.
	 * @param pageNumber the 1-based page number to extract tables from
	 */
	public PdfTablesParser(Integer pageNumber) {

		this(pageNumber, Map.of());
	}

	/**
	 * Creates a parser for the given page that attaches {@code metadata} to every
	 * produced document.
	 * @param pageNumber the 1-based page number to extract tables from
	 * @param metadata entries to add to each document; {@code null} is treated as empty
	 */
	public PdfTablesParser(Integer pageNumber, Map<String, String> metadata) {

		this.page = pageNumber;
		// A null map previously surfaced later as a NullPointerException in addMetadata.
		this.metadata = (metadata == null) ? Map.of() : metadata;
	}

	/**
	 * Parses the tables of the configured page into documents, one per table row.
	 * @param inputStream the PDF content
	 * @return the row documents; empty (never {@code null}) when the page has no tables
	 * @throws RuntimeException wrapping any failure raised while loading or extracting
	 */
	@Override
	public List<Document> parse(InputStream inputStream) {

		try {
			return data2Document(parseTables(extraTableData(inputStream)));
		}
		catch (Exception e) {
			throw new RuntimeException(e);
		}
	}

	/**
	 * Loads the PDF and extracts the tables of the configured page.
	 * @param in the PDF content
	 * @return the tables detected on the page
	 * @throws Exception if the PDF cannot be loaded or the page cannot be extracted
	 */
	protected List<Table> extraTableData(InputStream in) throws Exception {

		// try-with-resources: the loaded document must be closed even when validation
		// or extraction fails. Extraction completes before the document is closed.
		try (PDDocument document = PDDocument.load(in)) {

			// check pdf files
			int numberOfPages = document.getNumberOfPages();
			if (numberOfPages <= 0) {
				throw new RuntimeException("No page found in the PDF file.");
			}

			if (page == null || page < 1) {
				throw new RuntimeException("The page number must be greater than or equal to 1.");
			}

			if (page > numberOfPages) {
				throw new RuntimeException("The page number is greater than the number of pages in the PDF file.");
			}

			SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();

			// extract page by page numbers.
			Page extract = new ObjectExtractor(document).extract(this.page);

			return sea.extract(extract);
		}
	}

	/**
	 * Renders every row of every table as a single line: cell texts joined with
	 * {@code |} plus a trailing {@code |}. Carriage returns inside a cell are removed
	 * and line feeds are replaced by a space.
	 * @param data the extracted tables
	 * @return one string per table row, in table and row order
	 */
	protected List<String> parseTables(List<Table> data) {

		if (data.isEmpty()) {
			return Collections.emptyList();
		}

		return data.stream()
			.flatMap(table -> table.getRows()
				.stream()
				.map(cells -> cells.stream()
					.map(content -> content.getText().replace("\r", "").replace("\n", " "))
					.collect(Collectors.joining("|")) + "|"))
			.collect(Collectors.toList());
	}

	/**
	 * Wraps each row string in a {@link Document} carrying the configured metadata.
	 * @param data the rendered rows
	 * @return the documents; an empty list when there are no rows
	 */
	private List<Document> data2Document(List<String> data) {

		List<Document> documents = new ArrayList<>(data.size());

		for (String datum : data) {
			documents.add(addMetadata(new Document(datum)));
		}

		return documents;
	}

	/**
	 * Copies the configured metadata entries into the document's metadata map.
	 * @param document the document to enrich
	 * @return the same document instance
	 */
	private Document addMetadata(Document document) {

		if (!metadata.isEmpty()) {
			document.getMetadata().putAll(metadata);
		}

		return document;
	}

}

PdfTablesParser使用tabula来解析PDF:parse方法依次执行extraTableData、parseTables和data2Document。extraTableData方法通过PDDocument加载文档,并使用SpreadsheetExtractionAlgorithm从指定页(默认第1页)提取出List<Table>;parseTables将每个表格的每一行拼接成以“|”分隔的字符串,得到List<String>;data2Document则把每个字符串包装成一个Document并附加metadata,得到List<Document>。

示例

/**
 * Tests for {@link PdfTablesParser}, plus a plain tabula-java walkthrough for reference.
 */
class PdfTablesParserTests {

	private Resource resource;

	private Resource resource2;

	@BeforeEach
	void setUp() {

		DefaultResourceLoader loader = new DefaultResourceLoader();
		resource = requireExisting(loader.getResource("classpath:/pdf-tables.pdf"));
		resource2 = requireExisting(loader.getResource("classpath:/sample1.pdf"));
	}

	/**
	 * Fails fast when a test fixture is missing from the classpath.
	 */
	private static Resource requireExisting(Resource candidate) {

		if (!candidate.exists()) {
			throw new RuntimeException("Resource not found: " + candidate);
		}
		return candidate;
	}

	/**
	 * Runs the parser over the resource and closes the file stream afterwards.
	 */
	private static List<Document> parse(PdfTablesParser parser, Resource source) throws IOException {

		try (InputStream in = new FileInputStream(source.getFile())) {
			return parser.parse(in);
		}
	}

	/**
	 * Concatenates the document texts, terminating each with a line feed.
	 */
	private static String joinLines(List<Document> docs) {

		StringBuilder sb = new StringBuilder();
		docs.forEach(doc -> sb.append(doc.getText()).append("\n"));
		return sb.toString();
	}

	/**
	 * tabula-java use.
	 */
	@Test
	void PdfTableTest() throws IOException {

		// Both the file stream and the loaded document are closed when the block exits.
		try (InputStream in = new FileInputStream(resource.getFile()); PDDocument document = PDDocument.load(in)) {
			SpreadsheetExtractionAlgorithm sea = new SpreadsheetExtractionAlgorithm();
			PageIterator pi = new ObjectExtractor(document).extract();
			while (pi.hasNext()) {
				// iterate over the pages of the document
				Page page = pi.next();
				List<Table> tables = sea.extract(page);
				// iterate over the tables of the page
				for (Table table : tables) {
					List<List<RectangularTextContainer>> rows = table.getRows();
					// iterate over the rows of the table
					for (List<RectangularTextContainer> cells : rows) {
						// print all column-cells of the row plus linefeed
						for (RectangularTextContainer content : cells) {
							// Note: Cell.getText() uses \r to concat text chunk
							String text = content.getText().replace("\r", " ");
							System.out.print(text + "|");
						}
						System.out.println();
					}
				}
			}
		}

	}

	@Test
	void PdfTablesParseTest() throws IOException {

		String res = """
				|name|age|sex|
				|zhangsan|20|m|
				|lisi|21|w|
				|wangwu|22|m|
				|zhangliu|23|w|
				|songqi|24|w|
				""";

		List<Document> docs = parse(new PdfTablesParser(), resource);

		// The first document is excluded from the comparison, so start at index 1.
		Assert.equals(res, joinLines(docs.subList(1, docs.size())));
	}

	@Test
	void PdfTablesParseTest2() throws IOException {

		String res = """
				Sample Date:|May 2001|
				Prepared by:|Accelio Present Applied Technology|
				Created and Tested Using:|•Accelio Present Central 5.4•Accelio Present Output Designer 5.4|
				Features Demonstrated:|•Primary bookmarks in a PDF file.•Secondary bookmarks in a PDF file.|
				""";

		List<Document> docs = parse(new PdfTablesParser(), resource2);

		Assert.equals(res, joinLines(docs));

	}

	@Test
	void PdfTablesParseTest3() throws IOException {

		String res = """
				|Filename|||escription|escription||
				|||||||
				ap_bookmark.IFD|The template design.||||||
				ap_bookmark.mdf|The template targeted for PDF output.||||||
				ap_bookmark.dat|A sample data file in DAT format.||||||
				ap_bookmark.bmk|A sample bookmark file.||||||
				ap_bookmark.pdf|Sample PDF output.||||||
				ap_bookmark_doc.pdf|A document describing the sample.||||||
				|To bookmark by|Use the command line parameter||
				|Invoices|-abmkap_bookmark.bmk -abmsinvoices||
				|Type|-abmkap_bookmark.bmk -abmstype||
				|Amount|-abmkap_bookmark.bmk -abmsamount||
				""";

		List<Document> docs = parse(new PdfTablesParser(3), resource2);

		Assert.equals(res, joinLines(docs));

	}

}

小结

Spring AI Alibaba的spring-ai-alibaba-starter-document-parser-pdf-tables提供了PdfTablesParser用于解析pdf文件中的表格数据到Document。

doc

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值