PDF解析

pom依赖

<!-- Declares the Spire.PDF for Java library (artifact spire.pdf, version 8.7.0). -->
<dependencies>
	<dependency>
		<groupId>e-iceblue</groupId>
		<artifactId>spire.pdf</artifactId>
		<version>8.7.0</version>
	</dependency>
</dependencies>
<!-- Extra Maven repository hosted by e-iceblue; presumably where the spire.pdf artifact above is resolved from (confirm it is not on Maven Central). -->
<repositories>
	<repository>
		<id>com.e-iceblue</id>
		<url>https://repo.e-iceblue.cn/repository/maven-public</url>
	</repository>
</repositories>

Controller代码

import com.spire.pdf.PdfDocument;
import com.spire.pdf.PdfPageBase;
import com.spire.pdf.utilities.PdfTable;
import com.spire.pdf.utilities.PdfTableExtractor;
import com.spire.pdf.widget.PdfPageCollection;
import io.swagger.annotations.Api;
import io.swagger.annotations.ApiOperation;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
import org.springframework.web.multipart.MultipartFile;

import java.io.IOException;
import java.io.InputStream;

/**
 * PDF management controller.
 *
 * <p>Exposes {@code POST /pdf/read}, which loads an uploaded PDF with Spire.PDF,
 * logs its title and table information, and returns the text extracted from all pages.
 */
@Slf4j
@RestController
@RequestMapping(value = "/pdf")
@Api(value = "PdfController", tags = "PDF管理")
public class PdfController {
	/**
	 * Regular expression matching the evaluation watermark text that is present in the
	 * extracted text when no commercial license is applied. It is passed to
	 * {@link String#replaceAll(String, String)}, so {@code \\r\\n} matches a CR LF pair.
	 */
	private static final String EVALUATION_WARNING_REGEX =
			"\\r\\n  Evaluation Warning : The document was created with Spire.PDF for Java.\\r\\n";

	/**
	 * Reads an uploaded PDF and returns its text content.
	 *
	 * <p>For every page the page text is appended to the result; tables found on the page
	 * are only logged (their size and one sample cell), they are not part of the result.
	 *
	 * @param file the uploaded PDF, bound from the multipart request parameter {@code file}
	 * @return the concatenated text of all pages with the evaluation watermark text removed
	 * @throws IOException if the uploaded file's stream cannot be opened or closed
	 */
	@ApiOperation(value = "读取PDf")
	@PostMapping(value = "/read")
	String readPdf(@RequestParam("file") MultipartFile file) throws IOException {
		// try-with-resources: the upload stream is closed even when parsing fails.
		try (InputStream inputStream = file.getInputStream()) {
			PdfDocument pdfDocument = new PdfDocument();
			try {
				pdfDocument.loadFromStream(inputStream);

				log.info("title:======={}", pdfDocument.getDocumentInformation().getTitle());
				PdfPageCollection pages = pdfDocument.getPages();
				StringBuilder textBuilder = new StringBuilder();
				PdfTableExtractor pdfTableExtractor = new PdfTableExtractor(pdfDocument);
				int pageCount = pages.getCount();
				for (int i = 0; i < pageCount; i++) {
					log.info("i:======={}", i);
					PdfPageBase pdfPage = pages.get(i);
					// Original author's note: the `false` argument strips whitespace around
					// the text. NOTE(review): confirm against the Spire.PDF API.
					textBuilder.append(pdfPage.extractText(false));

					// A page without tables may yield null rather than an empty array,
					// so guard before iterating.
					PdfTable[] pdfTables = pdfTableExtractor.extractTable(i);
					if (pdfTables == null) {
						continue;
					}
					for (PdfTable pdfTable : pdfTables) {
						int columnCount = pdfTable.getColumnCount();
						int rowCount = pdfTable.getRowCount();
						log.info("columnCount:======={}, rowCount======={}", columnCount, rowCount);
						// The sample cell (1, 0) only exists when the table has at least
						// two rows and one column; skip it for smaller tables.
						if (rowCount > 1 && columnCount > 0) {
							String table = pdfTable.getText(1, 0);
							log.info("table:======={}", table);
						}
					}
				}
				// Strip the watermark (without a commercial license the extracted text
				// contains a warning string).
				String text = textBuilder.toString().replaceAll(EVALUATION_WARNING_REGEX, "");
				log.info("text:======={}", text);
				return text;
			} finally {
				// Always release the document, including when loading or extraction throws.
				pdfDocument.close();
			}
		}
	}
}

读取PDF文件中的信息 package com.zht; import java.io.File; import java.io.UnsupportedEncodingException; import java.sql.Connection; import java.sql.DriverManager; import java.sql.PreparedStatement; import java.sql.SQLException; import java.util.ArrayList; import java.util.List; import com.spire.pdf.PdfDocument; import com.spire.pdf.PdfPageBase; public class ReadPDF { public static void main(String[] args) { //需要复制的目标文件或目标文件夹 String pathname = "F:\\读取PDF中的信息"; // File file = new File(pathname); List list = new ArrayList(); readFile(pathname,list); for(int j=0;j<list.size();j++) { // System.out.println("当前第"+(j+1)+"个----"+list.get(j)); //创建PdfDocument实例 PdfDocument doc = new PdfDocument(); //加载PDF文件 doc.loadFromFile(list.get(j)); StringBuilder sb = new StringBuilder(); PdfPageBase page; //遍历PDF页面,获取文本 for (int i = 0; i < doc.getPages().getCount(); i++) { page = doc.getPages().get(i); sb.append(page.extractText(true)); } // System.out.println(sb.toString()); String str = getStr(sb.toString()); System.out.println(str); String[] arr = str.split(";"); String gh = ""; String gw = ""; for(int i=0;i<arr.length;i++) { arr[i] = arr[i].trim(); if(i==0) { gh = arr[i]; }else if(i==1) { gw = arr[i]; }else { arr[i] = arr[i].replace(gh, "").replace(gw, ""); } } // System.out.println(); insertSQL(arr); // FileWriter writer; // try { ////将文本写入文本文件 // writer = new FileWriter("f://ExtractText.txt"); // writer.write(sb.toString()); // writer.flush(); // } catch (IOException e) { // e.printStackTrace(); // } doc.close(); } } public static String getStr2(String str) { try { byte[] bs = str.getBytes("utf-8"); for(int i=0;i<bs.length;i++) { byte b = bs[i]; if(b==0) { bs[i]=9; } } str =
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值