PDF文件内容读取
一、PDF表格读取
可以读取表格中各单元格的文本内容,以及单元格的坐标信息(如 top、height)
1、maven依赖
<!-- OpenPDF:用于读取 PDF 表单域(AcroFields)中的签名信息 -->
<dependency>
<groupId>com.github.librepdf</groupId>
<artifactId>openpdf</artifactId>
<version>1.3.26</version>
</dependency>
<dependency>
<groupId>technology.tabula</groupId>
<artifactId>tabula</artifactId>
<version>1.0.3</version>
<exclusions>
<exclusion>
<artifactId>slf4j-simple</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
</exclusions>
</dependency>
2、代码
import com.lowagie.text.Rectangle;
import com.lowagie.text.pdf.AcroFields;
import com.lowagie.text.pdf.PdfReader;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.ParseException;
import technology.tabula.CommandLineApp;
/**
 * Downloads the file at the given network URL and returns the path of the local copy.
 *
 * <p>The whole response is first read into memory and then written to
 * {@code savePath/<random UUID>.pdf}. Any failure while connecting, reading or writing
 * is logged and reported to the caller as a {@code null} return value.
 *
 * @param path network URL of the file to download
 * @return absolute path of the downloaded file, or {@code null} if the download failed
 * @throws IOException kept in the signature for backward compatibility with callers
 */
private String getLocalPath(String path) throws IOException {
    HttpURLConnection conn = null;
    File file = null;
    try {
        URL url = new URL(path);
        conn = (HttpURLConnection) url.openConnection();
        // Connect timeout: 5 seconds.
        conn.setConnectTimeout(5 * 1000);
        // Read timeout, so a server that stalls mid-transfer cannot block this thread forever.
        conn.setReadTimeout(30 * 1000);
        // Some servers answer 403 to clients without a browser-like User-Agent.
        conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");

        // Read the full body before creating the target file, so a failed download
        // never leaves an empty file behind.
        byte[] getData;
        try (InputStream inputStream = conn.getInputStream()) {
            getData = readInputStream(inputStream);
        }

        file = new File(savePath + File.separator + UUID.randomUUID() + ".pdf");
        try (FileOutputStream fos = new FileOutputStream(file)) {
            fos.write(getData);
        }
        return file.getAbsolutePath();
    } catch (Exception e) {
        log.error("CaseFileSignatureVerifyTask-文件下载失败", e);
        // Remove a partially written file; fall back to deletion at JVM exit if that fails.
        if (file != null && file.exists() && !file.delete()) {
            file.deleteOnExit();
        }
    } finally {
        if (conn != null) {
            conn.disconnect();
        }
    }
    return null;
}
/**
 * Reads the given stream until end-of-stream and returns all bytes that were read.
 *
 * <p>The stream is consumed in 1024-byte chunks and is not closed by this method.
 *
 * @param inputStream stream to drain
 * @return every byte read from the stream, in order
 * @throws IOException if reading from the stream fails
 */
private byte[] readInputStream(InputStream inputStream) throws IOException {
    ByteArrayOutputStream collected = new ByteArrayOutputStream();
    byte[] chunk = new byte[1024];
    for (int count = inputStream.read(chunk); count != -1; count = inputStream.read(chunk)) {
        collected.write(chunk, 0, count);
    }
    collected.close();
    return collected.toByteArray();
}
// Tabula opens the document with new File(...), so the PDF must be a local file.
/**
 * Extracts the tables of a PDF with Tabula and returns the vertical extent of the
 * recipient-signature cell.
 *
 * <p>The PDF is first downloaded to a temporary local file, which is deleted again
 * before this method returns. Tabula is run over all pages with JSON output, and the
 * first cell whose text contains "收件人签名" or "签名或盖章" is used.
 *
 * @param pdf network URL of the PDF
 * @return a two-element array {@code [top, top + height]} of the matching cell, or
 *         {@code null} if no table or no matching cell was found
 * @throws ParseException if the Tabula command-line arguments cannot be parsed
 * @throws IOException    if downloading or reading the PDF fails
 */
private double[] getRecipientTableCoordinate(String pdf) throws ParseException, IOException {
    String localPath = this.getLocalPath(pdf);
    if (StringUtil.isBlank(localPath)) {
        throw BizException.serverError("文件路径获取失败");
    }
    try {
        String[] args = new String[]{"-f=JSON", "-p=all", localPath};
        CommandLineParser parser = new DefaultParser();
        CommandLine cmd = parser.parse(CommandLineApp.buildOptions(), args);
        StringBuilder tabulaOutput = new StringBuilder();
        new CommandLineApp(tabulaOutput, cmd).extractTables(cmd);

        // Tabula emits a JSON array; the outer brackets are stripped to obtain one object.
        // NOTE(review): this presumably only yields valid JSON when exactly one table
        // was found - confirm how multi-table documents should be handled.
        if (tabulaOutput.length() < 2) {
            return null;
        }
        String tableJson = tabulaOutput.substring(1, tabulaOutput.length() - 1);
        if (StringUtils.isBlank(tableJson)) {
            // No table was detected in the document.
            return null;
        }
        Map<String, Object> table = JsonUtil.jsonToMapObject(tableJson);
        if (table == null) {
            return null;
        }
        Object rows = table.get("data");
        if (!(rows instanceof List)) {
            return null;
        }
        for (Object row : (List<?>) rows) {
            if (!(row instanceof List)) {
                continue;
            }
            for (Object cellObject : (List<?>) row) {
                if (!(cellObject instanceof Map)) {
                    continue;
                }
                Map<?, ?> cell = (Map<?, ?>) cellObject;
                Object text = cell.get("text");
                if (!(text instanceof String) || StringUtils.isBlank((String) text)) {
                    continue;
                }
                String cellText = (String) text;
                if (cellText.contains("收件人签名") || cellText.contains("签名或盖章")) {
                    Object top = cell.get("top");
                    Object height = cell.get("height");
                    // JSON parsers may deliver Integer/Long/BigDecimal instead of Double,
                    // so go through Number rather than casting straight to double.
                    if (!(top instanceof Number) || !(height instanceof Number)) {
                        continue;
                    }
                    double[] coordinate = new double[2];
                    coordinate[0] = ((Number) top).doubleValue();
                    coordinate[1] = coordinate[0] + ((Number) height).doubleValue();
                    return coordinate;
                }
            }
        }
    } finally {
        // Always remove the temporary download, whether extraction succeeded or not.
        File file = new File(localPath);
        if (file.exists() && !file.delete()) {
            file.deleteOnExit();
        }
    }
    return null;
}
/**
 * Reads the signed signature fields of a PDF with OpenPDF and converts each field's
 * vertical position to a top-down coordinate.
 *
 * <p>NOTE(review): as written, this method never returns {@code true}; the converted
 * coordinates are computed but not compared against anything. The decision logic
 * appears to be missing - confirm the intended check before relying on the result.
 *
 * @param pdf path or URL of the PDF, passed to the {@code PdfReader} constructor
 * @return currently always {@code false}
 * @throws Exception if the PDF cannot be opened or read
 */
private boolean hasRecipientSigned(String pdf) throws Exception {
    if (StringUtils.isBlank(pdf)) {
        return false;
    }
    try (PdfReader reader = new PdfReader(pdf)) {
        AcroFields fields = reader.getAcroFields();
        List<String> signatures = fields.getSignedFieldNames();
        System.out.println("签名数目: " + signatures.size());
        for (String signature : signatures) {
            float[] fieldPositions = fields.getFieldPositions(signature);
            // A field without a placed widget has no position entry; skip it instead of
            // failing with a NullPointerException / ArrayIndexOutOfBoundsException.
            if (fieldPositions == null || fieldPositions.length < 5) {
                continue;
            }
            // Index 0 is used as the page number, 2 as the bottom edge and 4 as the top edge.
            Rectangle pageSize = reader.getPageSize((int) fieldPositions[0]);
            float height = pageSize.getHeight();
            float bottomY = fieldPositions[2];
            float topY = fieldPositions[4];
            // Flip the y axis: measure from the top of the page instead of the bottom.
            bottomY = height - bottomY;
            topY = height - topY;
        }
    }
    return false;
}
二、PDF内容读取
读取普通 PDF 的文本内容(基于 iText 5 的 PdfTextExtractor)
1、maven依赖
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itextpdf</artifactId>
<version>5.5.11</version>
</dependency>
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itext-asian</artifactId>
<version>5.2.0</version>
</dependency>
2、代码
/**
 * Extracts the text of every page of a PDF and prints it to standard output.
 *
 * <p>The page texts are concatenated in page order, all line feeds are removed, and
 * the result is printed with the prefix {@code pageContent:}.
 *
 * @param pdf file path of the PDF, passed to the {@code PdfReader} constructor
 * @throws IOException if the PDF cannot be opened or a page cannot be read
 */
private static void extract(String pdf) throws IOException {
    // A PdfReader can be created from a file path (as here) or from an input stream.
    PdfReader reader = new PdfReader(pdf);
    try {
        int pageNum = reader.getNumberOfPages();
        StringBuilder pageContent = new StringBuilder();
        // Page numbers are 1-based.
        for (int i = 1; i <= pageNum; i++) {
            pageContent.append(PdfTextExtractor.getTextFromPage(reader, i));
        }
        // Full text of the document with line feeds stripped.
        System.out.println("pageContent:" + pageContent.toString().replace("\n", ""));
    } finally {
        // Release the underlying file handle even when extraction fails.
        reader.close();
    }
}