二话不说,先上图
问:如何解析pdf中红色部分的内容?
- 尝试tika,失败。tika只能解析到pdf中的文字,无法定位红色部分。
- 尝试pdfbox,依然失败,没有找到能够定位的api。
- 同事给出的思路:能不能先转成html,然后再解析?
bingo
这一思路可行
引入maven依赖
<!-- pdf转html所需的jar:pdf2dom、pdfbox、pdfbox-tools -->
<dependency>
<groupId>net.sf.cssbox</groupId>
<artifactId>pdf2dom</artifactId>
<version>1.7</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.12</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox-tools</artifactId>
<version>2.0.12</version>
</dependency>
import com.laibo.reduce.reduce_center.common.exception.GeneralException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.fit.pdfdom.PDFDomTree;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.stereotype.Component;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;
/**
 * Converts PDF documents to HTML files using pdf2dom ({@link PDFDomTree}) on top of PDFBox.
 *
 * <p>All conversion methods are static. Invalid arguments and conversion failures are both
 * reported as {@link GeneralException}; nothing is silently swallowed.
 */
@Component
public class PdfToHtmlUtil {

    /** File-name suffix a source path must end with (case-sensitive). */
    public static final String PDF_SUFFIX = ".pdf";

    /** File-name suffix given to the generated HTML file. */
    public static final String HTML_SUFFIX = ".html";

    /**
     * Manual entry point for trying the conversion.
     *
     * @param args optional; {@code args[0]} is the PDF path. Without arguments the original
     *             hard-coded sample path is used.
     */
    public static void main(String[] args) {
        String path = args.length > 0 ? args[0] : "C:\\Users\\laibo\\Desktop\\temp\\test.pdf";
        pdfToHtml(path);
    }

    /**
     * Converts the PDF at {@code path} to an HTML file next to it, with the trailing
     * {@code .pdf} replaced by {@code .html}.
     *
     * @param path path of the PDF file; must be non-empty and end with {@link #PDF_SUFFIX}
     * @throws GeneralException if {@code path} is empty, is not a {@code .pdf} path, the file
     *                          cannot be read, or the conversion fails
     */
    public static void pdfToHtml(String path) {
        if (path == null || path.isEmpty()) {
            throw new GeneralException("path为空!");
        }
        if (!path.endsWith(PDF_SUFFIX)) {
            throw new GeneralException("文件类型不是pdf,请检查文件类型!");
        }
        // Strip only the trailing suffix. Searching for the first ".pdf" would truncate
        // paths such as "report.pdf.bak.pdf" or "my.pdf.files/x.pdf" at the wrong place.
        String pathWithoutSuffix = path.substring(0, path.length() - PDF_SUFFIX.length());
        String outputPath = pathWithoutSuffix + HTML_SUFFIX;
        pdfBytesToHtml(getBytes(path), outputPath);
    }

    /**
     * Renders an in-memory PDF to an HTML file encoded as UTF-8.
     *
     * @param bytes      raw content of the PDF document
     * @param outputPath path of the HTML file to create (an existing file is overwritten)
     * @throws GeneralException if an argument is missing or the PDF cannot be parsed/written
     */
    public static void pdfBytesToHtml(byte[] bytes, String outputPath) {
        if (bytes == null) {
            throw new GeneralException("pdf内容为空!");
        }
        if (outputPath == null || outputPath.isEmpty()) {
            throw new GeneralException("outputPath为空!");
        }
        // The document is opened before the writer so that an unparsable PDF does not leave
        // an empty/truncated HTML file behind. Both resources are closed automatically, in
        // reverse order: the writer is flushed and closed first, then the document.
        try (PDDocument document = PDDocument.load(bytes);
             Writer out = new BufferedWriter(new OutputStreamWriter(
                     new FileOutputStream(outputPath), StandardCharsets.UTF_8))) {
            new PDFDomTree().writeText(document, out);
        } catch (Exception e) {
            // Broad catch kept from the original: any pdf2dom/PDFBox failure is reported
            // uniformly instead of being printed and ignored.
            throw failure("pdf转html失败: " + outputPath, e);
        }
    }

    /**
     * Reads the whole file into memory.
     *
     * @param filePath path of the file to read
     * @return the file content; never {@code null}
     * @throws GeneralException if the file does not exist or cannot be read
     */
    private static byte[] getBytes(String filePath) {
        try {
            return Files.readAllBytes(Paths.get(filePath));
        } catch (IOException e) {
            throw failure("读取pdf文件失败: " + filePath, e);
        }
    }

    /**
     * Builds a {@link GeneralException} that keeps the underlying cause.
     *
     * <p>Only the {@code String} constructor of {@code GeneralException} is visible from this
     * file, so the cause is attached with {@code initCause}.
     * NOTE(review): if a {@code (String, Throwable)} constructor exists, prefer it.
     */
    private static GeneralException failure(String message, Throwable cause) {
        GeneralException failure = new GeneralException(message);
        failure.initCause(cause);
        return failure;
    }
}
生成html完成
再使用jsoup解析即可