1.引入MAVEN依赖

<dependency>
<groupId>com.aspose.cells</groupId>
<artifactId>aspose-cells</artifactId>
<version>cell-8.5.2</version>
<scope>system</scope>
<systemPath>${project.basedir}/src/main/resources/lib/aspose-cells-17.7.jar</systemPath>
</dependency>
<!-- https://mvnrepository.com/artifact/com.itextpdf/itextpdf -->
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itextpdf</artifactId>
<version>5.5.12</version>
</dependency>
2.Excel转PDF
/**
* excel 转换 Pdf
*
* @param fileFullPath 旧文件全路径 /home/2022-07/保密协议.xlsx
*/
public static String excelPdf(String fileFullPath) {
if (!getLicense()) { // 验证License 若不验证则转化出的pdf文档会有水印产生
return null;
}
FileInputStream excelstream = null;
Workbook wb = null;
FileOutputStream fileOS = null;
PdfSaveOptions pdfSaveOptions = null;
try {
String path = fileFullPath.substring(0, fileFullPath.lastIndexOf("/") + 1);
String fileName = fileFullPath.substring(fileFullPath.lastIndexOf("/") + 1, fileFullPath.lastIndexOf("."));
String outFIleFullPath = path + fileName + ".pdf";
File pdfFile = new File(outFIleFullPath); // 输出路径
excelstream = new FileInputStream(fileFullPath);
wb = new Workbook(excelstream);// excel路径,这里是先把数据放进缓存表里,然后把缓存表转化成PDF
fileOS = new FileOutputStream(pdfFile);
pdfSaveOptions = new PdfSaveOptions();
pdfSaveOptions.setOnePagePerSheet(true);//参数true把内容放在一张PDF页面上;
wb.save(fileOS, pdfSaveOptions);
fileOS.close();
return outFIleFullPath;
} catch (Exception e) {
e.printStackTrace();
log.info("excel转换异常");
return null;
} finally {
try {
excelstream.close();
fileOS.close();
} catch (IOException e) {
throw new RuntimeException(e);
}
}
}
public static boolean getLicense() {
boolean result = false;
InputStream is = null;
try {
is =
FileUtil.class
.getClassLoader()
.getResourceAsStream(
"license.xml"); //
// license.xml这个文件你放在静态文件资源目录下就行了
License aposeLic = new License();
aposeLic.setLicense(is);
result = true;
} catch (Exception e) {
e.printStackTrace();
}
return result;
}
3.word转PDF
public static String wordTurnPdf(String doc, String pdf) {
try {
//word转PDF pdf文件名
File outputFile = new File(pdf);
InputStream doc1 = new FileInputStream(doc);
OutputStream outputStream = new FileOutputStream(outputFile);
IConverter converter = LocalConverter.builder().build();
converter.convert(doc1).as(DocumentType.DOCX).to(outputStream).as(DocumentType.PDF).execute();
outputStream.close();
converter.shutDown();
} catch (Exception e) {
e.printStackTrace();
}
return pdf;
}
4.PDF关键词定位识别
package com.datago.robot.common.utils;
import com.itextpdf.awt.geom.Rectangle2D.Float;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.*;
import lombok.extern.slf4j.Slf4j;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
/**
* 获取pdf 劳模所在页数
* HBO
* 2022-10-27
*/
@Slf4j
public class ReadPDF {
public static List<Integer> readPDF(String pdfFilePath, String keyword) {
//1.给定文件
// File pdfFile = new File("C:\\Users\\Administrator\\Desktop\\劳模先进基本信息模板-有数据.pdf");
File pdfFile = new File(pdfFilePath);
//2.定义一个byte数组,长度为文件的长度
byte[] pdfData = new byte[(int) pdfFile.length()];
//3.IO流读取文件内容到byte数组
FileInputStream inputStream = null;
try {
inputStream = new FileInputStream(pdfFile);
inputStream.read(pdfData);
} catch (Exception e) {
e.printStackTrace();
} finally {
if (inputStream != null) {
try {
inputStream.close();
} catch (IOException e) {
}
}
}
//4.指定关键字
// String keyword = "分部";
//5.调用方法,给定关键字和文件
List<float[]> positions = null;
try {
positions = findKeywordPostions(pdfData, keyword);
} catch (IOException e) {
throw new RuntimeException(e);
}
//6.返回值类型是 List<float[]> 每个list元素代表一个匹配的位置,分别为 float[0]所在页码 float[1]所在x轴 float[2]所在y轴
// System.out.println("total:" + positions.size());
List< Integer> list=new ArrayList();
if (positions != null && positions.size() > 0) {
for (float[] position : positions) {
// System.out.print("pageNum: " + (int) position[0]);
// System.out.print("\tx: " + position[1]);
// System.out.println("\ty: " + position[2]);
// map.put("pageNum", (int) position[0]);
list.add((int) position[0]);
}
}
return list;
}
/**
* findKeywordPostions
*
* @param pdfData 通过IO流 PDF文件转化的byte数组
* @param keyword 关键字
* @return List<float [ ]> : float[0]:pageNum float[1]:x float[2]:y
* @throws IOException
*/
public static List<float[]> findKeywordPostions(byte[] pdfData, String keyword) throws IOException {
List<float[]> result = new ArrayList<>();
List<PdfPageContentPositions> pdfPageContentPositions = getPdfContentPostionsList(pdfData);
for (PdfPageContentPositions pdfPageContentPosition : pdfPageContentPositions) {
List<float[]> charPositions = findPositions(keyword, pdfPageContentPosition);
if (charPositions == null || charPositions.size() < 1) {
continue;
}
result.addAll(charPositions);
}
return result;
}
private static List<PdfPageContentPositions> getPdfContentPostionsList(byte[] pdfData) throws IOException {
PdfReader reader = new PdfReader(pdfData);
List<PdfPageContentPositions> result = new ArrayList<>();
int pages = reader.getNumberOfPages();
for (int pageNum = 1; pageNum <= pages; pageNum++) {
float width = reader.getPageSize(pageNum).getWidth();
float height = reader.getPageSize(pageNum).getHeight();
PdfRenderListener pdfRenderListener = new PdfRenderListener(pageNum, width, height);
//解析pdf,定位位置
PdfContentStreamProcessor processor = new PdfContentStreamProcessor(pdfRenderListener);
PdfDictionary pageDic = reader.getPageN(pageNum);
PdfDictionary resourcesDic = pageDic.getAsDict(PdfName.RESOURCES);
try {
processor.processContent(ContentByteUtils.getContentBytesForPage(reader, pageNum), resourcesDic);
} catch (IOException e) {
reader.close();
throw e;
}
String content = pdfRenderListener.getContent();
List<CharPosition> charPositions = pdfRenderListener.getcharPositions();
List<float[]> positionsList = new ArrayList<>();
for (CharPosition charPosition : charPositions) {
float[] positions = new float[]{charPosition.getPageNum(), charPosition.getX(), charPosition.getY()};
positionsList.add(positions);
}
PdfPageContentPositions pdfPageContentPositions = new PdfPageContentPositions();
pdfPageContentPositions.setContent(content);
pdfPageContentPositions.setPostions(positionsList);
result.add(pdfPageContentPositions);
}
reader.close();
return result;
}
private static List<float[]> findPositions(String keyword, PdfPageContentPositions pdfPageContentPositions) {
List<float[]> result = new ArrayList<>();
String content = pdfPageContentPositions.getContent();
List<float[]> charPositions = pdfPageContentPositions.getPositions();
for (int pos = 0; pos < content.length(); ) {
int positionIndex = content.indexOf(keyword, pos);
if (positionIndex == -1) {
break;
}
float[] postions = charPositions.get(positionIndex);
result.add(postions);
pos = positionIndex + 1;
}
return result;
}
private static class PdfPageContentPositions {
private String content;
private List<float[]> positions;
public String getContent() {
return content;
}
public void setContent(String content) {
this.content = content;
}
public List<float[]> getPositions() {
return positions;
}
public void setPostions(List<float[]> positions) {
this.positions = positions;
}
}
private static class PdfRenderListener implements RenderListener {
private int pageNum;
private float pageWidth;
private float pageHeight;
private StringBuilder contentBuilder = new StringBuilder();
private List<CharPosition> charPositions = new ArrayList<>();
public PdfRenderListener(int pageNum, float pageWidth, float pageHeight) {
this.pageNum = pageNum;
this.pageWidth = pageWidth;
this.pageHeight = pageHeight;
}
public void beginTextBlock() {
}
public void renderText(TextRenderInfo renderInfo) {
List<TextRenderInfo> characterRenderInfos = renderInfo.getCharacterRenderInfos();
for (TextRenderInfo textRenderInfo : characterRenderInfos) {
String word = textRenderInfo.getText();
if (word.length() > 1) {
word = word.substring(word.length() - 1, word.length());
}
Float rectangle = textRenderInfo.getAscentLine().getBoundingRectange();
float x = (float) rectangle.getX();
float y = (float) rectangle.getY();
//这两个是关键字在所在页面的XY轴的百分比
float xPercent = Math.round(x / pageWidth * 10000) / 10000f;
float yPercent = Math.round((1 - y / pageHeight) * 10000) / 10000f;
CharPosition charPosition = new CharPosition(pageNum, (float) x, (float) y);
charPositions.add(charPosition);
contentBuilder.append(word);
}
}
public void endTextBlock() {
}
public void renderImage(ImageRenderInfo renderInfo) {
}
public String getContent() {
return contentBuilder.toString();
}
public List<CharPosition> getcharPositions() {
return charPositions;
}
}
private static class CharPosition {
private int pageNum = 0;
private float x = 0;
private float y = 0;
public CharPosition(int pageNum, float x, float y) {
this.pageNum = pageNum;
this.x = x;
this.y = y;
}
public int getPageNum() {
return pageNum;
}
public float getX() {
return x;
}
public float getY() {
return y;
}
@Override
public String toString() {
return "[pageNum=" + this.pageNum + ",x=" + this.x + ",y=" + this.y + "]";
}
}
}
5.aspose各种依赖包
aspose依赖