提取 PDF 文字坐标,并将其换算为 PDF 渲染成图片后对应的像素坐标
使用 pdfbox 2.0.15 提取 PDF 文字:
package com.exampl.pdf;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.*;
import java.util.ArrayList;
import java.util.List;
public class PdfPage extends PDFTextStripper {
private int dip = 96; //图片dpi
private int i = 0; //行号
private int count = 0; //顺序号
private int pageinde; //页码
private boolean separator;
private Page page = new Page(); //存储结果集
private List> TextPositionLine = new ArrayList<>();
private List> lastTextPositionLine = new ArrayList<>();
private List>> TextPosition = new ArrayList<>();
public Page getPage() {
writeText();
return page;
}
public void setPage(Page page) {
this.page = page;
}
public PdfPage(int pageinde, String pdf) throws Exception {
this.pageinde = pageinde;
pdfIndex(pageinde, pdf);
}
void pdfIndex(int page, String path) throws Exception {
PDDocument pdd = PDDocument.load(new File(path));
this.setSortByPosition(true);
this.stripPage(pdd, page);
OutputStream byteArrayOutputStream = new FileOutputStream(new File("D:\\img\\" + page + ".jpg"));
PDFRenderer renderer = new PDFRenderer(pdd);
BufferedImage image = renderer.renderImageWithDPI(page, this.dip);
ImageIO.write(image, "JPG", byteArrayOutputStream);
byteArrayOutputStream.close();
byteArrayOutputStream.flush();
}
//查找每行坐标文字
void stripPage(PDDocument pdd, int pageNr) throws IOException {
this.setStartPage(pageNr + 1);
this.setEndPage(pageNr + 1);
Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
writeText(pdd, dummy);
}
//计算每个文字坐标大小并保存到words和rows
@Override
protected void writeString(String string, List textPositions) {
if (separator) { //separator判断是否换行
TextPosition.add(TextPositionLine);
TextPositionLine = new ArrayList<>();
}
separator = true;
TextPositionLine.add(textPositions);
lastTextPositionLine = TextPositionLine;
}
@Override
protected void writeWordSeparator() {
separator = false; //如果换行会调用writeWordSeparator方法设置separator变量
}
void writeText() {
TextPosition.add(lastTextPositionLine);//最后一行文字添加进去
for (List> lists : TextPosition) {
List row = new ArrayList<>();
for (List line : lists) {
for (TextPosition text : line) {
PdfJson pdfJson = new PdfJson();
//乘以dip除以72转换成图片坐标位置
pdfJson.setX1(text.getX() * this.dip / 72); //左x坐标
pdfJson.setX2(text.getEndX() * this.dip / 72); //右x坐标
pdfJson.setY1(text.getY() * this.dip / 72); //下y坐标
pdfJson.setY2((text.getY() * this.dip / 72) - text.getFontSize() - 5); //上y坐标 由于拿出来文字高端矮了一点 减5提高文字高度
pdfJson.setText(text.getUnicode());
pdfJson.setRowIndex(i); //设置行号
pdfJson.setWordIndex(line.indexOf(text)); //设置每行当前文字在第几位
pdfJson.setIndex(this.count); //当前文字在所有文字中排第几位
pdfJson.setPageIndex(this.pageinde); //设置pdf页码
page.getWords().add(pdfJson);
row.add(pdfJson);
this.count++;
}
}
page.getRows().add(row);
this.i++;
}
}
}
这里重写了 writeLine 方法内部调用的 writeString 和 writeWordSeparator 两个方法,具体逻辑大家可以自行研究。
下面贴上前端展示效果;由于 PDF 内容涉密,部分内容已做处理。
点赞
收藏
分享
文章举报
都来砍我
发布了1 篇原创文章 · 获赞 0 · 访问量 30
私信
关注