由于原来使用的是 iText 5,有些 PDF 的格式类型不受支持,导致解析失败,
所以想升级到 iText 7。目前我的测试版本是 7.0.3,发现有些实体类都没有了,心累。比对了新老版本中不同的实体类和实现逻辑,终于找到一点点相关的信息。
非常感谢 使用iText 7读取PDF文件中的文本和图片 - 佛西亚 - 博客园 的回答,让我有点领悟到了真谛。
博文里面提到的是获取整个 page 的 String 文本,但是我这里的需求是按行读取,所以尝试在原来的基础上做了小改动。具体的逻辑如下(省略了一些不关键的代码)。
官网地址如下: iText 7 Community | iText PDF
main(){
imageTool.readEnPdfData("D:/33.pdf");
}
/**
 * Reads a PDF, extracts its text line by line and locates labelled regions on the page.
 *
 * <p>Visible steps: the file is opened through {@code URLUtil} and loaded into a
 * {@code PDDocument}; a position-sorted {@code PDFTextStripper} extracts the text of pages
 * 1..N, which is split on {@code "\n"} into {@code contentList}; the text rectangles come
 * from {@code PdfCoordinateKit.readXY}; the first rectangle stored under key {@code "a"}
 * becomes {@code ingredientsXY}.
 *
 * <p>NOTE(review): this is an abridged excerpt. No catch/finally block or return statement
 * is visible, and {@code document} is never closed in the visible code.
 *
 * @param filePath location of the PDF, resolved through {@code URLUtil.url}
 * @return not visible in this excerpt
 * @throws Exception declared by the signature
 */
public Map readEnPdfData(String filePath) throws Exception {
PDDocument document = null;
List<String> contentList = CollUtil.newArrayList();
String errorMsg = "系统错误";
try {
boolean sort = true;// whether to sort the extracted text by position
int startPage = 1;// first page to extract
int endPage = Integer.MAX_VALUE;// last page to extract (overwritten with the real page count below)
URL url = URLUtil.url(filePath);
InputStream inputStream = URLUtil.getStream(url);
document =PDDocument.load(inputStream);
PDFTextStripper pts = new PDFTextStripper();
pts.setSortByPosition(sort);
endPage = document.getNumberOfPages();
// System.out.println("Total Page: " + endPage);
pts.setStartPage(startPage);
pts.setEndPage(endPage);
String content = pts.getText(document);
// One list entry per extracted line.
String[] contents = StrUtil.split(content,"\n");
contentList = Convert.toList(String.class, contents);
PDFTextStripperByArea stripper = new PDFTextStripperByArea();
stripper.setSortByPosition(true);
List<Map<String, Rectangle2D.Float>> xyAllList = new PdfCoordinateKit().readXY(filePath);
// NOTE(review): only ingredientsXY is assigned in the visible code; the other rectangles
// are presumably filled in by the omitted lines -- otherwise the dereferences below fail.
Rectangle2D.Float ingredientsXY = null, inciXY = null,byWeightXY = null,functionXY = null,specificationsXY = null,heightWidthXY = null ;
for (Map<String, Rectangle2D.Float> m : xyAllList) {
// Keep only the first rectangle found under key "a".
if (ObjectUtil.isNotNull(m.get("a")) && ObjectUtil.isNull(ingredientsXY)) {
ingredientsXY = m.get("a");
// several lines that parse and store the remaining data are omitted here
}
}
// NOTE(review): the closing brace of this if sits inside the trailing line comment, so the
// if block is actually closed by the lone brace two lines below -- looks like collapsed lines.
if (specificationsXY.y >= ingredientsXY.y) { specificationsXY.setRect(0, 0, 0, 0); // reset to 0 when Specifications wraps onto another line }
// NOTE(review): the dataList statement on the next line is commented out together with the
// println that precedes it -- presumably it was meant to be live code on its own line.
// System.out.println("============获得数据结果集================="); List<String> dataList = this.getTextByRectangle(stripper,firstPage,ingredientsXY,heightWidthXY, Math.round(inciXY.x - ingredientsXY.x), Math.round(ingredientsXY.y - specificationsXY.y - ingredientsXY.height - ingredientsXY.height));
}
Class:
PdfCoordinateKit
/**
 * Collects the size of the first page and the bounding rectangle of every text chunk in a PDF.
 *
 * <p>The first element of the result maps the key {@code "heightWidth$"} to the rectangle
 * {@code (0, 0, pageWidth, pageHeight)} of page 1. Every following element is a
 * single-entry map (chunk text to rectangle) produced by {@link TextRenderListener},
 * page by page in the order the chunks were reported.
 *
 * @param filePath path of the PDF file to read
 * @return the collected entries, never {@code null}; if reading or parsing fails, whatever
 *         was gathered before the failure (possibly an empty list)
 */
public List<Map<String, Rectangle2D.Float>> readXY(String filePath) {
    List<Map<String, Rectangle2D.Float>> list_text_all = new ArrayList<Map<String, Rectangle2D.Float>>();
    // try-with-resources: the document (and the reader it owns) is closed even when parsing fails.
    try (PdfDocument redDocument = new PdfDocument(new PdfReader(filePath))) {
        // Page dimensions are taken from the first page only.
        Rectangle ps = redDocument.getPage(1).getPageSize();
        Rectangle2D.Float f = new Rectangle2D.Float();
        f.setFrame(0, 0, ps.getWidth(), ps.getHeight());
        Map<String, Rectangle2D.Float> mapWH = new HashMap<String, Rectangle2D.Float>();
        mapWH.put("heightWidth$", f);
        list_text_all.add(mapWH);

        PdfDocumentContentParser parser = new PdfDocumentContentParser(redDocument);
        for (int i = 1; i <= redDocument.getNumberOfPages(); i++) {
            // A fresh listener per page so each page's chunks are collected separately.
            TextRenderListener listener = new TextRenderListener();
            // Feed the page's content stream to the listener; without this call the
            // listener never receives an event and its lists stay empty.
            parser.processContent(i, listener);
            list_text_all.addAll(listener.rows_text_rect);
        }
    } catch (Exception e) {
        // Best effort: keep returning what was collected, but do not hide the failure.
        java.util.logging.Logger.getLogger(getClass().getName())
                .log(java.util.logging.Level.WARNING, "Failed to read text coordinates from " + filePath, e);
    }
    return list_text_all;
}
// The parsing classes differ between iText 5 and iText 7; with iText 7 you have to implement your own IEventListener.
public class TextRenderListener implements IEventListener {
// Bounding rectangle of each text row; one entry per distinct y value seen so far
List<Rectangle2D.Float> rectText = new ArrayList<Rectangle2D.Float>();
// Concatenated text of each row, index-aligned with rectText
List<String> textList = new ArrayList<String>();
// y coordinate of each row, index-aligned with rectText; used to find the row a chunk belongs to
List<Float> listY = new ArrayList<Float>();
// One single-entry map (chunk text -> chunk rectangle) per non-empty text chunk received
List<Map<String,Rectangle2D.Float>> rows_text_rect = new ArrayList<Map<String,Rectangle2D.Float>>();
/**
 * Records the bounding box of every non-empty text chunk delivered by the parser.
 *
 * <p>Each chunk is added to {@code rows_text_rect} as a single-entry map (text to box).
 * Chunks are also grouped into rows: a chunk whose box has the same y value as an
 * already known row is merged into that row (text appended, widths summed, smaller x
 * kept); otherwise it starts a new row.
 *
 * @param data event payload; anything other than a {@code TextRenderInfo} is ignored
 * @param type event type (not inspected)
 */
@Override
public void eventOccurred(IEventData data, EventType type)
{
    if (!(data instanceof TextRenderInfo)) {
        return;
    }
    TextRenderInfo info = (TextRenderInfo) data;
    try {
        String chunk = info.getText();
        if (chunk.length() == 0) {
            return;
        }

        // Rectangles around the baseline and around the ascent line of the chunk.
        Rectangle baseBox = info.getBaseline().getBoundingRectangle();
        Rectangle ascentBox = info.getAscentLine().getBoundingRectangle();

        // Span from the baseline box origin to the far corner of the ascent box, padded by 1 vertically.
        float x0 = baseBox.getX();
        float y0 = baseBox.getY() - 1;
        float x1 = ascentBox.getX() + ascentBox.getWidth();
        float y1 = ascentBox.getY() + ascentBox.getHeight() + 1;
        Rectangle2D.Float box = new Rectangle2D.Float(x0, y0, x1 - x0, y1 - y0);

        int row = listY.indexOf(box.y);
        if (row < 0) {
            // First chunk seen at this y value: start a new row.
            rectText.add(box);
            textList.add(chunk);
            listY.add(box.y);
        } else {
            // Same y as an existing row: keep the smaller x, add the widths, append the text.
            Rectangle2D.Float known = rectText.get(row);
            float minX = box.x;
            if (known.x < box.x) {
                minX = known.x;
            }
            rectText.set(row, new Rectangle2D.Float(minX, box.y, box.width + known.width, box.height));
            textList.set(row, textList.get(row) + chunk);
        }

        Map<String, Rectangle2D.Float> entry = new HashMap<String, Rectangle2D.Float>();
        entry.put(chunk, box);
        rows_text_rect.add(entry);
    } catch (Exception e) {
        // Any failure while handling a chunk is silently ignored.
    }
}
/**
 * Reports which event types this listener wants to receive.
 *
 * @return always {@code null}. NOTE(review): the iText 7 {@code IEventListener} contract
 *         appears to treat {@code null} as "all event types" -- confirm against the
 *         library version in use.
 */
@Override
public Set<EventType> getSupportedEvents(){
return null;
}