Java读取searchable pdf 内容生成 TXT文件

1. 实体类

package com.itstyle.seckill.common.classLoad;

/**
 * Mutable value holder for one extracted text fragment.
 *
 * <p>It carries the fragment's text ({@code title}), four integer coordinates
 * ({@code x1}, {@code x2}, {@code y1}, {@code y2}), two float attributes
 * ({@code angle}, {@code fontSize}), the two direction-adjusted position values
 * ({@code XDirAdj}, {@code YDirAdj}) and the page the fragment belongs to.
 * The class performs no validation; every value is stored and returned as given.
 */
public class ResultEntity {

    /** Text of the fragment. */
    private String title;

    /** Integer coordinates, stored exactly in constructor order: x1, x2, y1, y2. */
    private Integer x1;
    private Integer x2;
    private Integer y1;
    private Integer y2;

    /** Angle and font size of the fragment. */
    private float angle;
    private float fontSize;

    /** Direction-adjusted position values. */
    private float xDirAdj;
    private float yDirAdj;

    /** Page the fragment belongs to. */
    private Integer page;

    /**
     * Creates a fully populated entity.
     *
     * @param title    text of the fragment
     * @param x1       first x coordinate
     * @param x2       second x coordinate
     * @param y1       first y coordinate
     * @param y2       second y coordinate
     * @param angle    angle of the fragment
     * @param fontSize font size of the fragment
     * @param XDirAdj  direction-adjusted x value
     * @param YDirAdj  direction-adjusted y value
     * @param page     page the fragment belongs to
     */
    public ResultEntity(String title, Integer x1, Integer x2, Integer y1, Integer y2, float angle, float fontSize, float XDirAdj, float YDirAdj, Integer page) {
        this.title = title;
        this.page = page;

        this.x1 = x1;
        this.x2 = x2;
        this.y1 = y1;
        this.y2 = y2;

        this.angle = angle;
        this.fontSize = fontSize;

        this.xDirAdj = XDirAdj;
        this.yDirAdj = YDirAdj;
    }

    /** @return the fragment text */
    public String getTitle() { return this.title; }

    /** @param title the fragment text */
    public void setTitle(String title) { this.title = title; }

    /** @return the first x coordinate */
    public Integer getX1() { return this.x1; }

    /** @param x1 the first x coordinate */
    public void setX1(Integer x1) { this.x1 = x1; }

    /** @return the second x coordinate */
    public Integer getX2() { return this.x2; }

    /** @param x2 the second x coordinate */
    public void setX2(Integer x2) { this.x2 = x2; }

    /** @return the first y coordinate */
    public Integer getY1() { return this.y1; }

    /** @param y1 the first y coordinate */
    public void setY1(Integer y1) { this.y1 = y1; }

    /** @return the second y coordinate */
    public Integer getY2() { return this.y2; }

    /** @param y2 the second y coordinate */
    public void setY2(Integer y2) { this.y2 = y2; }

    /** @return the angle */
    public float getAngle() { return this.angle; }

    /** @param angle the angle */
    public void setAngle(float angle) { this.angle = angle; }

    /** @return the font size */
    public float getFontSize() { return this.fontSize; }

    /** @param fontSize the font size */
    public void setFontSize(float fontSize) { this.fontSize = fontSize; }

    /** @return the direction-adjusted x value */
    public float getXDirAdj() { return this.xDirAdj; }

    /** @param XDirAdj the direction-adjusted x value */
    public void setXDirAdj(float XDirAdj) { this.xDirAdj = XDirAdj; }

    /** @return the direction-adjusted y value */
    public float getYDirAdj() { return this.yDirAdj; }

    /** @param YDirAdj the direction-adjusted y value */
    public void setYDirAdj(float YDirAdj) { this.yDirAdj = YDirAdj; }

    /** @return the page the fragment belongs to */
    public Integer getPage() { return this.page; }

    /** @param page the page the fragment belongs to */
    public void setPage(Integer page) { this.page = page; }
}

2. 获取 PDF 内容

  

package com.itstyle.seckill.common.classLoad;

import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

/**
 * Text stripper that, instead of plain text, emits one bracketed record per word.
 *
 * <p>Each record has the form
 * {@code [text;x1;x2;y1;y2;dir;fontSize;XDirAdj;YDirAdj;page;font;]}. The glyphs handed to
 * {@link #writeString(String, List)} are merged into words; a new word starts when the
 * position of the next glyph no longer fits the current one (see {@link #startsNewWord}).
 */
public class CustomPDFTextStripper extends PDFTextStripper {

    /** Largest difference in {@code getY()} between two glyphs that still belong to one word. */
    private static final int BUFFER_SIZE_PIXEL = 5;

    /**
     * Record layout: text and four coordinates (5), direction, font size, XDirAdj, YDirAdj (4),
     * page and font (2). The number of placeholders must equal the 11 values passed to
     * {@code String.format}, otherwise formatting throws {@code MissingFormatArgumentException}.
     */
    private static final String WORD_FORMAT =
            "[" + "%s;%s;%s;%s;%s;" + "%s;%s;%s;%s;" + "%s;%s;" + "]";

    public CustomPDFTextStripper() throws IOException {
    }

    /**
     * Formats one word as a record.
     *
     * @param positionList glyph positions that make up the word
     * @param str          text of the word (must not contain ';', the field separator)
     * @return the formatted record, or {@code null} when {@code positionList} is empty
     */
    private String getWordString(List<TextPosition> positionList, String str) {
        if (positionList.isEmpty()) {
            return null;
        }
        int page = getCurrentPageNo();
        TextPosition first = positionList.get(0);
        TextPosition last = positionList.get(positionList.size() - 1);
        float dir = first.getDir();

        float x1;
        float x2;
        float y1;
        float y2;
        float fontSize;
        if (dir > 0 && dir < 91) {
            fontSize = first.getFontSizeInPt();
            x1 = first.getX() - fontSize;
            x2 = first.getX();
            y1 = first.getY();
            y2 = last.getY() - fontSize;
        } else if (dir == 180) {
            fontSize = first.getFontSize();
            x1 = last.getEndX();
            x2 = first.getX();
            y1 = first.getY();
            y2 = first.getY() - fontSize;
        } else {
            fontSize = first.getFontSizeInPt();
            x1 = first.getX();
            // The second field is an x coordinate, so it must come from getEndX();
            // the previous code put getEndY() here.
            x2 = last.getEndX();
            y1 = first.getY() - fontSize;
            y2 = first.getY();
        }

        return String.format(WORD_FORMAT,
                str, x1, x2, y1, y2,
                dir, fontSize, first.getXDirAdj(), first.getYDirAdj(),
                page, first.getFont());
    }

    /** Replaces the field separator ';' with a blank and trims the result. */
    private static String sanitize(String raw) {
        return raw.replace(';', ' ').trim();
    }

    /**
     * Decides whether {@code cur} opens a new word after {@code prev}.
     *
     * <p>For direction 0 a word ends when x moves backwards, for direction 180 when x moves
     * forwards; in both cases also when y changes by more than {@link #BUFFER_SIZE_PIXEL}.
     * Independently of direction, a horizontal gap larger than {@code xInterval} ends the word.
     * NOTE(review): the original tested {@code direction == 180} in both clauses, which split
     * every glyph of 180-degree text; the first clause is assumed to be meant for direction 0.
     */
    private static boolean startsNewWord(float direction, TextPosition prev, TextPosition cur, float xInterval) {
        boolean yJump = Math.abs(prev.getY() - cur.getY()) > BUFFER_SIZE_PIXEL;
        if (direction == 0 && (prev.getX() > cur.getX() || yJump)) {
            return true;
        }
        if (direction == 180 && (prev.getX() < cur.getX() || yJump)) {
            return true;
        }
        return Math.abs(prev.getX() - cur.getX()) > xInterval;
    }

    /**
     * Splits the given glyph run into words and writes one record per word.
     *
     * @param text          text of the run as assembled by the base class (unused)
     * @param textPositions glyph positions of the run; nothing is written when empty
     * @throws IOException if writing to the output fails
     */
    @Override
    protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
        if (textPositions.isEmpty()) {
            return;
        }

        TextPosition prev = textPositions.get(0);
        float direction = prev.getDir();
        float xInterval = Math.abs(prev.getFontSizeInPt()) * 1.2f;

        StringBuilder records = new StringBuilder();
        StringBuilder word = new StringBuilder();
        List<TextPosition> wordPositions = new ArrayList<TextPosition>();
        word.append(prev.getUnicode());
        wordPositions.add(prev);

        // Start at 1: the first glyph already seeds the current word. Starting at 0
        // (as before) added it a second time and duplicated the first character.
        for (int i = 1; i < textPositions.size(); i++) {
            TextPosition cur = textPositions.get(i);
            boolean colonThenSpace = ":".equals(prev.getUnicode()) && " ".equals(cur.getUnicode());

            if (colonThenSpace || startsNewWord(direction, prev, cur, xInterval)) {
                if (colonThenSpace) {
                    // Drop the trailing ':' from the word that is about to be emitted.
                    wordPositions.remove(wordPositions.size() - 1);
                    word.deleteCharAt(word.length() - 1);
                }
                appendRecord(records, wordPositions, word);
                wordPositions = new ArrayList<TextPosition>();
                word = new StringBuilder();
            }
            wordPositions.add(cur);
            word.append(cur.getUnicode());
            prev = cur;
        }
        appendRecord(records, wordPositions, word);
        writeString(records.toString());
    }

    /** Appends the record for the current word, skipping words that have no glyph left. */
    private void appendRecord(StringBuilder records, List<TextPosition> wordPositions, StringBuilder word) {
        String record = getWordString(wordPositions, sanitize(word.toString()));
        if (record != null) {
            // Guard needed: appending null would write the literal text "null".
            records.append(record);
        }
    }

}

3. 提取正确的信息

  

package com.itstyle.seckill.common.classLoad;

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;

import java.io.*;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Semaphore;

/**
 * Extracts positioned text from a PDF, writes one text file per page and finally lets
 * {@code PDFCheckFont} re-render the pages as images.
 *
 * <p>The page text is produced by {@code CustomPDFTextStripper} as bracketed records whose
 * fields are separated by ';'. This class reads the fields by index: 0 text, 1 and 2 the x
 * coordinates, 3 and 4 the y coordinates, 5 direction, 6 font size, 7 XDirAdj, 8 YDirAdj,
 * 9 page number.
 */
public class ExtractText {
    /** Tolerance used when comparing field 1 of a record with the start of the current group. */
    private static final int BUFFER_SIZE_PIXEL = 5;

    /** DPI that the raw coordinates are divided by to obtain the scaling ratio. */
    private static  final Float ORIGINAL_DPI= 72f;

    /** Number of fields (indices 0..9) a record must have to be usable. */
    private static final int RECORD_FIELDS = 10;

    /** Marker for "no group started yet". */
    private static final float UNSET = -9999f;

    private PDDocument document;
    private String pdfFilePath;
    private String getPdfFileName;
    private File destFile;
    private Float ratio;
    private Integer dpi;
    private Integer startPage;
    private Integer endPage;
    private Semaphore semaphore;
    private String destFileName;

    /**
     * Creates an extractor.
     *
     * @param document       stored but not read by {@link #extract(String)}, which loads the
     *                       PDF from {@code pdfFilePath} itself
     * @param pdfFilePath    path of the PDF to read
     * @param getPdfFileName base name used for the generated text files
     * @param destFile       ignored; the destination directory is built from {@code destFileName}
     * @param ratio          ignored; the ratio is computed as {@code dpi / 72}
     * @param dpi            target DPI used to scale coordinates
     * @param startPage      first page to extract (1-based, inclusive)
     * @param endPage        last page to extract (inclusive, clamped to the page count)
     * @param semaphore      stored but not used by {@link #extract(String)}
     * @param destFileName   directory that receives the text files
     */
    public ExtractText(PDDocument document, String pdfFilePath, String getPdfFileName, File destFile, Integer ratio, Integer dpi, Integer startPage, Integer endPage, Semaphore semaphore, String destFileName) {
        this.document = document;
        this.pdfFilePath = pdfFilePath;
        this.getPdfFileName = getPdfFileName;
        this.destFile = new File(destFileName);
        this.ratio = Float.valueOf(Float.valueOf(dpi) / ORIGINAL_DPI);
        this.dpi = dpi;
        this.startPage = startPage;
        this.endPage = endPage;
        this.semaphore = semaphore;
        this.destFileName = destFileName;
    }

    /**
     * Runs the extraction: writes the per-page text files, then removes the original text
     * from a fresh copy of the PDF, re-adds the extracted text and renders page images.
     *
     * @param imgfilePathPrefix path prefix for the rendered page images
     * @return {@code true} when every step succeeded, {@code false} when any step threw
     *         (the stack trace is printed)
     */
    public boolean extract(String imgfilePathPrefix) {
        try {
            List<ResultEntity> textToAddList = writePageFiles();

            // Check whether the system has the fonts needed when the searchable PDF is
            // rendered to single images. try-with-resources closes the second document,
            // which was previously leaked.
            try (PDDocument pdDocument = PDDocument.load(new File(pdfFilePath))) {
                PDFCheckFont pdfCheckFont = new PDFCheckFont();
                pdfCheckFont.removeText(pdDocument);
                pdfCheckFont.addText(pdDocument, textToAddList);
                pdfCheckFont.makeImages(pdDocument, imgfilePathPrefix);
            }
            return true;
        } catch (Exception e) {
            // The previous code called e.fillInStackTrace(), which reports nothing.
            e.printStackTrace();
            return false;
        }
        // No thread pool or semaphore wait here: no task was ever submitted, and waiting on
        // the semaphore in a finally block could spin forever (or hit null) after a failure.
    }

    /** Strips pages startPage..endPage one by one and writes a text file for each non-trivial page. */
    private List<ResultEntity> writePageFiles() throws IOException {
        List<ResultEntity> textToAddList = new ArrayList<ResultEntity>();
        try (PDDocument loaded = PDDocument.load(new File(pdfFilePath))) {
            int lastPage = Math.min(endPage, loaded.getNumberOfPages());
            PDFTextStripper stripper = new CustomPDFTextStripper();
            stripper.setSortByPosition(false);

            for (int pageNo = startPage; pageNo <= lastPage; pageNo++) {
                stripper.setStartPage(pageNo);
                stripper.setEndPage(pageNo);
                String text = stripper.getText(loaded);
                if (text != null && text.length() > 5) {
                    writePageFile(pageNo, text, textToAddList);
                }
            }
        }
        return textToAddList;
    }

    /** Parses the record text of one page, writes it to "&lt;dest&gt;/&lt;name&gt;_&lt;page&gt;.txt" and collects the entities. */
    private void writePageFile(int pageNo, String text, List<ResultEntity> textToAddList) throws IOException {
        // The files are written inside destFile, so it must exist regardless of whether
        // destFile itself has a parent.
        if (!destFile.exists()) {
            destFile.mkdirs();
        }
        File pageFile = new File(destFile + File.separator + getPdfFileName + "_" + pageNo + ".txt");

        String lineSep = System.getProperty("line.separator");
        String normalized = text.replace(lineSep, "==")
                .replace("\n", "")
                .replace("] [", "][")
                .replace("[", "");

        try (BufferedWriter bw = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(pageFile), "utf-8"))) {
            for (String line : normalized.split("==")) {
                List<ResultEntity> infoList = parseLine(line);
                infoList.sort(Comparator.comparing(ResultEntity::getY1).thenComparing(ResultEntity::getX1));
                for (int k = 0; k < infoList.size(); k++) {
                    ResultEntity entity = infoList.get(k);
                    textToAddList.add(entity);
                    bw.write("|"+"|"+
                            entity.getX1()+"|"+
                            entity.getY1()+"|"+
                            entity.getX2()+"|"+
                            entity.getY2()+"|"+
                            entity.getTitle()+"|"+
                            entity.getAngle()+"|"+
                            entity.getFontSize()+"|"+"|");
                    if (k != infoList.size()-1){
                        bw.write(",");
                    }
                }
                bw.write(lineSep);
            }
        }
    }

    /**
     * Converts one line of ']'-terminated records into entities.
     *
     * <p>Records whose direction lies in (0, 91) are collected into a group as long as field 1
     * stays within {@link #BUFFER_SIZE_PIXEL} of the group's start and field 3 stays within
     * twice the font size of the previous record's field 4; a group becomes one entity.
     * All other records become one entity each.
     */
    private List<ResultEntity> parseLine(String line) {
        List<ResultEntity> infoList = new ArrayList<ResultEntity>();
        List<String[]> group = new ArrayList<String[]>();
        StringBuilder groupText = new StringBuilder();
        float prevX = UNSET;
        float prevY = UNSET;
        float yInterval = UNSET;

        for (String record : line.split("]")) {
            if (record.length() == 0) {
                continue;
            }
            String[] fields = record.split(";");
            // Fields up to index 9 are read below, so shorter records cannot be used.
            if (fields.length < RECORD_FIELDS) {
                continue;
            }
            float dir = Float.parseFloat(fields[5]);
            boolean rotated = dir > 0 && dir < 91;

            if (rotated && prevX < 0 && groupText.length() == 0) {
                prevX = Float.parseFloat(fields[1]);
                prevY = Float.parseFloat(fields[4]);
                yInterval = Math.abs(Float.parseFloat(fields[6]) * 2);
            }

            if (rotated && continuesGroup(fields, prevX, prevY, yInterval)) {
                group.add(fields);
                groupText.append(fields[0]);
                prevY = Float.parseFloat(fields[4]);
            } else if (dir < 91 && !group.isEmpty() && !continuesGroup(fields, prevX, prevY, yInterval)) {
                addArray2List(infoList, mergeGroup(group, groupText.toString()));
                group = new ArrayList<String[]>();
                groupText = new StringBuilder();
                prevX = UNSET;
                prevY = UNSET;
                yInterval = UNSET;

                if (rotated) {
                    // The record that ended the old group starts the next one.
                    group.add(fields);
                    groupText.append(fields[0]);
                    prevX = Float.parseFloat(fields[1]);
                    prevY = Float.parseFloat(fields[4]);
                    yInterval = Math.abs(Float.parseFloat(fields[6]) * 2);
                } else {
                    addArray2List(infoList, fields);
                }
            } else {
                addArray2List(infoList, fields);
            }
        }

        // A group still open at the end of the line was previously dropped.
        if (!group.isEmpty()) {
            addArray2List(infoList, mergeGroup(group, groupText.toString()));
        }
        return infoList;
    }

    /** True when the record is close enough to the current group in field 1 and field 3. */
    private static boolean continuesGroup(String[] fields, float prevX, float prevY, float yInterval) {
        return Math.abs(prevX - Float.parseFloat(fields[1])) <= BUFFER_SIZE_PIXEL
                && Math.abs(Float.parseFloat(fields[3]) - prevY) < yInterval;
    }

    /** Builds one record for a group: fields of its first record, joined text, field 4 of its last record. */
    private static String[] mergeGroup(List<String[]> group, String text) {
        String[] merged = new String[RECORD_FIELDS];
        System.arraycopy(group.get(0), 0, merged, 0, RECORD_FIELDS);
        merged[0] = text;
        merged[4] = group.get(group.size() - 1)[4];
        return merged;
    }

    /**
     * Converts a record into a {@link ResultEntity} (coordinates scaled by the DPI ratio)
     * and appends it to the list.
     *
     * @param infoList target list; a new list is created when {@code null}
     * @param strArray record fields; modified in place (ordering swap, empty coordinates set to "0")
     * @return the list the entity was added to
     */
    private List<ResultEntity> addArray2List(List<ResultEntity> infoList,String [] strArray){
        swap(strArray);
        for (int i = 1; i <= 4; i++) {
            if (strArray[i] == null || strArray[i].length() == 0) {
                strArray[i] = "0";
            }
        }
        int x1 = (int) (Float.valueOf(strArray[1]) * ratio);
        int x2 = (int) (Float.valueOf(strArray[2]) * ratio);
        int y1 = (int) (Float.valueOf(strArray[3]) * ratio);
        int y2 = (int) (Float.valueOf(strArray[4]) * ratio);
        if (infoList == null){
            infoList = new ArrayList<ResultEntity>();
        }
        // Argument order must follow the ResultEntity constructor (title, x1, x2, y1, y2, ...).
        // Passing (x1, y1, x2, y2) stored x2 as y1 and broke both the sort and the output.
        infoList.add(new ResultEntity(
                strArray[0], x1, x2, y1, y2,
                Float.valueOf(strArray[5]),
                Float.valueOf(strArray[6]),
                Float.valueOf(strArray[7]),
                Float.valueOf(strArray[8]),
                Integer.valueOf(strArray[9])));
        return infoList;
    }

    /**
     * Orders the coordinate pairs in place so that field 1 &lt;= field 2 and field 3 &lt;= field 4.
     * A pair is left untouched when either value is {@code null} or empty.
     *
     * @param strArray record fields
     * @return the same array
     */
    private String[] swap(String[] strArray){
        swapIfDescending(strArray, 1, 2);
        swapIfDescending(strArray, 3, 4);
        return  strArray;
    }

    /** Swaps the two entries when both are non-empty and the first parses to the larger float. */
    private static void swapIfDescending(String[] strArray, int lo, int hi) {
        String first = strArray[lo];
        String second = strArray[hi];
        if (first != null && !first.equals("")
                && second != null && !second.equals("")
                && Float.valueOf(first) > Float.valueOf(second)) {
            strArray[lo] = second;
            strArray[hi] = first;
        }
    }
}

4. 检查 searchable PDF 生成单个图片时系统是否包含所需要的字体

package com.itstyle.seckill.common.classLoad;

import org.apache.pdfbox.contentstream.operator.Operator;
import org.apache.pdfbox.cos.COSArray;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.multipdf.LayerUtility;
import org.apache.pdfbox.pdfparser.PDFStreamParser;
import org.apache.pdfbox.pdfwriter.ContentStreamWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType0Font;
import org.apache.pdfbox.pdmodel.graphics.form.PDFormXObject;
import org.apache.pdfbox.rendering.PDFRenderer;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.springframework.util.StringUtils;

import javax.imageio.ImageIO;
import java.awt.geom.AffineTransform;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.List;


/**
 * Checks whether the system has the fonts needed when a searchable PDF is rendered to
 * single images: the original text can be blanked, re-added with a known font and the
 * pages rendered to JPEG files.
 */
public class PDFCheckFont {

    /**
     * Font file used for the re-added text.
     * NOTE(review): the name looks like a mistyped "NanumGothic.ttf" — confirm against the
     * target system before changing it.
     */
    private static final String FONT_PATH = "../../usr/share/fonts/NamumGothic.tif";

    /** Resolution used when rendering pages to images. */
    private static final int RENDER_DPI = 200;

    /**
     * Draws the given text entries onto the matching pages as an extra layer.
     *
     * <p>For every page a scratch page of the same media-box size is filled with the entries
     * whose page number matches and whose title is not empty, then imported into
     * {@code pdDocument} as layer {@code "aicrTextLayer<page>"}.
     *
     * @param pdDocument    document that receives the layers
     * @param textToAddList entries to draw; position is taken from XDirAdj / YDirAdj
     * @throws Exception if the font cannot be loaded or a PDFBox operation fails
     */
    public void addText(PDDocument pdDocument,List<ResultEntity> textToAddList) throws Exception{
        int pages = pdDocument.getNumberOfPages();
        if (pages == 0) {
            return;
        }
        LayerUtility layerUtility = new LayerUtility(pdDocument);
        AffineTransform affineTransform = new AffineTransform();
        // One font instance serves every page; it used to be reloaded per page.
        PDType0Font font = PDType0Font.load(pdDocument, new File(FONT_PATH));

        try (PDDocument newDocument = new PDDocument()) {
            for (int i = 0; i < pages; i++) {
                PDPage page = pdDocument.getPage(i);
                float height = page.getMediaBox().getHeight();
                PDPage newPage = new PDPage(new PDRectangle(page.getMediaBox().getWidth(), height));
                newDocument.addPage(newPage);

                try (PDPageContentStream contentStream = new PDPageContentStream(newDocument, newPage)) {
                    for (ResultEntity textToAdd : textToAddList) {
                        // Only entries that actually carry text are drawn. The previous
                        // condition was inverted and selected the empty titles instead.
                        if (textToAdd.getPage() == i + 1 && !StringUtils.isEmpty(textToAdd.getTitle())) {
                            contentStream.beginText();
                            contentStream.setFont(font, textToAdd.getFontSize());
                            contentStream.newLineAtOffset(textToAdd.getXDirAdj(), height - textToAdd.getYDirAdj());
                            contentStream.showText(textToAdd.getTitle());
                            contentStream.endText();
                        }
                    }
                }

                PDFormXObject pdFormXObject = layerUtility.importPageAsForm(newDocument, newPage);
                layerUtility.appendFormAsLayer(page, pdFormXObject, affineTransform, "aicrTextLayer" + (i + 1));
            }
        }
    }


    /**
     * Renders every page to {@code <prefix>_<page>.jpg} at 200 DPI.
     *
     * @param pdDocument        document to render
     * @param imgFilePathPrefix path prefix of the image files
     * @throws Exception if rendering or writing an image fails
     */
    public void makeImages(PDDocument pdDocument,String imgFilePathPrefix) throws Exception{
        int pages = pdDocument.getNumberOfPages();
        PDFRenderer renderer = new PDFRenderer(pdDocument);
        for (int i = 0; i < pages; i++) {
            File dstFile = new File(imgFilePathPrefix + "_" + (i + 1) + ".jpg");
            BufferedImage image = renderer.renderImageWithDPI(i, RENDER_DPI);
            // ImageIO.write reports a missing writer through its return value only.
            if (!ImageIO.write(image, "jpg", dstFile)) {
                throw new IOException("No image writer available for " + dstFile);
            }
        }
    }

    /**
     * Blanks the string operands of all {@code Tj} and {@code TJ} operators on every page
     * and replaces each page's content with the rewritten token stream.
     *
     * @param pdDocument document to modify
     * @throws Exception if parsing or writing a content stream fails
     */
    public void removeText(PDDocument pdDocument) throws  Exception{
        PDPageTree pages = pdDocument.getDocumentCatalog().getPages();
        for (PDPage page: pages){
            PDFStreamParser parser = new PDFStreamParser(page);
            parser.parse();
            List<Object> tokens = parser.getTokens();

            // Start at 1: an operator at index 0 has no operand in front of it.
            for (int i = 1; i < tokens.size(); i++) {
                Object token = tokens.get(i);
                if (!(token instanceof Operator)) {
                    continue;
                }
                String opName = ((Operator) token).getName();
                Object operand = tokens.get(i - 1);
                // The operand type is checked instead of cast blindly, so a malformed
                // stream is skipped rather than aborting the whole run.
                if ("Tj".equals(opName) && operand instanceof COSString) {
                    ((COSString) operand).setValue(new byte[0]);
                } else if ("TJ".equals(opName) && operand instanceof COSArray) {
                    COSArray array = (COSArray) operand;
                    for (int k = 0; k < array.size(); k++) {
                        Object arrElement = array.getObject(k);
                        if (arrElement instanceof COSString) {
                            ((COSString) arrElement).setValue(new byte[0]);
                        }
                    }
                }
            }

            PDStream pdStream = new PDStream(pdDocument);
            try (OutputStream out = pdStream.createOutputStream()) {
                ContentStreamWriter contentStreamWriter = new ContentStreamWriter(out);
                contentStreamWriter.writeTokens(tokens);
            }
            page.setContents(pdStream);
        }
    }

    /**
     * Stripper that records every string it is given as a {@link TextAdd}, using the font
     * size, direction-adjusted position and font of the string's first glyph.
     */
    public static  class MyTextStripper extends PDFTextStripper{
        public MyTextStripper() throws IOException {
            super();
        }

        List<TextAdd> textList = new ArrayList<>();

        /** @return the entries collected so far (the live list, not a copy) */
        public List<TextAdd> getTextList(){
            return textList;
        }

        /**
         * Collects one entry for the given glyph run instead of writing it to the output.
         *
         * @param text          text as assembled by the base class (unused)
         * @param textPositions glyph positions; an empty run is ignored
         */
        @Override
        protected void writeString(String text, List<TextPosition> textPositions) throws IOException{
            if (textPositions == null || textPositions.isEmpty()) {
                return;
            }
            TextPosition first = textPositions.get(0);
            StringBuilder joined = new StringBuilder();
            for (TextPosition textPosition : textPositions){
                joined.append(textPosition.getUnicode());
            }
            textList.add(new TextAdd(
                    getCurrentPageNo(),
                    joined.toString(),
                    first.getFontSize(),
                    first.getXDirAdj(),
                    first.getYDirAdj(),
                    first.getFont()));
        }
    }

    /** Plain holder for one collected string: page, text, font size, position and font. */
    public static class TextAdd{
        int page;
        String text;
        float fontSize,xPosition,yPosition;
        PDFont font;

        public TextAdd(int page, String text, float fontSize, float xPosition, float yPosition, PDFont font) {
            this.page = page;
            this.text = text;
            this.fontSize = fontSize;
            this.xPosition = xPosition;
            this.yPosition = yPosition;
            this.font = font;
        }
    }
}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值