pdf解析,转图的代码小记

1.之前做pdf文件解析时,做过一些pdf发票的解析。pdf发票的类型比较多,来源也很多:有正常生成的,有图片转成pdf的,也有其他类型文件转换而来的。在这里记录一下之前开发的一些代码片段。

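2.在pom.xml中引入依赖(注意:下文代码中的 com.itextpdf.text.* 属于 iText 5,对应的依赖是 com.itextpdf:itextpdf,而不是 itext7-core;另外代码还用到了 fastjson、thumbnailator、commons-lang3,需要自行添加对应依赖)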

<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>3.0.2</version>
</dependency>

<dependency>
    <groupId>com.itextpdf</groupId>
    <artifactId>itext7-core</artifactId>
    <version>7.1.1</version>
    <type>pom</type>
</dependency>

3.pdf和图片的相互操作

 


import com.alibaba.fastjson.JSONObject;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Image;
import com.itextpdf.text.PageSize;
import com.itextpdf.text.pdf.PdfWriter; 
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;

/**
 * Merges one or more image files into a single PDF document.
 *
 * <p>Two back ends are provided: {@code PdfBox} draws every image onto its own page sized to the
 * image, while {@code Pdf} adds scaled images to an A4 document through the
 * {@code com.itextpdf.text} API.
 */
public class ImageToPdf {

    public static void main(String[] args) {
        JSONObject result = imageToPdf("E:\\Desktop\\pdf\\bb.pdf",
                "E:\\Desktop\\99.jpg,E:\\Desktop\\3_2.jpg,E:\\Desktop\\6.jpg,E:\\Desktop\\测试2.jpg");
        // Print the real outcome instead of an unconditional "ok".
        System.out.println(result);
    }

    /**
     * Builds a PDF from a comma-separated list of image paths.
     *
     * @param filepath path of the PDF file to create
     * @param imgUrl   image paths separated by {@code ","}; blank entries are ignored
     * @return a JSON object with {@code code} and {@code msg}; on success it also carries
     *         {@code fileSize} (size of the generated PDF in bytes, as a string)
     */
    public static JSONObject imageToPdf(String filepath, String imgUrl) {
        JSONObject returnResult = new JSONObject();

        // Collect the non-blank image paths.
        ArrayList<String> imageUrllist = new ArrayList<String>();
        if (imgUrl != null) {
            for (String candidate : imgUrl.split(",")) {
                String path = candidate.trim();
                if (!path.isEmpty()) {
                    imageUrllist.add(path);
                }
            }
        }

        // PdfBox returns null when the PDF could not be produced, so never dereference it blindly.
        File file = null;
        if (filepath != null && !imageUrllist.isEmpty()) {
            file = PdfBox(imageUrllist, filepath);
        }

        if (file == null) {
            returnResult.put("code", ResultCode.ERROR.getCode());
            returnResult.put("msg", "PDF合成失败");
            return returnResult;
        }

        returnResult.put("code", ResultCode.SUCCESS.getCode());
        returnResult.put("msg", "PDF合成成功");
        returnResult.put("fileSize", String.valueOf(file.length()));
        return returnResult;
    }

    /**
     * Merges images into a PDF using PDFBox; each image gets its own page whose size equals the
     * image's pixel dimensions.
     *
     * @param imageUrllist       paths of the images, in page order
     * @param mOutputPdfFileName path of the PDF file to write
     * @return the written PDF file, or {@code null} if any image could not be read or the
     *         document could not be saved
     */
    public static File PdfBox(ArrayList<String> imageUrllist, String mOutputPdfFileName) {
        boolean saved = false;
        // try-with-resources releases the document and every content stream even on failure.
        try (PDDocument document = new PDDocument()) {
            for (String imagePath : imageUrllist) {
                BufferedImage bitmap = ImageIO.read(new File(imagePath));
                if (bitmap == null) {
                    // ImageIO.read yields null when no registered reader understands the file.
                    throw new IOException("Unsupported or unreadable image: " + imagePath);
                }
                int width = bitmap.getWidth();
                int height = bitmap.getHeight();
                PDPage page = new PDPage(new PDRectangle(width, height));
                document.addPage(page);

                PDImageXObject image = LosslessFactory.createFromImage(document, bitmap);
                try (PDPageContentStream contentStream = new PDPageContentStream(
                        document, page, PDPageContentStream.AppendMode.APPEND, false)) {
                    contentStream.drawImage(image, 0, 0, width, height);
                }
            }
            document.save(mOutputPdfFileName);
            saved = true;
        } catch (Exception e) {
            e.printStackTrace();
        }

        // Only report a file that THIS call produced; a stale file from an earlier run must not
        // be mistaken for success.
        if (!saved) {
            return null;
        }
        File mOutputPdfFile = new File(mOutputPdfFileName);
        return mOutputPdfFile.exists() ? mOutputPdfFile : null;
    }

    /**
     * Merges images into an A4 PDF using the {@code com.itextpdf.text} API. Every image is
     * scaled with {@link #getPercent(float, float)} (plus 3 percentage points) and centred.
     *
     * @param imageUrllist       paths of the images, in order
     * @param mOutputPdfFileName path of the PDF file to write
     * @return the written PDF file, or {@code null} if writing failed
     */
    public static File Pdf(ArrayList<String> imageUrllist, String mOutputPdfFileName) {
        Document doc = new Document(PageSize.A4, 0, 0, 0, 0);
        boolean written = false;
        // The stream is closed by try-with-resources even if adding an image fails.
        try (FileOutputStream out = new FileOutputStream(mOutputPdfFileName)) {
            PdfWriter.getInstance(doc, out);
            doc.open();
            for (int i = 0; i < imageUrllist.size(); i++) {
                // NOTE(review): a page break is requested only before the first image, exactly as
                // in the original code. If one image per page is intended, this condition
                // probably needs to change - confirm the desired layout.
                if (i == 0) {
                    doc.newPage();
                }
                Image picture = Image.getInstance(imageUrllist.get(i));
                float height = picture.getHeight();
                float width = picture.getWidth();
                int percent = getPercent(height, width);
                picture.setAlignment(Image.MIDDLE);
                // Scale relative to the original image size.
                picture.scalePercent(percent + 3);
                doc.add(picture);
            }
            doc.close();
            written = true;
        } catch (DocumentException | IOException e) {
            e.printStackTrace();
        }

        // Do not hand back a partially written or stale file as if it were a valid result.
        if (!written) {
            return null;
        }
        File mOutputPdfFile = new File(mOutputPdfFileName);
        return mOutputPdfFile.exists() ? mOutputPdfFile : null;
    }

    /**
     * Computes a scale percentage: {@code 600 / h * 100} for images taller than wide, otherwise
     * {@code 530 / w * 100}, rounded to the nearest integer.
     *
     * @param h image height
     * @param w image width
     * @return the rounded scale percentage
     */
    public static int getPercent(float h, float w) {
        float ratio = (h > w) ? 600 / h * 100 : 530 / w * 100;
        return Math.round(ratio);
    }

    /**
     * Computes a scale percentage from the width only: {@code 530 / w * 100}, rounded.
     *
     * @param h image height (not used by the calculation)
     * @param w image width
     * @return the rounded scale percentage
     */
    public static int getPercent2(float h, float w) {
        return Math.round(530 / w * 100);
    }
}
 

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject; 
import net.coobird.thumbnailator.Thumbnails;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

/**
 * Renders PDF pages to image files (plus thumbnails) and reports where they were stored.
 *
 * <p>Every method opens the PDF with try-with-resources so the document is closed when
 * rendering finishes or fails.
 */
public class PdfToImage {

    /** Resolution used by the page-range methods. */
    private static final int DEFAULT_DPI = 144;

    /** Resolution used by {@code PdfToImageByPageNum}. */
    private static final int SINGLE_PAGE_DPI = 200;

    public static void main(String[] args) throws IOException {
        String a="1.AAAA.pdf";
        String b=a.substring(0, a.lastIndexOf("."));
        System.out.println(b);
    }

    // Disabled variant kept for reference: extracts every embedded image of the PDF
    // (rather than rendering whole pages). It relies on types that are not imported here.
  /*  public static JSONArray pdfToImage(String fileFloder, String file_id, String filename, int indexOfStart, String type) throws IOException {
        filename =filename.substring(0, filename.lastIndexOf("."));
        JSONArray returnObj = new JSONArray();
        PdfDocument pdf = new PdfDocument();
        pdf.loadFromFile(fileFloder+file_id+"_0");
        // fetch the content page by page
        PdfPageCollection pages = pdf.getPages();
        // current page object
        PdfPageBase page;
        int j=1;
        if (pages.getCount() > 0) {
            for (int i = 0; i < pages.getCount(); i++) {
                page = pages.get(i);
                BufferedImage[] bufferedImages = page.extractImages(true);
                if (ObjectUtils.isNotEmpty(bufferedImages)) {
                    for (BufferedImage bufferedImage : bufferedImages) {
                        String imageName=filename+"_"+j+"."+type;
                        String newFileId = TecrunUtils.createFileId();
                        String imagePath=fileFloder+newFileId+"_0";
                        String thumbImagePath=fileFloder+newFileId+"_1.jpg";
                        ImageIO.write(bufferedImage, type, new File(imagePath));
                        ImageIO.write(bufferedImage, type, new File(thumbImagePath));
                        JSONObject imageObj =new JSONObject();
                        imageObj.put("fileName",imageName);
                        imageObj.put("filePath",imagePath);
                        imageObj.put("fileId",newFileId);
                        returnObj.add(imageObj);
                        j++;
                    }
                }
            }
        }
        return returnObj;
    }*/

    /**
     * Renders the pages of {@code fileFloder + file_id + "_0"} to images, starting at a page index.
     *
     * @param fileFloder   directory prefix used both for the source PDF and the output files
     * @param file_id      id of the source PDF
     * @param filename     original file name; its extension is stripped to build image names
     * @param indexOfStart zero-based index of the first page to render
     * @param type         ImageIO format name used for the page image and the reported name
     * @return one entry per rendered page with {@code imageName}, {@code imagePath},
     *         {@code fileId} and an empty {@code fileSize}; pages rendered before an I/O
     *         failure are still returned
     */
    public static JSONArray pdfToImageByPage(String fileFloder, String file_id, String filename, int indexOfStart, String type) {
        String baseName = stripExtension(filename);
        JSONArray returnObj = new JSONArray();
        File file = new File(fileFloder+file_id+"_0");
        try (PDDocument doc = Loader.loadPDF(file)) {
            PDFRenderer renderer = new PDFRenderer(doc);
            int pageCount = doc.getNumberOfPages();
            int sequence = 1; // image names are numbered from 1 regardless of the start page
            for (int i = indexOfStart; i < pageCount; i++) {
                BufferedImage image = renderer.renderImageWithDPI(i, DEFAULT_DPI);
                String imageName=baseName+"_"+sequence+"."+type;
                String newFileId =TecrunUtils.createFileId();
                String imagePath=fileFloder+newFileId+"_0";
                String thumbImagePath=fileFloder+newFileId+"_1.jpg";
                // ImageIO.write returns false (and writes nothing) when no writer fits the format.
                if (!ImageIO.write(image, type, new File(imagePath))) {
                    throw new IOException("No ImageIO writer available for format: " + type);
                }
                ThumbUtil.zoom(imagePath, thumbImagePath, "png");
                JSONObject imageObj =new JSONObject();
                imageObj.put("imageName",imageName);
                imageObj.put("imagePath",imagePath);
                imageObj.put("fileId",newFileId);
                imageObj.put("fileSize","");
                returnObj.add(imageObj);
                sequence++;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return returnObj;
    }

    /**
     * Renders every page of the PDF to a JPEG image plus a thumbnail.
     *
     * @param rootPath storage root passed to {@code TecrunUtils.getFilePathFromBatchIdAndFileId}
     * @param file_id  id of the source PDF
     * @param batchId  batch the source PDF and the generated files belong to
     * @param filename original file name; its extension is stripped to build image names
     * @param type     suffix reported in {@code file_name} / {@code file_suffix}
     * @return one entry per rendered page; pages rendered before a failure are still returned
     */
    public static JSONArray PdfToImageByPage(String rootPath, String file_id, String batchId, String filename, String type) {
        return renderFrom(rootPath, file_id, batchId, filename, type, 0);
    }

    /**
     * Renders only the first page of the PDF to a JPEG image plus a thumbnail.
     *
     * @param rootPath storage root passed to {@code TecrunUtils.getFilePathFromBatchIdAndFileId}
     * @param file_id  id of the source PDF
     * @param batchId  batch the source PDF and the generated files belong to
     * @param filename original file name; its extension is stripped to build the image name
     * @param type     suffix reported in {@code file_name} / {@code file_suffix}
     * @return the description of the rendered page, or an empty object if rendering failed
     */
    public static JSONObject PdfToImageByPageOne(String rootPath, String file_id, String batchId, String filename, String type) {
        return renderSingle(rootPath, file_id, batchId, filename, type, 0, DEFAULT_DPI);
    }

    /**
     * Renders the pages of the PDF from a given zero-based index to the end.
     *
     * @param rootPath storage root passed to {@code TecrunUtils.getFilePathFromBatchIdAndFileId}
     * @param file_id  id of the source PDF
     * @param batchId  batch the source PDF and the generated files belong to
     * @param filename original file name; its extension is stripped to build image names
     * @param type     suffix reported in {@code file_name} / {@code file_suffix}
     * @param Index    zero-based index of the first page to render
     * @return one entry per rendered page; pages rendered before a failure are still returned
     */
    public static JSONArray pdfToImageByPageByIndex(String rootPath, String file_id, String batchId, String filename, String type,int Index) {
        return renderFrom(rootPath, file_id, batchId, filename, type, Index);
    }

    /**
     * Renders one specific page of the PDF (at 200 DPI) to a JPEG image plus a thumbnail.
     *
     * @param rootPath storage root passed to {@code TecrunUtils.getFilePathFromBatchIdAndFileId}
     * @param file_id  id of the source PDF
     * @param batchId  batch the source PDF and the generated files belong to
     * @param filename original file name; its extension is stripped to build the image name
     * @param type     suffix reported in {@code file_name} / {@code file_suffix}
     * @param PageNum  zero-based index of the page to render
     * @return the description of the rendered page, or an empty object if rendering failed
     */
    public static JSONObject PdfToImageByPageNum(String rootPath, String file_id, String batchId, String filename, String type,int PageNum) {
        return renderSingle(rootPath, file_id, batchId, filename, type, PageNum, SINGLE_PAGE_DPI);
    }

    /** Renders pages {@code firstPage..last} and collects their descriptions. */
    private static JSONArray renderFrom(String rootPath, String fileId, String batchId,
                                        String filename, String type, int firstPage) {
        String baseName = stripExtension(filename);
        JSONArray returnObj = new JSONArray();
        String filepath = TecrunUtils.getFilePathFromBatchIdAndFileId(rootPath, batchId, fileId, 0);
        try (PDDocument doc = Loader.loadPDF(new File(filepath))) {
            PDFRenderer renderer = new PDFRenderer(doc);
            int pageCount = doc.getNumberOfPages();
            for (int i = firstPage; i < pageCount; i++) {
                returnObj.add(renderPage(renderer, i, DEFAULT_DPI, rootPath, batchId, baseName, type));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return returnObj;
    }

    /** Renders exactly one page; returns an empty object when anything goes wrong. */
    private static JSONObject renderSingle(String rootPath, String fileId, String batchId,
                                           String filename, String type, int pageIndex, int dpi) {
        String baseName = stripExtension(filename);
        JSONObject ofdimage = new JSONObject();
        String filepath = TecrunUtils.getFilePathFromBatchIdAndFileId(rootPath, batchId, fileId, 0);
        try (PDDocument doc = Loader.loadPDF(new File(filepath))) {
            ofdimage = renderPage(new PDFRenderer(doc), pageIndex, dpi, rootPath, batchId, baseName, type);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return ofdimage;
    }

    /**
     * Renders one page, stores it as JPEG, creates a 200x200-bounded thumbnail and describes
     * the result.
     *
     * <p>NOTE(review): the image is always encoded as JPEG while {@code file_name} and
     * {@code file_suffix} report the caller-supplied {@code type} - confirm this is intended.
     */
    private static JSONObject renderPage(PDFRenderer renderer, int pageIndex, int dpi, String rootPath,
                                         String batchId, String baseName, String type) throws IOException {
        BufferedImage image = renderer.renderImageWithDPI(pageIndex, dpi);
        String newFileId = TecrunUtils.createFileId();
        String imageName = baseName+"_"+pageIndex+"."+type;
        String imagePath = TecrunUtils.getFilePathFromBatchIdAndFileId(rootPath, batchId, newFileId, 0);
        String thumbImagePath = TecrunUtils.getFilePathFromBatchIdAndFileId(rootPath, batchId, newFileId, 1);
        Path dist = Paths.get(imagePath);
        // ImageIO.write returns false (and writes nothing) when no suitable writer exists.
        if (!ImageIO.write(image, "JPEG", dist.toFile())) {
            throw new IOException("Could not encode page " + pageIndex + " as JPEG");
        }
        // Thumbnail of the stored page image.
        Thumbnails.of(imagePath).size(200, 200)
                .outputFormat("jpg").toFile(thumbImagePath);

        JSONObject ofdimage = new JSONObject();
        ofdimage.put("file_id",newFileId);
        ofdimage.put("imagePath",imagePath);
        ofdimage.put("thumbImagePath",thumbImagePath);
        ofdimage.put("file_name",imageName);
        ofdimage.put("file_size", new File(imagePath).length());
        ofdimage.put("file_suffix",type);
        return ofdimage;
    }

    /** Drops the last extension of a file name; names without a dot are returned unchanged. */
    private static String stripExtension(String filename) {
        int dot = filename.lastIndexOf(".");
        return dot < 0 ? filename : filename.substring(0, dot);
    }

}

4.pdf文件解析

 

import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Text stripper that records the coordinates at which given keywords occur on the first page of
 * a PDF document.
 *
 * <p>Matching is done per text chunk handed to {@link #writeString(String, List)}: a keyword
 * matches where consecutive glyphs start with the keyword's consecutive characters. The recorded
 * position is that of the first matching glyph.
 */
public class PDFKeyWordPosition extends PDFTextStripper {

    private List<String> keywordList;
    private Map<String, List<Position>> positionListMap;

    public PDFKeyWordPosition() throws IOException {
        super();
    }

    /**
     * Scans page 1 of the document and collects the positions of every keyword.
     *
     * @param keywordList keywords to look for
     * @param document    the PDF document to scan
     * @return keyword to list of positions; a keyword gets an entry once at least one text
     *         chunk has been processed
     * @throws IOException if the text extraction fails
     */
    public Map<String, List<Position>> getCoordinate(List<String> keywordList, PDDocument document) throws IOException {
        super.setSortByPosition(true);
        this.keywordList = keywordList;
        this.positionListMap = new HashMap<>();
        super.setStartPage(1);
        super.setEndPage(1);
        // The extracted text itself is not needed; only the writeString callbacks matter.
        Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
        super.writeText(document, dummy);
        return positionListMap;
    }

    /**
     * Searches the glyphs of one text chunk for every keyword and stores the hits.
     *
     * @param string        the text of the chunk (unused; matching works on the glyphs)
     * @param textPositions glyphs of the chunk, in order
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
        if (keywordList == null || positionListMap == null) {
            // No search has been set up through getCoordinate; nothing to record.
            return;
        }
        for (String keyword : keywordList) {
            if (keyword == null || keyword.isEmpty()) {
                continue; // an empty keyword cannot be located
            }
            List<Position> positionList = positionListMap.computeIfAbsent(keyword, k -> new ArrayList<>());
            // Every candidate start is checked from keyword offset 0, so a failed partial match
            // cannot shift the comparison of later glyphs.
            int lastStart = textPositions.size() - keyword.length();
            for (int start = 0; start <= lastStart; start++) {
                if (matchesAt(textPositions, start, keyword)) {
                    TextPosition first = textPositions.get(start);
                    positionList.add(new Position(first.getX(), first.getY()));
                }
            }
        }
    }

    /** Tells whether the glyphs beginning at {@code start} spell out {@code keyword}. */
    private static boolean matchesAt(List<TextPosition> glyphs, int start, String keyword) {
        for (int offset = 0; offset < keyword.length(); offset++) {
            String unicode = glyphs.get(start + offset).getUnicode();
            if (unicode == null || unicode.isEmpty() || unicode.charAt(0) != keyword.charAt(offset)) {
                return false;
            }
        }
        return true;
    }

}

/** Mutable pair of x/y coordinates. */
class Position {

    float x;
    float y;

    /** Creates a position at the origin (both coordinates 0). */
    public Position() {
    }

    /**
     * Creates a position with the given coordinates.
     *
     * @param x the x coordinate
     * @param y the y coordinate
     */
    public Position(float x, float y) {
        this.x = x;
        this.y = y;
    }

    /** @return the x coordinate */
    public float getX() {
        return this.x;
    }

    /** @param x the x coordinate to set */
    public void setX(float x) {
        this.x = x;
    }

    /** @return the y coordinate */
    public float getY() {
        return this.y;
    }

    /** @param y the y coordinate to set */
    public void setY(float y) {
        this.y = y;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Position [x=");
        text.append(x).append(", y=").append(y).append("]");
        return text.toString();
    }
}
 
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Text stripper that records the coordinates at which given keywords occur on the first page of
 * a PDF document.
 *
 * <p>Matching is done per text chunk handed to {@link #writeString(String, List)}: a keyword
 * matches where consecutive glyphs start with the keyword's consecutive characters. The recorded
 * position is that of the first matching glyph.
 */
public class PDFKeyWordPosition extends PDFTextStripper {

    private List<String> keywordList;
    private Map<String, List<Position>> positionListMap;

    public PDFKeyWordPosition() throws IOException {
        super();
    }

    /**
     * Scans page 1 of the document and collects the positions of every keyword.
     *
     * @param keywordList keywords to look for
     * @param document    the PDF document to scan
     * @return keyword to list of positions; a keyword gets an entry once at least one text
     *         chunk has been processed
     * @throws IOException if the text extraction fails
     */
    public Map<String, List<Position>> getCoordinate(List<String> keywordList, PDDocument document) throws IOException {
        super.setSortByPosition(true);
        this.keywordList = keywordList;
        this.positionListMap = new HashMap<>();
        super.setStartPage(1);
        super.setEndPage(1);
        // The extracted text itself is not needed; only the writeString callbacks matter.
        Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
        super.writeText(document, dummy);
        return positionListMap;
    }

    /**
     * Searches the glyphs of one text chunk for every keyword and stores the hits.
     *
     * @param string        the text of the chunk (unused; matching works on the glyphs)
     * @param textPositions glyphs of the chunk, in order
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
        if (keywordList == null || positionListMap == null) {
            // No search has been set up through getCoordinate; nothing to record.
            return;
        }
        for (String keyword : keywordList) {
            if (keyword == null || keyword.isEmpty()) {
                continue; // an empty keyword cannot be located
            }
            List<Position> positionList = positionListMap.computeIfAbsent(keyword, k -> new ArrayList<>());
            // Every candidate start is checked from keyword offset 0, so a failed partial match
            // cannot shift the comparison of later glyphs.
            int lastStart = textPositions.size() - keyword.length();
            for (int start = 0; start <= lastStart; start++) {
                if (matchesAt(textPositions, start, keyword)) {
                    TextPosition first = textPositions.get(start);
                    positionList.add(new Position(first.getX(), first.getY()));
                }
            }
        }
    }

    /** Tells whether the glyphs beginning at {@code start} spell out {@code keyword}. */
    private static boolean matchesAt(List<TextPosition> glyphs, int start, String keyword) {
        for (int offset = 0; offset < keyword.length(); offset++) {
            String unicode = glyphs.get(start + offset).getUnicode();
            if (unicode == null || unicode.isEmpty() || unicode.charAt(0) != keyword.charAt(offset)) {
                return false;
            }
        }
        return true;
    }

}

/** Mutable pair of x/y coordinates. */
class Position {

    float x;
    float y;

    /** Creates a position at the origin (both coordinates 0). */
    public Position() {
    }

    /**
     * Creates a position with the given coordinates.
     *
     * @param x the x coordinate
     * @param y the y coordinate
     */
    public Position(float x, float y) {
        this.x = x;
        this.y = y;
    }

    /** @return the x coordinate */
    public float getX() {
        return this.x;
    }

    /** @param x the x coordinate to set */
    public void setX(float x) {
        this.x = x;
    }

    /** @return the y coordinate */
    public float getY() {
        return this.y;
    }

    /** @param y the y coordinate to set */
    public void setY(float y) {
        this.y = y;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("Position [x=");
        text.append(x).append(", y=").append(y).append("]");
        return text.toString();
    }
}

import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripperByArea;

import java.awt.*;
import java.io.File;
import java.io.IOException;
import java.math.BigDecimal;
import java.util.*;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * 专用于处理电子发票识别的类
 *
 * @author arthurlee
 *
 */
public class PdfInvoiceExtractor {
    /**
     * Parses every page of a (possibly multi-page) PDF into an invoice.
     *
     * <p>The document is opened here only to read the page count and is closed again before
     * the pages are parsed, because {@code extracts} loads the file itself for each page.
     *
     * @param file the PDF file to parse
     * @return one invoice per page, in page order
     * @throws IOException if the file cannot be loaded or a page cannot be parsed
     */
    public static List<Invoice> extractList(File file) throws IOException{
        int pageCount;
        // try-with-resources closes the document that was previously left open.
        try (PDDocument doc = Loader.loadPDF(file)) {
            pageCount = doc.getNumberOfPages();
        }
        List<Invoice> invoices = new ArrayList<>(pageCount);
        // extracts expects 1-based page numbers.
        for (int pageNum = 1; pageNum <= pageCount; pageNum++) {
            invoices.add(extracts(pageNum, file));
        }
        return invoices;
    }

    /**
     * 解析pdf单页
     * @param pageNum 页码(从1开始)
     * @param file 文件地址
     * @return
     * @throws IOException
     */
    public static Invoice extracts(int pageNum,File file) throws IOException{
        Invoice invoice = new Invoice();
        PDDocument doc = Loader.loadPDF(file);
        PDPage firstPage = doc.getPage(pageNum-1);
        int pageWidth = Math.round(firstPage.getCropBox().getWidth());
        PDFTextStripper textStripper = new PDFTextStripper();
        textStripper.setStartPage(pageNum);
        textStripper.setEndPage(pageNum);
        textStripper.setSortByPosition(true);
        String fullText = textStripper.getText(doc);
        if (firstPage.getRotation() != 0) {
            pageWidth = Math.round(firstPage.getCropBox().getHeight());
        }
        String allText = replace(fullText).replaceAll("(", "(").replaceAll(")", ")").replaceAll("¥", "¥");

        Pattern type = Pattern.compile("(?<p>\\S*)电.发票");
        Matcher matcher1 = type.matcher(allText);
        Pattern typedm = Pattern.compile("(?<p>\\S*)发票代码");
        Matcher matcherdm = typedm.matcher(allText);
        if (matcher1.find() && !matcherdm.find()) {电子发票
            invoice.setTitle(matcher1.group());
            {
                Pattern type00Pattern = Pattern.compile("(?<p>\\S*)通发票");
                Matcher m00 = type00Pattern.matcher(allText);
                if (m00.find()) {
                    invoice.setTitle(m00.group("p")+ "通发票");
                    if (null == invoice.getType()) {
                        invoice.setType("普通发票");
                        invoice.setFormat("302");
                    }
                } else {
                    Pattern type01Pattern = Pattern.compile("(?<p>\\S*)用发票");
                    Matcher m01 = type01Pattern.matcher(allText);
                    if (m01.find()) {
                        invoice.setTitle(m01.group("p")+ "用发票");
                        if (null == invoice.getType()) {
                            invoice.setType("专用发票");
                            invoice.setFormat("301");
                        }
                    }
                }
            }
            {
                String reg = "发票号码:(?<number>\\d{20})|:(?<date>\\d{4}年\\d{2}月\\d{2}日)";
                Pattern pattern = Pattern.compile(reg);
                Matcher matcher = pattern.matcher(allText);
                while (matcher.find()) {
                    if (matcher.group("number") != null) {
                        invoice.setNumber(matcher.group("number"));
                    } else if (matcher.group("date") != null) {
                        invoice.setDate(matcher.group("date"));
                    }
                }
            }
            {
//                String reg = "合计¥?(?<amount>[^ \\f\\n\\r\\t\\v\\*]\\S*)(¥?(?<taxAmount>\\S*)|\\*+)\\s";  ///合计¥245.28¥14.72
                String reg = "合计¥?(?<amount>[^ \\f\\n\\r\\t\\v\\*]\\S*)¥";
                String reg1 = "合计¥[\\s\\S]*?¥(?<taxAmount>\\S*)";
                Pattern pattern = Pattern.compile(reg);
                Matcher matcher = pattern.matcher(allText);
                Pattern pattern1 = Pattern.compile(reg1);
                Matcher matcher2 = pattern1.matcher(allText);
                if (matcher.find()) {
                    try {
                        invoice.setAmount(matcher.group("amount"));
                    } catch (Exception e) {
                    }
                }
                if (matcher2.find()) {
                    try {
                        invoice.setTaxAmount(matcher2.group("taxAmount"));
                    } catch (Exception e) {
                        invoice.setTaxAmount("0");
                    }
                }
            }
            if (null == invoice.getAmount()) {
                String reg = "合\\u0020*计\\u0020*¥?(?<amount>[^ ]*)\\u0020+¥?(?:(?<taxAmount>\\S*)|\\*+)\\s";
                Pattern pattern = Pattern.compile(reg);
                Matcher matcher = pattern.matcher(fullText);
                if (matcher.find()) {
                    try {
                        invoice.setAmount(matcher.group("amount"));
                    } catch (Exception e) {
                        invoice.setAmount("0");
                    }
                    try {
                        invoice.setTaxAmount(matcher.group("taxAmount"));
                    } catch (Exception e) {
                        invoice.setTaxAmount("0");
                    }
                }
            }
            {
                String reg = "价税合计\\u0028大写\\u0029(?<amountString>\\S*)\\u0028小写\\u0029¥?(?<amount>\\S*)\\s";
                Pattern pattern = Pattern.compile(reg);
                Matcher matcher = pattern.matcher(allText);
                if (matcher.find()) {
                    invoice.setTotalAmountString(matcher.group("amountString"));
                    try {
                        invoice.setTotalAmount(matcher.group("amount"));
                    } catch (Exception e) {
                        invoice.setTotalAmount("0");
                    }
                }
                if (StringUtils.isEmpty(invoice.getTotalAmount())) {
                    //
                    String regx = "价税合计\\u0028⼤写\\u0029(?<amountString>\\S*)\\u0028小写\\u0029¥?(?<amount>\\S*)\\s";
                    Pattern patternx = Pattern.compile(regx);
                    Matcher matcherx = patternx.matcher(allText);
                    if (matcherx.find()) {
                        invoice.setTotalAmountString(matcherx.group("amountString"));
                        try {
                            invoice.setTotalAmount(matcherx.group("amount"));
                        } catch (Exception e) {
                            invoice.setTotalAmount("0");
                        }
                    }
                }
            }
            {
                String reg = "收款人:(?<payee>);\\S*";
                String reg1 = "复核:(?<reviewer>\\S*)|复核人:(?<reviewer1>\\S*)";
                String reg2 = "开票人:(?<drawer>\\S*)";
                Pattern pattern = Pattern.compile(reg);
                Matcher matcher = pattern.matcher(allText);
                if (matcher.find()) {
                    invoice.setPayee(matcher.group("payee"));
                }
                Pattern pattern1 = Pattern.compile(reg1);
                Matcher matchere = pattern1.matcher(allText);
                if (matchere.find()) {
                    invoice.setReviewer(TecrunUtils.nullToStr(matchere.group("reviewer")) +TecrunUtils.nullToStr(matchere.group("reviewer1")));
                }
                Pattern pattern2 = Pattern.compile(reg2);
                Matcher matcher2 = pattern2.matcher(allText);
                if (matcher2.find()) {
                    invoice.setDrawer(matcher2.group("drawer"));
                }
                if (allText.indexOf("通行费") > 0 && allText.indexOf("车牌号") > 0) {
                    invoice.setType("通行费");
                }
            }
            {

                PDFKeyWordPosition kwp = new PDFKeyWordPosition();
                Map<String, List<Position>> positionListMap = kwp
                        .getCoordinate(Arrays.asList("机器编号", "税率", "价税合计", "合计", "开票日期","项目名称", "规格型号", "车牌号", "开户行及账号","统一社会信用代码","税额","单位", "密", "码", "区"), doc);

                PDFTextStripperByArea stripper = new PDFTextStripperByArea();
                stripper.setSortByPosition(true);
                PDFTextStripperByArea detailStripper = new PDFTextStripperByArea();
                detailStripper.setSortByPosition(true);
                {
                    Position machineNumber;
                    if (positionListMap.get("机器编号").size() > 0) {
                        machineNumber = positionListMap.get("机器编号").get(0);
                    } else {
                        machineNumber = positionListMap.get("开票日期").get(0);
                        machineNumber.setY(machineNumber.getY() + 30);
                    }
                    Position taxRate = positionListMap.get("税率").get(0);
                    Position taxRateAmount = null;
                    if (CollectionUtils.isNotEmpty(positionListMap.get("税额"))) {
                        taxRateAmount = positionListMap.get("税额").get(0);
                    }
                    Position totalAmount = positionListMap.get("价税合计").get(0);
                    Position amount = positionListMap.get("合计").get(0);
                    Position model = null;
                    if (!positionListMap.get("项目名称").isEmpty()) {
                        model = positionListMap.get("项目名称").get(0);
                    } else {
                        model = positionListMap.get("车牌号").get(0);
                        model.setX(model.getX() - 15);
                    }

                    List<Position> account = positionListMap.get("统一社会信用代码");
                    Position buyer;
                    Position seller;
                    if (account.size() < 2) {
                        buyer = new Position(51, 122);
                        seller = new Position(351, 122);
                    } else {
                        buyer = account.get(0);
                        seller = account.get(1);
                    }

                    /*int maqX = 370;
                    List<Position> mi = positionListMap.get("密");
                    List<Position> ma = positionListMap.get("码");
                    List<Position> qu = positionListMap.get("区");
                    for (int i = 0; i < mi.size(); i++) {
                        float x1 = mi.get(i).getX();
                        for (int j = 0; j < ma.size(); j++) {
                            float x2 = ma.get(j).getX();
                            if (Math.abs(x1 - x2) < 5) {
                                for (int k = 0; k < qu.size(); k++) {
                                    float x3 = qu.get(k).getX();
                                    if (Math.abs(x2 - x3) < 5) {
                                        maqX = Math.round((x1 + x2 + x3) / 3);
                                    }
                                }
                            }
                        }
                    }*/
                    {

                        int x = Math.round(model.getX()+100);

                        int y = Math.round(taxRateAmount!=null?taxRateAmount.getY():0) + 5; // 用税额的y坐标作参考
                        int h = Math.round(amount.getY()) - Math.round(taxRate.getY()) -20; // 价税合计的y坐标减去税率的y坐标
                        detailStripper.addRegion("detail", new Rectangle(0, y, pageWidth, h));
                        stripper.addRegion("detailName", new Rectangle(0, y, x, h));
                        stripper.addRegion("detailPrice", new Rectangle(x, y, pageWidth, h));
                    }
                    {
//                        int x = maqX + 10;
//                        int y = Math.round(machineNumber.getY()) + 10;
//                        int w = pageWidth - maqX - 10;
//                        int h = Math.round(taxRate.getY() - 5) - y;
//                        stripper.addRegion("password", new Rectangle(x, y, w, h));

//                        int x = Math.round(buyer.getX()) - 15; // 开户行及账号的x为参考
//                        int y = Math.round(machineNumber.getY()) + 10; // 机器编号的y坐标为参考
//                        int w = maqX - x - 5; // 密码区x坐标为参考
//                        int h = Math.round(buyer.getY()) - y + 20; // 开户行及账号的y坐标为参考
//                        stripper.addRegion("buyer", new Rectangle(x, y, w, h));

//                       int x = Math.round(seller.getX()) - 15; // 开户行及账号为x参考
//                       int y = Math.round(totalAmount.getY()) + 10; // 价税合计的y坐标为参考
//                       int w = maqX - x - 5; // 密码区的x为参考
//                       int h = Math.round(seller.getY()) - y + 20; // 开户行及账号的y为参考
//                       stripper.addRegion("seller", new Rectangle(x, y, w, h));
                    }
                    {
                        int x = Math.round(buyer.getX()) - 30; // 买方 统一社会信用代码及账号的x为参考
                        int y = Math.round(buyer.getY()) - 60; // 买方 统一社会信用代码及账号的y为参考
                        int w = Math.round(seller.getX() - x - 5); //
                        int h = Math.round(buyer.getY()) - y + 60; // 买方 统一社会信用代码 开户行及账号的y坐标为参考
                        stripper.addRegion("buyer", new Rectangle(x, y, w, h));
                    }
                    {
                        int x = Math.round(seller.getX()) -30; //  购方 统一社会信用代码 开户行及账号为x参考
                        int y = Math.round(seller.getY()) - 60; // 购方 统一社会信用代码 的y坐标为参考
                        int w = pageWidth - x - 5; //
                        int h = Math.round(seller.getY()) - y + 60; // 开户行及账号的y为参考
                        stripper.addRegion("seller", new Rectangle(x, y, w, h));
                    }
                }
                stripper.extractRegions(firstPage);
                detailStripper.extractRegions(firstPage);
                doc.close();
//                invoice.setPassword(StringUtils.trim(stripper.getTextForRegion("password")));

                String reg = "名称:(?<name>\\S*)|纳税人识别号:(?<code>\\S*)|地址、电话:(?<address>\\S*)|开户行及账号:(?<account>\\S*)|电子支付标识:(?<account2>\\S*)|统一社会信用代码[\\s\\S]*?号:(?<account3>\\S*)";
                {
                    String buyer = replace(stripper.getTextForRegion("buyer"));
                    Pattern pattern = Pattern.compile(reg);
                    Matcher matcher = pattern.matcher(buyer);
                    while (matcher.find()) {
                        if (matcher.group("name") != null) {
                            invoice.setBuyerName(matcher.group("name"));
                        } else if (matcher.group("code") != null) {
                            invoice.setBuyerCode(matcher.group("code"));
                        } else if (matcher.group("address") != null) {
                            invoice.setBuyerAddress(matcher.group("address"));
                        } else if (matcher.group("account") != null) {
                            invoice.setBuyerAccount(matcher.group("account"));
                        } else if (matcher.group("account2") != null) {
                            invoice.setBuyerAccount(matcher.group("account2"));
                        }else if (matcher.group("account3") != null) {
                            invoice.setBuyerCode(matcher.group("account3"));
                        }
                    }
                }
                {
                    String seller = replace(stripper.getTextForRegion("seller"));
                    Pattern pattern = Pattern.compile(reg);
                    Matcher matcher = pattern.matcher(seller);
                    while (matcher.find()) {
                        if (matcher.group("name") != null) {
                            invoice.setSellerName(matcher.group("name"));
                        } else if (matcher.group("code") != null) {
                            invoice.setSellerCode(matcher.group("code"));
                        } else if (matcher.group("address") != null) {
                            invoice.setSellerAddress(matcher.group("address"));
                        } else if (matcher.group("account") != null) {
                            invoice.setSellerAccount(matcher.group("account"));
                        }else if (matcher.group("account3") != null) {
                            invoice.setSellerCode(matcher.group("account3"));
                        }
                    }
                }
                {
                    List<String> skipList = new ArrayList<>();
                    List<Detail> detailList = new ArrayList<>();
                    String[] detailPriceStringArray = stripper.getTextForRegion("detailPrice").replaceAll(" ", " ").replaceAll(" ", " ")
                            .replaceAll("\r", "").split("\\n");
                    for (String detailString : detailPriceStringArray) {
                        Detail detail = new Detail();
                        detail.setName("");
                        String[] itemArray = StringUtils.split(detailString, " ");
                        if (2 == itemArray.length) {
                            detail.setAmount(itemArray[0]);
                            detail.setTaxAmount(itemArray[1]);
                            detailList.add(detail);
                        } else if (2 < itemArray.length) {
                            detail.setAmount(itemArray[itemArray.length - 3]);
                            String taxRate = itemArray[itemArray.length - 2];
                            if (taxRate.indexOf("免税") > 0 || taxRate.indexOf("不征税") > 0 || taxRate.indexOf("出口零税率") > 0
                                    || taxRate.indexOf("普通零税率") > 0 || taxRate.indexOf("%") < 0) {
                                detail.setTaxRate("0");
                                detail.setTaxAmount("0");
                            } else {
                                BigDecimal rate = new BigDecimal(Integer.parseInt(taxRate.replaceAll("%", "")));
                                detail.setTaxRate(String.valueOf(rate.divide(new BigDecimal(100))));
                                detail.setTaxAmount( itemArray[itemArray.length - 1]);
                            }
                            for (int j = 0; j < itemArray.length - 3; j++) {
                                if (itemArray[j].matches("^(-?\\d+)(\\.\\d+)?$")) {
                                    if (null == detail.getCount()) {
                                        detail.setCount(itemArray[j]);
                                    } else {
                                        detail.setPrice(itemArray[j]);
                                    }
                                } else {
                                    if (itemArray.length >= j + 1 && !itemArray[j + 1].matches("^(-?\\d+)(\\.\\d+)?$")) {
                                        detail.setUnit(itemArray[j + 1]);
                                        detail.setModel(itemArray[j]);
                                        j++;
                                    } else if (itemArray[j].length() > 2) {
                                        detail.setModel(itemArray[j]);
                                    } else {
                                        detail.setUnit(itemArray[j]);
                                    }
                                }
                            }
                            detailList.add(detail);
                        } else {
                            skipList.add(detailString);
                        }
                    }

                    String[] detailNameStringArray = stripper.getTextForRegion("detailName").replaceAll(" ", " ").replaceAll(" ", " ")
                            .replaceAll("\r", "").split("\\n");
                    String[] detailStringArray = replace(detailStripper.getTextForRegion("detail")).replaceAll("\r", "").split("\\n");
                    int i = 0, j = 0, h = 0, m = 0;
                    Detail lastDetail = null;
                    for (String detailString : detailStringArray) {
                        if (m < detailNameStringArray.length) {
                            if (detailString.matches("\\S+\\d*(%|免税|不征税|出口零税率|普通零税率)\\S*")
                                    && !detailString.matches("^ *\\d*(%|免税|不征税|出口零税率|普通零税率)\\S*")
                                    && detailString.matches("\\S+\\d+%[\\-\\d]+\\S*")
                                    || detailStringArray.length > i + 1
                                    && detailStringArray[i + 1].matches("^ *\\d*(%|免税|不征税|出口零税率|普通零税率)\\S*")) {
                                if (j < detailList.size()) {
                                    lastDetail = detailList.get(j);
                                    lastDetail.setName(detailNameStringArray[m]);
                                }
                                j++;
                            } else if (null != lastDetail && StringUtils.isNotBlank(detailNameStringArray[m])) {
                                if (skipList.size() > h) {
                                    String skip = skipList.get(h);
                                    if (detailString.endsWith(skip)) {
                                        if (detailString.equals(skip)) {
                                            m--;
                                        } else {
                                            lastDetail.setName(lastDetail.getName() + detailNameStringArray[m]);
                                        }
                                        lastDetail.setModel(lastDetail.getModel() + skip);
                                        h++;
                                    } else {
                                        lastDetail.setName(lastDetail.getName() + detailNameStringArray[m]);
                                    }
                                } else {
                                    lastDetail.setName(lastDetail.getName() + detailNameStringArray[m]);
                                }
                            }
                        }
                        i++;
                        m++;
                    }
                    invoice.setDetailList(detailList);
                }
            }




        } else { ///电子普通发票    电子专用发票
            Pattern typetong = Pattern.compile("(?<p>\\S*)通发票");
            Matcher matchertong = typetong.matcher(allText);

            Pattern typeyong = Pattern.compile("(?<p>\\S*)用发票");
            Matcher matcheryong = typeyong.matcher(allText);
            boolean flag=false;
            if(matchertong.find() ){///通发票
                flag=true;
            }
            if(matcheryong.find() ){///用发票
                flag=true;
            }
            if (!flag){
                return invoice;
            }


            {
                String reg = "机器编号:(?<machineNumber>\\d{12})|发票代码:(?<code>\\d{12})|发票号码:(?<number>\\d{8})|:(?<date>\\d{4}年\\d{2}月\\d{2}日)"
                        + "|校验码:(?<checksum>\\d{20}|\\S{4,})";
                Pattern pattern = Pattern.compile(reg);
                Matcher matcher = pattern.matcher(allText);
                while (matcher.find()) {
                    if (matcher.group("machineNumber") != null) {
                        invoice.setMachineNumber(matcher.group("machineNumber"));
                    } else if (matcher.group("code") != null) {
                        invoice.setCode(matcher.group("code"));
                    } else if (matcher.group("number") != null) {
                        invoice.setNumber(matcher.group("number"));
                    } else if (matcher.group("date") != null) {
                        invoice.setDate(matcher.group("date"));
                    } else if (matcher.group("checksum") != null) {
                        invoice.setChecksum(matcher.group("checksum"));
                    }
                }
                if (StringUtils.isBlank(invoice.getDate())) {
                    String kprqGrex = "开票日期:(\\d.*)日";
                    Pattern compile = Pattern.compile(kprqGrex);
                    Matcher matcher2 = compile.matcher(allText);
                    if (matcher2.find()) {
                        try {
                            invoice.setDate(matcher2.group(1));
                        } catch (Exception e) {

                        }
                    }
                }
            }
            {
                String reg = "合计¥?(?<amount>[^ \\f\\n\\r\\t\\v\\*]*)(?:¥?(?<taxAmount>\\S*)|\\*+)\\s";
                Pattern pattern = Pattern.compile(reg);
                Matcher matcher = pattern.matcher(allText);
                if (matcher.find()) {
                    try {
                        invoice.setAmount(matcher.group("amount"));
                    } catch (Exception e) {
                    }
                    try {
                        invoice.setTaxAmount(matcher.group("taxAmount"));
                    } catch (Exception e) {
                        invoice.setTaxAmount("0");
                    }
                }
            }
            try {
                String amountTest = "^\\d+(\\.\\d+)?$";
                Pattern pattern = Pattern.compile(amountTest);
                Matcher matcher = pattern.matcher(invoice.getAmount());
                if (!matcher.find()) {
                    invoice.setAmount(null);
                }
            } catch (Exception e) {
            }
            if (null == invoice.getAmount()) {
                String reg = "合\\u0020*计\\u0020*¥?(?<amount>[^ ]*)\\u0020+¥?(?:(?<taxAmount>\\S*)|\\*+)\\s";
                Pattern pattern = Pattern.compile(reg);
                Matcher matcher = pattern.matcher(fullText);
                if (matcher.find()) {
                    try {
                        invoice.setAmount(matcher.group("amount"));
                        if (StringUtils.isNotBlank(invoice.getAmount())) {
                            String grex = "¥(-?\\d*.?\\d+)";
                            Pattern pattern1 = Pattern.compile(grex);
                            Matcher matcher2 = pattern1.matcher(invoice.getAmount());
                            if (matcher2.find()) {
                                invoice.setAmount(matcher2.group(1));
                            }
                        }
                    } catch (Exception e) {
                        invoice.setAmount("0");
                    }
                    try {
                        invoice.setTaxAmount(matcher.group("taxAmount"));
                        if (StringUtils.isNotBlank(invoice.getTaxAmount())) {
                            String grex = "¥(-?\\d*.?\\d+)";
                            Pattern pattern1 = Pattern.compile(grex);
                            Matcher matcher2 = pattern1.matcher(invoice.getTaxAmount());
                            if (matcher2.find()) {
                                invoice.setTaxAmount(matcher2.group(1));
                            }
                        }
                    } catch (Exception e) {
                        invoice.setTaxAmount("0");
                    }
                }
            }
            {
                String reg = "价税合计\\u0028大写\\u0029(?<amountString>\\S*)\\u0028小写\\u0029¥?(?<amount>\\S*)\\s";
                Pattern pattern = Pattern.compile(reg);
                Matcher matcher = pattern.matcher(allText);
                if (matcher.find()) {
                    invoice.setTotalAmountString(matcher.group("amountString"));
                    try {
                        invoice.setTotalAmount(matcher.group("amount"));
                    } catch (Exception e) {
                        invoice.setTotalAmount("0");
                    }
                }
            }
            {
                String reg = "收款人:(?<payee>\\S*)复核:(?<reviewer>\\S*)开票人:(?<drawer>\\S*)销售方";
                Pattern pattern = Pattern.compile(reg);
                Matcher matcher = pattern.matcher(allText);
                if (matcher.find()) {
                    invoice.setPayee(matcher.group("payee"));
                    invoice.setReviewer(matcher.group("reviewer"));
                    invoice.setDrawer(matcher.group("drawer"));
                }
                if (allText.indexOf("通行费") > 0 && allText.indexOf("车牌号") > 0) {
                    invoice.setType("通行费");
                }
                Pattern type00Pattern = Pattern.compile("(?<p>\\S*)通发票");
                Matcher m00 = type00Pattern.matcher(allText);
                if (m00.find()) {
                    invoice.setTitle(m00.group("p").replaceAll("(?:国|统|一|发|票|监|制)", "") + "通发票");
                    if (null == invoice.getType()) {
                        invoice.setType("普通发票");
                        invoice.setFormat("202");
                    }
                } else {
                    Pattern type01Pattern = Pattern.compile("(?<p>\\S*)用发票");
                    Matcher m01 = type01Pattern.matcher(allText);
                    if (m01.find()) {
                        invoice.setTitle(m01.group("p").replaceAll("(?:国|统|一|发|票|监|制)", "") + "用发票");
                        if (null == invoice.getType()) {
                            invoice.setType("专用发票");
                            invoice.setFormat("201");
                        }
                    }
                }
            }

            PDFKeyWordPosition kwp = new PDFKeyWordPosition();
            Map<String, List<Position>> positionListMap = kwp
                    .getCoordinate(Arrays.asList("机器编号", "税率", "价税合计", "合计", "开票日期", "规格型号", "车牌号", "开户行及账号", "密", "码", "区"), doc);

            PDFTextStripperByArea stripper = new PDFTextStripperByArea();
            stripper.setSortByPosition(true);
            PDFTextStripperByArea detailStripper = new PDFTextStripperByArea();
            detailStripper.setSortByPosition(true);
            {
                Position machineNumber;
                if (positionListMap.get("机器编号").size() > 0) {
                    machineNumber = positionListMap.get("机器编号").get(0);
                } else {
                    machineNumber = positionListMap.get("开票日期").get(0);
                    machineNumber.setY(machineNumber.getY() + 30);
                }
                Position taxRate = positionListMap.get("税率").get(0);
                Position totalAmount = positionListMap.get("价税合计").get(0);
                Position amount = positionListMap.get("合计").get(0);
                Position model = null;
                if (!positionListMap.get("规格型号").isEmpty()) {
                    model = positionListMap.get("规格型号").get(0);
                } else {
                    model = positionListMap.get("车牌号").get(0);
                    model.setX(model.getX() - 15);
                }

                List<Position> account = positionListMap.get("开户行及账号");
                Position buyer;
                Position seller;
                if (account.size() < 2) {
                    buyer = new Position(51, 122);
                    seller = new Position(51, 341);
                } else {
                    buyer = account.get(0);
                    seller = account.get(1);
                }

                int maqX = 370;
                List<Position> mi = positionListMap.get("密");
                List<Position> ma = positionListMap.get("码");
                List<Position> qu = positionListMap.get("区");
                for (int i = 0; i < mi.size(); i++) {
                    float x1 = mi.get(i).getX();
                    for (int j = 0; j < ma.size(); j++) {
                        float x2 = ma.get(j).getX();
                        if (Math.abs(x1 - x2) < 5) {
                            for (int k = 0; k < qu.size(); k++) {
                                float x3 = qu.get(k).getX();
                                if (Math.abs(x2 - x3) < 5) {
                                    maqX = Math.round((x1 + x2 + x3) / 3);
                                }
                            }
                        }
                    }
                }
                {
                    int x = Math.round(model.getX()) - 13;
                    int y = Math.round(taxRate.getY()) + 5; // 用税率的y坐标作参考
                    int h = Math.round(amount.getY()) - Math.round(taxRate.getY()) - 25; // 价税合计的y坐标减去税率的y坐标
                    detailStripper.addRegion("detail", new Rectangle(0, y, pageWidth, h));
                    stripper.addRegion("detailName", new Rectangle(0, y, x, h));
                    stripper.addRegion("detailPrice", new Rectangle(x, y, pageWidth, h));
                }
                {
                    int x = maqX + 10;
                    int y = Math.round(machineNumber.getY()) + 10;
                    int w = pageWidth - maqX - 10;
                    int h = Math.round(taxRate.getY() - 5) - y;
                    stripper.addRegion("password", new Rectangle(x, y, w, h));
                }
                {
                    int x = Math.round(buyer.getX()) - 15; // 开户行及账号的x为参考
                    int y = Math.round(machineNumber.getY()) + 10; // 机器编号的y坐标为参考
                    int w = maqX - x - 5; // 密码区x坐标为参考
                    int h = Math.round(buyer.getY()) - y + 20; // 开户行及账号的y坐标为参考
                    stripper.addRegion("buyer", new Rectangle(x, y, w, h));
                }
                {
                    int x = Math.round(seller.getX()) - 15; // 开户行及账号为x参考
                    int y = Math.round(totalAmount.getY()) + 10; // 价税合计的y坐标为参考
                    int w = maqX - x - 5; // 密码区的x为参考
                    int h = Math.round(seller.getY()) - y + 20; // 开户行及账号的y为参考
                    stripper.addRegion("seller", new Rectangle(x, y, w, h));
                }
            }
            stripper.extractRegions(firstPage);
            detailStripper.extractRegions(firstPage);
            doc.close();

            invoice.setPassword(StringUtils.trim(stripper.getTextForRegion("password")));

            String reg = "名称:(?<name>\\S*)|纳税人识别号:(?<code>\\S*)|地址、电话:(?<address>\\S*)|开户行及账号:(?<account>\\S*)|电子支付标识:(?<account2>\\S*)";
            {
                String buyer = replace(stripper.getTextForRegion("buyer"));
                Pattern pattern = Pattern.compile(reg);
                Matcher matcher = pattern.matcher(buyer);
                while (matcher.find()) {
                    if (matcher.group("name") != null) {
                        invoice.setBuyerName(matcher.group("name"));
                    } else if (matcher.group("code") != null) {
                        invoice.setBuyerCode(matcher.group("code"));
                    } else if (matcher.group("address") != null) {
                        invoice.setBuyerAddress(matcher.group("address"));
                    } else if (matcher.group("account") != null) {
                        invoice.setBuyerAccount(matcher.group("account"));
                    } else if (matcher.group("account2") != null) {
                        invoice.setBuyerAccount(matcher.group("account2"));
                    }
                }
            }
            {
                String seller = replace(stripper.getTextForRegion("seller"));
                Pattern pattern = Pattern.compile(reg);
                Matcher matcher = pattern.matcher(seller);
                while (matcher.find()) {
                    if (matcher.group("name") != null) {
                        invoice.setSellerName(matcher.group("name"));
                    } else if (matcher.group("code") != null) {
                        invoice.setSellerCode(matcher.group("code"));
                    } else if (matcher.group("address") != null) {
                        invoice.setSellerAddress(matcher.group("address"));
                    } else if (matcher.group("account") != null) {
                        invoice.setSellerAccount(matcher.group("account"));
                    }
                }
            }
            {
                List<String> skipList = new ArrayList<>();
                List<Detail> detailList = new ArrayList<>();
                String[] detailPriceStringArray = stripper.getTextForRegion("detailPrice").replaceAll(" ", " ").replaceAll(" ", " ")
                        .replaceAll("\r", "").split("\\n");
                for (String detailString : detailPriceStringArray) {
                    Detail detail = new Detail();
                    detail.setName("");
                    String[] itemArray = StringUtils.split(detailString, " ");
                    if (2 == itemArray.length) {
                        detail.setAmount(itemArray[0]);
                        detail.setTaxAmount(itemArray[1]);
                        detailList.add(detail);
                    } else if (2 < itemArray.length) {
                        detail.setAmount(itemArray[itemArray.length - 3]);
                        String taxRate = itemArray[itemArray.length - 2];
                        if (taxRate.indexOf("免税") > 0 || taxRate.indexOf("不征税") > 0 || taxRate.indexOf("出口零税率") > 0
                                || taxRate.indexOf("普通零税率") > 0 || taxRate.indexOf("%") < 0) {
                            detail.setTaxRate("0");
                            detail.setTaxAmount("0");
                        } else {
                            BigDecimal rate = new BigDecimal(Integer.parseInt(taxRate.replaceAll("%", "")));
                            detail.setTaxRate(String.valueOf(rate.divide(new BigDecimal(100))));
                            detail.setTaxAmount( itemArray[itemArray.length - 1]);
                        }
                        for (int j = 0; j < itemArray.length - 3; j++) {
                            if (itemArray[j].matches("^(-?\\d+)(\\.\\d+)?$")) {
                                if (null == detail.getCount()) {
                                    detail.setCount(itemArray[j]);
                                } else {
                                    detail.setPrice(itemArray[j]);
                                }
                            } else {
                                if (itemArray.length >= j + 1 && !itemArray[j + 1].matches("^(-?\\d+)(\\.\\d+)?$")) {
                                    detail.setUnit(itemArray[j + 1]);
                                    detail.setModel(itemArray[j]);
                                    j++;
                                } else if (itemArray[j].length() > 2) {
                                    detail.setModel(itemArray[j]);
                                } else {
                                    detail.setUnit(itemArray[j]);
                                }
                            }
                        }
                        detailList.add(detail);
                    } else {
                        skipList.add(detailString);
                    }
                }

                String[] detailNameStringArray = stripper.getTextForRegion("detailName").replaceAll(" ", " ").replaceAll(" ", " ")
                        .replaceAll("\r", "").split("\\n");
                String[] detailStringArray = replace(detailStripper.getTextForRegion("detail")).replaceAll("\r", "").split("\\n");
                int i = 0, j = 0, h = 0, m = 0;
                Detail lastDetail = null;
                for (String detailString : detailStringArray) {
                    if (m < detailNameStringArray.length) {
                        if (detailString.matches("\\S+\\d*(%|免税|不征税|出口零税率|普通零税率)\\S*")
                                && !detailString.matches("^ *\\d*(%|免税|不征税|出口零税率|普通零税率)\\S*")
                                && detailString.matches("\\S+\\d+%[\\-\\d]+\\S*")
                                || detailStringArray.length > i + 1
                                && detailStringArray[i + 1].matches("^ *\\d*(%|免税|不征税|出口零税率|普通零税率)\\S*")) {
                            if (j < detailList.size()) {
                                lastDetail = detailList.get(j);
                                lastDetail.setName(detailNameStringArray[m]);
                            }
                            j++;
                        } else if (null != lastDetail && StringUtils.isNotBlank(detailNameStringArray[m])) {
                            if (skipList.size() > h) {
                                String skip = skipList.get(h);
                                if (detailString.endsWith(skip)) {
                                    if (detailString.equals(skip)) {
                                        m--;
                                    } else {
                                        lastDetail.setName(lastDetail.getName() + detailNameStringArray[m]);
                                    }
                                    lastDetail.setModel(lastDetail.getModel() + skip);
                                    h++;
                                } else {
                                    lastDetail.setName(lastDetail.getName() + detailNameStringArray[m]);
                                }
                            } else {
                                lastDetail.setName(lastDetail.getName() + detailNameStringArray[m]);
                            }
                        }
                    }
                    i++;
                    m++;
                }
                invoice.setDetailList(detailList);
            }

        }
        return invoice;
    }

    /**
     * Fallback that fills in basic invoice fields from the raw page text when the
     * invoice number has not been extracted yet.
     *
     * <p>Nothing is changed when the invoice already has a non-blank number, or when
     * {@code allText} does not start with at least 12 digits. Otherwise:
     * <ul>
     *   <li>code := the 12 digits at the very start of {@code allText};</li>
     *   <li>number := the 8 digits at the very start of {@code allText};</li>
     *   <li>date := first occurrence of a {@code dddd年dd月dd日} date, only when the
     *       invoice date is still blank;</li>
     *   <li>checksum := characters 13..32 of a leading run of 32 digits, if present.</li>
     * </ul>
     *
     * @param invoice invoice to complete; a {@code null} invoice is returned as-is
     * @param allText full text extracted from the page; {@code null} or blank text
     *                leaves the invoice untouched
     * @return the same {@code invoice} instance that was passed in
     */
    private static Invoice parseGenerayInvoiceHandle(Invoice invoice, String allText) {
        // Nothing can be populated on a null invoice, and nothing can be parsed from empty text.
        if (Objects.isNull(invoice) || StringUtils.isBlank(allText)) {
            return invoice;
        }
        // The regular extraction path already succeeded; do not overwrite its result.
        if (StringUtils.isNotBlank(invoice.getNumber())) {
            return invoice;
        }

        // NOTE(review): both patterns are anchored at the start of the text (no MULTILINE flag),
        // so the number is always the first 8 digits of the 12-digit code. Kept as-is because
        // the expected text layout is not visible here; confirm against real samples.
        Matcher codeMatcher = Pattern.compile("^\\d{12}").matcher(allText);
        Matcher numberMatcher = Pattern.compile("^\\d{8}").matcher(allText);
        if (!codeMatcher.find() || !numberMatcher.find()) {
            return invoice;
        }
        invoice.setNumber(numberMatcher.group());
        invoice.setCode(codeMatcher.group());

        if (StringUtils.isBlank(invoice.getDate())) {
            Matcher dateMatcher = Pattern.compile("\\d{4}年\\d{2}月\\d{2}日").matcher(allText);
            if (dateMatcher.find()) {
                // The pattern has no capturing group, so group(1) would throw; the whole
                // match (group 0) is the date string.
                invoice.setDate(dateMatcher.group());
            }
        }

        Matcher checksumMatcher = Pattern.compile("^\\d{32}").matcher(allText);
        if (checksumMatcher.find()) {
            // Drop the leading 12 digits and keep the remaining 20 as the checksum.
            invoice.setChecksum(checksumMatcher.group().substring(12));
        }
        return invoice;
    }

    /**
     * Parses a PDF file by delegating to {@code extracts} with {@code 1} as its first argument.
     *
     * <p>The original note on this method describes it as parsing the first page of the PDF.
     *
     * @param file the PDF file to parse
     * @return whatever {@code extracts(1, file)} returns
     * @throws IOException propagated from {@code extracts}
     */
    public static Invoice extract(File file) throws IOException {
        Invoice result = extracts(1, file);
        return result;
    }

    /**
     * Normalizes text extracted from a PDF region so that the label regexes, which use an
     * ASCII colon and no whitespace, can match it.
     *
     * <p>Removes ASCII spaces, no-break spaces (U+00A0) and ideographic spaces (U+3000), and
     * converts the full-width colon (U+FF1A) into an ASCII colon. The special characters are
     * written as Unicode escapes so they survive copy/paste and encoding normalization.
     * Other whitespace (tabs, line breaks) is left untouched.
     *
     * @param str text to normalize; must not be {@code null}
     * @return the normalized text
     */
    public static String replace(String str) {
        return str.replace(" ", "")
                .replace("\u00A0", "")
                .replace("\u3000", "")
                .replace('\uFF1A', ':');
    }

    /**
     * Manual smoke run against a hard-coded local PDF: prints every invoice returned by
     * {@code PdfInvoiceExtractor.extractList}, then the result of {@code extracts(1, file)},
     * all to standard error.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        File file = new File("D:\\Desktop\\16.1.pdf");
        try {
            List<Invoice> invoices = PdfInvoiceExtractor.extractList(file);
            for (Invoice invoice : invoices) {
                System.err.println(invoice);
                System.err.println("---------");
            }
            System.err.println("--------------");
            System.err.println(extracts(1, file));
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值