Java实现电子发票关键信息提取(二)

一、写入Excel表格

  • EntityWriteToExcel工具类:
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;

import java.io.FileOutputStream;
import java.io.IOException;
import java.lang.reflect.Field;
import java.util.List;


/**
 * Utility that writes a list of entity objects into a single-sheet {@code .xlsx} workbook:
 * one header row built from caller-supplied titles, then one row per entity whose cells are
 * the {@code toString()} values of the entity's declared instance fields (read via reflection).
 *
 * <p>I/O and reflection failures are reported with {@code printStackTrace()} and are not
 * propagated to the caller.
 */
public class EntityWriteToExcel {

    /**
     * Name of the only sheet created in the workbook.
     */
    private final static String DEFAULT_SHEET_NAME = "EntityData";

    /**
     * Zero-based index of the first data row (directly below the header row).
     */
    private final static int DEFAULT_ROW_NUMBER = 1;


    /**
     * Zero-based index of the header (title) row.
     */
    private final static int DEFAULT_TITLE_ROW_NUMBER = 0;


    /**
     * Writes the given entities to an Excel file.
     *
     * <p>The titles are written as-is into the header row; they are not matched against field
     * names, so the caller must list them in the same order as the entity's instance fields.
     *
     * @param entities     entities to export, one row each; a {@code null} element produces an empty row
     * @param titles       header cell texts, left to right
     * @param saveFilePath path of the {@code .xlsx} file to create or overwrite
     * @param <T>          entity type
     */
    public static <T> void writeEntitiesToExcel(List<T> entities, List<String> titles, String saveFilePath) {
        try (Workbook workbook = new XSSFWorkbook()) {
            Sheet sheet = workbook.createSheet(DEFAULT_SHEET_NAME);

            createHeaderRow(sheet, titles);
            int rowNum = DEFAULT_ROW_NUMBER;

            for (Object entity : entities) {
                createDataRow(sheet, entity, rowNum++);
            }

            writeWorkbookToFile(workbook, saveFilePath);
        } catch (IOException e) {
            // Only reachable when closing the workbook fails; write errors are handled
            // inside writeWorkbookToFile.
            e.printStackTrace();
        }
    }


    /**
     * Creates the header row of the sheet.
     *
     * @param sheet  sheet to write into
     * @param titles header cell texts, left to right
     */
    private static void createHeaderRow(Sheet sheet, List<String> titles) {
        Row headerRow = sheet.createRow(DEFAULT_TITLE_ROW_NUMBER);
        for (int i = 0; i < titles.size(); i++) {
            headerRow.createCell(i).setCellValue(titles.get(i));
        }
    }


    /**
     * Creates one data row and fills it with the entity's instance-field values.
     *
     * <p>Static and compiler-generated (synthetic) fields are skipped so that constants such
     * as {@code serialVersionUID} or fields injected by tooling do not shift the data columns
     * away from the header titles. Columns follow the order returned by
     * {@link Class#getDeclaredFields()}; the JDK does not formally guarantee that order.
     * Fields inherited from superclasses are not exported.
     *
     * @param sheet  sheet to write into
     * @param entity entity to export; {@code null} leaves the row empty
     * @param rowNum zero-based row index
     */
    private static void createDataRow(Sheet sheet, Object entity, int rowNum) {
        Row row = sheet.createRow(rowNum);
        if (entity == null) {
            // Keep the row so later rows stay aligned with the input list positions.
            return;
        }

        int column = 0;
        for (Field field : entity.getClass().getDeclaredFields()) {
            if (field.isSynthetic() || java.lang.reflect.Modifier.isStatic(field.getModifiers())) {
                continue;
            }
            // Reserve the column even when the value turns out to be null or unreadable,
            // so the remaining fields still line up with their titles.
            int cellIndex = column++;
            field.setAccessible(true);
            try {
                Object value = field.get(entity);
                if (value != null) {
                    row.createCell(cellIndex).setCellValue(value.toString());
                }
            } catch (IllegalAccessException e) {
                e.printStackTrace();
            }
        }
    }


    /**
     * Writes the workbook to the given file path.
     *
     * @param workbook     workbook to serialise
     * @param saveFilePath destination file path
     */
    private static void writeWorkbookToFile(Workbook workbook, String saveFilePath) {
        try (FileOutputStream outputStream = new FileOutputStream(saveFilePath)) {
            workbook.write(outputStream);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}

二、合并提取代码

  • 基于前文完成的代码,我们来看看如何调用它们
  • 旧版发票信息提取:
/**
     * Manual test for the regex-based extraction of classic (old-layout) invoices: loads the
     * invoices via {@code pdfProcessInvoicesInFolder}, copies the fields of interest into
     * {@code InvoiceSubset} rows, exports the rows to Excel and passes the original list to
     * {@code printListInvoice}.
     */
    @Test
    void invoiceExtractorTest() {
        // Folder to process; the commented lines are alternative sample folders.
        //String folderPath = "C:\\Users\\temp\\已用\\pdf";
        //String folderPath = "C:\\Users\\temp\\已用\\部分问题发票";
        String folderPath = "C:\\Users\\temp\\已用\\正常识别发票";

        // Header cells, listed in the same order as the values copied into InvoiceSubset below.
        String[] headerTexts = {
                //"文件绝对路径",  // optional absolute-path column; enable together with the matching InvoiceSubset member
                "发票代码",
                "发票号码",
                "电子发票号码",
                "开票日期",
                "校验码后六位",
                "开票方名称",
                "金额",
                "税额",
                "合计"
        };
        List<String> titles = new ArrayList<>();
        for (String headerText : headerTexts) {
            titles.add(headerText);
        }

        List<Invoice> invoices = pdfProcessInvoicesInFolder(folderPath);
        List<InvoiceSubset> invoiceSubsets = new ArrayList<>();
        for (Invoice source : invoices) {
            InvoiceSubset row = new InvoiceSubset();
            //row.setFileAbsolutePath(source.getFileAbsolutePath());  // optional; enable together with the matching InvoiceSubset member
            row.setCode(source.getCode());
            row.setNumber(source.getNumber());
            // Classic invoices carry no electronic-invoice number.
            row.setNewNumber(null);
            row.setDate(source.getDate());
            row.setChecksum(source.getChecksum());
            row.setSellerName(source.getSellerName());
            row.setAmount(source.getAmount());
            row.setTaxAmount(source.getTaxAmount());
            row.setTotalAmount(source.getTotalAmount());
            invoiceSubsets.add(row);
        }

        EntityWriteToExcel.writeEntitiesToExcel(invoiceSubsets, titles, "output_error_invoice.xlsx");
        printListInvoice(invoices, "string");
    }
  • 新版发票信息提取: 
/**
     * Manual test for the regex-based extraction of new-layout invoices: loads the invoices
     * via {@code newPdfProcessInvoicesInFolder}, copies the fields of interest into
     * {@code InvoiceSubset} rows, exports the rows to Excel and passes the original list to
     * {@code newPrintListInvoice}.
     */
    @Test
    void newInvoiceExtractorTest() {
        // Folder to process.
        String folderPath = "C:\\Users\\temp\\已用\\新发票";

        // Header cells, listed in the same order as the values copied into InvoiceSubset below.
        String[] headerTexts = {
                //"文件绝对路径",  // optional absolute-path column; enable together with the matching InvoiceSubset member
                "发票代码",
                "发票号码",
                "电子发票号码",
                "开票日期",
                "校验码后六位",
                "开票方名称",
                "金额",
                "税额",
                "合计"
        };
        List<String> titles = new ArrayList<>();
        for (String headerText : headerTexts) {
            titles.add(headerText);
        }

        List<NewInvoice> invoices = newPdfProcessInvoicesInFolder(folderPath);
        List<InvoiceSubset> invoiceSubsets = new ArrayList<>();
        for (NewInvoice source : invoices) {
            InvoiceSubset row = new InvoiceSubset();
            //row.setFileAbsolutePath(source.getFileAbsolutePath());  // optional; enable together with the matching InvoiceSubset member
            // Code, classic number and checksum are left empty for new-layout invoices;
            // the invoice number goes into the electronic-invoice-number column instead.
            row.setCode(null);
            row.setNumber(null);
            row.setNewNumber(source.getNumber());
            row.setDate(source.getDate());
            row.setChecksum(null);
            row.setSellerName(source.getSellerName());
            row.setAmount(source.getAmount());
            row.setTaxAmount(source.getTaxAmount());
            row.setTotalAmount(source.getTotalAmount());
            invoiceSubsets.add(row);
        }

        EntityWriteToExcel.writeEntitiesToExcel(invoiceSubsets, titles, "output_new_invoice.xlsx");
        newPrintListInvoice(invoices, "string");
    }
  • ofd格式全电发票,需要先将ofd转换成pdf格式的全电发票,然后再调用旧版发票信息提取代码即可(因为目前ofd还是以旧版全电发票样式为主):
   /**
     * Converts every {@code .ofd} file located directly in {@code dirPath} into a PDF with the
     * same base name in the same folder, then deletes the source OFD file.
     *
     * <p>Sub-directories are not traversed. A file whose conversion fails is logged and kept.
     *
     * @param dirPath folder to scan; nothing happens when it cannot be listed
     */
    private static void ofdToPdfConverter(String dirPath){
        File[] files = new File(dirPath).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            return;
        }

        for (File file : files) {
            if (!file.isFile() || !file.getName().toLowerCase().endsWith(".ofd")) {
                continue;
            }

            String absolutePath = file.getAbsolutePath();
            Path ofdPath = Paths.get(absolutePath);
            // Same location and base name, extension swapped to .pdf.
            Path pdfPath = Paths.get(absolutePath.substring(0, absolutePath.lastIndexOf(".")) + ".pdf");

            try (OFDExporter exporter = new PDFExporterPDFBox(ofdPath, pdfPath)) {
                exporter.export();
            } catch (IOException e) {
                // Keep the source file when export or close fails, and preserve the cause.
                log.error("ofd转换失败~", e);
                continue;
            }

            // Delete only after the exporter has been closed without error, and warn when
            // the deletion itself fails (File.delete() returns false on failure).
            if (!file.delete()) {
                log.warn("OFD文件删除时出现异常~");
            }
        }
    }
  •  完成了上述操作,那么我们可以将它们进行合并,实现一行代码提取发票信息:
import com.xjl.common.utils.excel.util.EntityWriteToExcel;
import com.xjl.common.utils.invoice.extractor.util.NewInvoiceExtractor;
import com.xjl.common.utils.invoice.extractor.util.invoice.extractor.dependent.entity.Invoice;
import com.xjl.common.utils.invoice.extractor.util.invoice.extractor.dependent.entity.InvoiceSubset;
import com.xjl.common.utils.invoice.extractor.util.invoice.extractor.dependent.entity.NewInvoice;
import lombok.extern.slf4j.Slf4j;
import org.ofdrw.converter.export.OFDExporter;
import org.ofdrw.converter.export.PDFExporterPDFBox;

import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

import static com.xjl.common.utils.invoice.extractor.util.InvoiceExtractor.pdfProcessInvoicesInFolder;

@Slf4j
public class InvoiceMergeExtraction {
    /**
     * 提取发票信息
     */
    public static List<InvoiceSubset> extractInvoiceInformation(String dirPath, String outputPath) {
        ofdToPdfConverter(dirPath);

        List<Invoice> invoices = pdfProcessInvoicesInFolder(dirPath);
        List<InvoiceSubset> invoiceSubsets = new ArrayList<>();

        for (Invoice invoice : invoices) {
            InvoiceSubset subset = createInvoiceSubsetFromInvoice(invoice);
            if (subset.getCode() == null) {
                NewInvoice newInvoice = NewInvoiceExtractor.newPdfProcessInvoicesInFile(invoice.getFileAbsolutePath());
                InvoiceSubset newInvoiceSubset = extractCodeFromNewMethod(newInvoice);
                if (newInvoiceSubset.getNewNumber() != null) {
                    subset = newInvoiceSubset;
                } else {
                    log.error("发票信息异常的文件:{}", invoice.getFileAbsolutePath());
                    continue;
                }
            }
            invoiceSubsets.add(subset);
        }

        excelFileWriting(invoiceSubsets, outputPath);

        return invoiceSubsets;
    }


    /**
     * ofd格式发票转换pdf格式方法
     */
    private static void ofdToPdfConverter(String dirPath){
        File folder = new File(dirPath);
        File[] files = folder.listFiles();

        if (files != null){
            for (File file : files) {
                if (file.isFile() && file.getName().toLowerCase().endsWith(".ofd")){
                    String absolutePath = file.getAbsolutePath();

                    Path ofdPath = Paths.get(absolutePath);
                    Path pdfPath = Paths.get(absolutePath.substring(0, absolutePath.lastIndexOf(".")) + ".pdf");

                    try (OFDExporter exporter = new PDFExporterPDFBox(ofdPath, pdfPath)) {
                        exporter.export();
                        if (file.delete()){ log.warn("OFD文件删除时出现异常~"); }
                    } catch (IOException e) {
                        log.error("ofd转换失败~");
                    }
                }
            }
        }
    }


    /**
     * 旧版发票
     */
    private static InvoiceSubset createInvoiceSubsetFromInvoice(Invoice invoice) {
        InvoiceSubset subset = new InvoiceSubset();

        //subset.setFileAbsolutePath(newInvoice.getFileAbsolutePath());  // 保存文件绝对路径,如果不需要则自行注释代码(注意需要同时注释实体类InvoiceSubset)
        subset.setCode(invoice.getCode());
        subset.setNumber(invoice.getNumber());
        subset.setNewNumber(null);
        subset.setDate(invoice.getDate());
        subset.setChecksum(invoice.getChecksum());
        subset.setSellerName(invoice.getSellerName());
        subset.setAmount(invoice.getAmount());
        subset.setTaxAmount(invoice.getTaxAmount());
        subset.setTotalAmount(invoice.getTotalAmount());
        return subset;
    }


    /**
     * 新版发票
     */
    private static InvoiceSubset extractCodeFromNewMethod(NewInvoice newInvoice) {
        InvoiceSubset subset = new InvoiceSubset();

        //subset.setFileAbsolutePath(newInvoice.getFileAbsolutePath());  // 保存文件绝对路径,如果不需要则自行注释代码(注意需要同时注释实体类InvoiceSubset)
        subset.setCode(null);
        subset.setNumber(null);
        subset.setNewNumber(newInvoice.getNumber());
        subset.setDate(newInvoice.getDate());
        subset.setChecksum(null);
        subset.setSellerName(newInvoice.getSellerName());
        subset.setAmount(newInvoice.getAmount());
        subset.setTaxAmount(newInvoice.getTaxAmount());
        subset.setTotalAmount(newInvoice.getTotalAmount());
        return subset;
    }


    /**
     * 写入Excel
     */
    private static void excelFileWriting(List<InvoiceSubset> invoiceSubsets, String outputPath){
        List<String> titles = new ArrayList<>();
        //titles.add("文件绝对路径");  // 保存文件绝对路径,如果不需要则自行注释代码(注意需要同时注释实体类InvoiceSubset)
        titles.add("发票代码");
        titles.add("发票号码");
        titles.add("电子发票号码");
        titles.add("开票日期");
        titles.add("校验码后六位");
        titles.add("开票方名称");
        titles.add("金额");
        titles.add("税额");
        titles.add("合计");

        try {
            EntityWriteToExcel.writeEntitiesToExcel(invoiceSubsets, titles, outputPath);
        } catch (Exception e) {
            log.error("写入excel文件时出现错误");
        }
    }

}
  • 一行代码实现发票信息提取:
// Example call: process every invoice in the folder and write the result to an Excel file.
extractInvoiceInformation("C:\\Users\\temp\\已用\\所有类型发票", "output_test_invoice.xlsx");

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

这小鱼在乎

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值