一、写入Excel表格
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import java.io.FileOutputStream;
import java.io.IOException;
import java.lang.reflect.Field;
import java.util.List;
/**
 * Writes a list of entity objects into a single-sheet {@code .xlsx} workbook.
 *
 * <p>Row 0 holds the caller-supplied titles; every following row holds one
 * entity, one cell per instance field, rendered with {@code toString()}.
 */
public class EntityWriteToExcel {
    /**
     * Name of the only sheet created in the workbook.
     */
    private static final String DEFAULT_SHEET_NAME = "EntityData";
    /**
     * Zero-based index of the first data row.
     */
    private static final int DEFAULT_ROW_NUMBER = 1;
    /**
     * Zero-based index of the title row.
     */
    private static final int DEFAULT_TITLE_ROW_NUMBER = 0;

    /**
     * Writes the given entities to an Excel file.
     *
     * <p>I/O failures are printed to standard error and not propagated, so a
     * normal return does not guarantee that the file was written.
     *
     * @param entities     entity objects, one per data row
     * @param titles       texts for the title row, one per column
     * @param saveFilePath path of the file to create
     */
    public static <T> void writeEntitiesToExcel(List<T> entities, List<String> titles, String saveFilePath) {
        try (Workbook workbook = new XSSFWorkbook()) {
            Sheet sheet = workbook.createSheet(DEFAULT_SHEET_NAME);
            createHeaderRow(sheet, titles);
            int rowNum = DEFAULT_ROW_NUMBER;
            for (T entity : entities) {
                createDataRow(sheet, entity, rowNum++);
            }
            writeWorkbookToFile(workbook, saveFilePath);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Creates the title row of the sheet.
     *
     * @param sheet  sheet to write into
     * @param titles texts for the title row, one per column
     */
    private static void createHeaderRow(Sheet sheet, List<String> titles) {
        Row headerRow = sheet.createRow(DEFAULT_TITLE_ROW_NUMBER);
        for (int i = 0; i < titles.size(); i++) {
            headerRow.createCell(i).setCellValue(titles.get(i));
        }
    }

    /**
     * Creates one data row and fills it with the entity's instance-field values.
     *
     * <p>Static and synthetic fields are skipped and do not consume a column,
     * so that e.g. a {@code serialVersionUID} constant or a field injected by
     * tooling cannot shift the data away from its title. A {@code null} value
     * leaves its cell empty but still occupies its column.
     *
     * <p>Column order is the order returned by {@code getDeclaredFields()};
     * the JDK does not guarantee that order, although common JVMs return
     * declaration order.
     *
     * @param sheet  sheet to write into
     * @param entity entity object whose fields are read reflectively
     * @param rowNum zero-based row index
     */
    private static void createDataRow(Sheet sheet, Object entity, int rowNum) {
        Row row = sheet.createRow(rowNum);
        int column = 0;
        for (Field field : entity.getClass().getDeclaredFields()) {
            if (field.isSynthetic() || java.lang.reflect.Modifier.isStatic(field.getModifiers())) {
                continue;
            }
            // Reserve the column before reading, so null values keep alignment.
            int cellIndex = column++;
            field.setAccessible(true);
            try {
                Object value = field.get(entity);
                if (value != null) {
                    row.createCell(cellIndex).setCellValue(value.toString());
                }
            } catch (IllegalAccessException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Writes the workbook content to the given file path.
     *
     * <p>An {@link IOException} is printed to standard error and swallowed.
     *
     * @param workbook     workbook to serialize
     * @param saveFilePath path of the file to create
     */
    private static void writeWorkbookToFile(Workbook workbook, String saveFilePath) {
        try (FileOutputStream outputStream = new FileOutputStream(saveFilePath)) {
            workbook.write(outputStream);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}
二、合并提取代码
- 基于前文完成的代码,我们来看看如何调用它们
- 旧版发票信息提取:
/**
 * Regex-based invoice extraction test: reads the invoices of one folder,
 * copies the exported fields into {@code InvoiceSubset} objects, writes them
 * to an Excel file and prints the original invoice list.
 */
@Test
void invoiceExtractorTest() {
    // Column titles, in the order the InvoiceSubset fields are exported.
    // Add "文件绝对路径" as the first title if the file path column is wanted
    // (the InvoiceSubset entity and the setter call below must be enabled too).
    String[] headerTexts = {
            "发票代码",
            "发票号码",
            "电子发票号码",
            "开票日期",
            "校验码后六位",
            "开票方名称",
            "金额",
            "税额",
            "合计"
    };
    List<String> headers = new ArrayList<>();
    for (String headerText : headerTexts) {
        headers.add(headerText);
    }

    // Folder to process; the alternatives are kept for switching by hand.
    //String sourceDir = "C:\\Users\\temp\\已用\\pdf";
    //String sourceDir = "C:\\Users\\temp\\已用\\部分问题发票";
    String sourceDir = "C:\\Users\\temp\\已用\\正常识别发票";

    // Original Invoice objects as returned by the extractor.
    List<Invoice> parsedInvoices = pdfProcessInvoicesInFolder(sourceDir);

    List<InvoiceSubset> rows = new ArrayList<>();
    for (Invoice parsed : parsedInvoices) {
        InvoiceSubset row = new InvoiceSubset();
        //row.setFileAbsolutePath(parsed.getFileAbsolutePath()); // optional file path column
        row.setCode(parsed.getCode());
        row.setNumber(parsed.getNumber());
        row.setNewNumber(null);
        row.setDate(parsed.getDate());
        row.setChecksum(parsed.getChecksum());
        row.setSellerName(parsed.getSellerName());
        row.setAmount(parsed.getAmount());
        row.setTaxAmount(parsed.getTaxAmount());
        row.setTotalAmount(parsed.getTotalAmount());
        rows.add(row);
    }

    EntityWriteToExcel.writeEntitiesToExcel(rows, headers, "output_error_invoice.xlsx");
    printListInvoice(parsedInvoices, "string");
}
/**
 * Regex-based extraction test for new-style invoices: reads the invoices of
 * one folder, copies the exported fields into {@code InvoiceSubset} objects
 * (code, number and checksum are left null), writes them to an Excel file and
 * prints the original invoice list.
 */
@Test
void newInvoiceExtractorTest() {
    // Column titles, in the order the InvoiceSubset fields are exported.
    // Add "文件绝对路径" as the first title if the file path column is wanted
    // (the InvoiceSubset entity and the setter call below must be enabled too).
    String[] headerTexts = {
            "发票代码",
            "发票号码",
            "电子发票号码",
            "开票日期",
            "校验码后六位",
            "开票方名称",
            "金额",
            "税额",
            "合计"
    };
    List<String> headers = new ArrayList<>();
    for (String headerText : headerTexts) {
        headers.add(headerText);
    }

    // Folder to process.
    String sourceDir = "C:\\Users\\temp\\已用\\新发票";

    // Original NewInvoice objects as returned by the extractor.
    List<NewInvoice> parsedInvoices = newPdfProcessInvoicesInFolder(sourceDir);

    List<InvoiceSubset> rows = new ArrayList<>();
    for (NewInvoice parsed : parsedInvoices) {
        InvoiceSubset row = new InvoiceSubset();
        //row.setFileAbsolutePath(parsed.getFileAbsolutePath()); // optional file path column
        row.setCode(null);
        row.setNumber(null);
        row.setNewNumber(parsed.getNumber());
        row.setDate(parsed.getDate());
        row.setChecksum(null);
        row.setSellerName(parsed.getSellerName());
        row.setAmount(parsed.getAmount());
        row.setTaxAmount(parsed.getTaxAmount());
        row.setTotalAmount(parsed.getTotalAmount());
        rows.add(row);
    }

    EntityWriteToExcel.writeEntitiesToExcel(rows, headers, "output_new_invoice.xlsx");
    newPrintListInvoice(parsedInvoices, "string");
}
- ofd格式全电发票,需要先将ofd转换成pdf格式的全电发票,然后再调用旧版发票信息提取代码即可(因为目前ofd还是以旧版全电发票样式为主):
/**
 * Converts every {@code .ofd} file directly inside {@code dirPath} to a PDF
 * with the same base name in the same folder, then deletes the OFD source.
 *
 * <p>Sub-folders are not visited. A file whose conversion throws an
 * {@link IOException} is logged and left in place; the remaining files are
 * still processed. Nothing happens when {@code dirPath} cannot be listed.
 *
 * @param dirPath folder that contains the OFD invoices
 */
private static void ofdToPdfConverter(String dirPath){
    // listFiles() returns null when the path is not a directory or cannot be read.
    File folder = new File(dirPath);
    File[] files = folder.listFiles();
    if (files == null) {
        return;
    }
    for (File file : files) {
        if (!file.isFile() || !file.getName().toLowerCase().endsWith(".ofd")) {
            continue;
        }
        String absolutePath = file.getAbsolutePath();
        Path ofdPath = Paths.get(absolutePath);
        // Same location and base name, extension replaced by ".pdf".
        Path pdfPath = Paths.get(absolutePath.substring(0, absolutePath.lastIndexOf(".")) + ".pdf");
        try (OFDExporter exporter = new PDFExporterPDFBox(ofdPath, pdfPath)) {
            exporter.export();
            // File.delete() returns false on failure, so warn only in that case.
            if (!file.delete()) {
                log.warn("OFD文件删除时出现异常~");
            }
        } catch (IOException e) {
            // Pass the exception along so the cause is not lost.
            log.error("ofd转换失败~", e);
        }
    }
}
- 完成了上述操作,那么我们可以将它们进行合并,实现一行代码提取发票信息:
import com.xjl.common.utils.excel.util.EntityWriteToExcel;
import com.xjl.common.utils.invoice.extractor.util.NewInvoiceExtractor;
import com.xjl.common.utils.invoice.extractor.util.invoice.extractor.dependent.entity.Invoice;
import com.xjl.common.utils.invoice.extractor.util.invoice.extractor.dependent.entity.InvoiceSubset;
import com.xjl.common.utils.invoice.extractor.util.invoice.extractor.dependent.entity.NewInvoice;
import lombok.extern.slf4j.Slf4j;
import org.ofdrw.converter.export.OFDExporter;
import org.ofdrw.converter.export.PDFExporterPDFBox;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import static com.xjl.common.utils.invoice.extractor.util.InvoiceExtractor.pdfProcessInvoicesInFolder;
@Slf4j
public class InvoiceMergeExtraction {
/**
* 提取发票信息
*/
public static List<InvoiceSubset> extractInvoiceInformation(String dirPath, String outputPath) {
ofdToPdfConverter(dirPath);
List<Invoice> invoices = pdfProcessInvoicesInFolder(dirPath);
List<InvoiceSubset> invoiceSubsets = new ArrayList<>();
for (Invoice invoice : invoices) {
InvoiceSubset subset = createInvoiceSubsetFromInvoice(invoice);
if (subset.getCode() == null) {
NewInvoice newInvoice = NewInvoiceExtractor.newPdfProcessInvoicesInFile(invoice.getFileAbsolutePath());
InvoiceSubset newInvoiceSubset = extractCodeFromNewMethod(newInvoice);
if (newInvoiceSubset.getNewNumber() != null) {
subset = newInvoiceSubset;
} else {
log.error("发票信息异常的文件:{}", invoice.getFileAbsolutePath());
continue;
}
}
invoiceSubsets.add(subset);
}
excelFileWriting(invoiceSubsets, outputPath);
return invoiceSubsets;
}
/**
* ofd格式发票转换pdf格式方法
*/
private static void ofdToPdfConverter(String dirPath){
File folder = new File(dirPath);
File[] files = folder.listFiles();
if (files != null){
for (File file : files) {
if (file.isFile() && file.getName().toLowerCase().endsWith(".ofd")){
String absolutePath = file.getAbsolutePath();
Path ofdPath = Paths.get(absolutePath);
Path pdfPath = Paths.get(absolutePath.substring(0, absolutePath.lastIndexOf(".")) + ".pdf");
try (OFDExporter exporter = new PDFExporterPDFBox(ofdPath, pdfPath)) {
exporter.export();
if (file.delete()){ log.warn("OFD文件删除时出现异常~"); }
} catch (IOException e) {
log.error("ofd转换失败~");
}
}
}
}
}
/**
* 旧版发票
*/
private static InvoiceSubset createInvoiceSubsetFromInvoice(Invoice invoice) {
InvoiceSubset subset = new InvoiceSubset();
//subset.setFileAbsolutePath(newInvoice.getFileAbsolutePath()); // 保存文件绝对路径,如果不需要则自行注释代码(注意需要同时注释实体类InvoiceSubset)
subset.setCode(invoice.getCode());
subset.setNumber(invoice.getNumber());
subset.setNewNumber(null);
subset.setDate(invoice.getDate());
subset.setChecksum(invoice.getChecksum());
subset.setSellerName(invoice.getSellerName());
subset.setAmount(invoice.getAmount());
subset.setTaxAmount(invoice.getTaxAmount());
subset.setTotalAmount(invoice.getTotalAmount());
return subset;
}
/**
* 新版发票
*/
private static InvoiceSubset extractCodeFromNewMethod(NewInvoice newInvoice) {
InvoiceSubset subset = new InvoiceSubset();
//subset.setFileAbsolutePath(newInvoice.getFileAbsolutePath()); // 保存文件绝对路径,如果不需要则自行注释代码(注意需要同时注释实体类InvoiceSubset)
subset.setCode(null);
subset.setNumber(null);
subset.setNewNumber(newInvoice.getNumber());
subset.setDate(newInvoice.getDate());
subset.setChecksum(null);
subset.setSellerName(newInvoice.getSellerName());
subset.setAmount(newInvoice.getAmount());
subset.setTaxAmount(newInvoice.getTaxAmount());
subset.setTotalAmount(newInvoice.getTotalAmount());
return subset;
}
/**
* 写入Excel
*/
private static void excelFileWriting(List<InvoiceSubset> invoiceSubsets, String outputPath){
List<String> titles = new ArrayList<>();
//titles.add("文件绝对路径"); // 保存文件绝对路径,如果不需要则自行注释代码(注意需要同时注释实体类InvoiceSubset)
titles.add("发票代码");
titles.add("发票号码");
titles.add("电子发票号码");
titles.add("开票日期");
titles.add("校验码后六位");
titles.add("开票方名称");
titles.add("金额");
titles.add("税额");
titles.add("合计");
try {
EntityWriteToExcel.writeEntitiesToExcel(invoiceSubsets, titles, outputPath);
} catch (Exception e) {
log.error("写入excel文件时出现错误");
}
}
}
// Usage example: extract every invoice in the folder and write the result to an Excel file.
extractInvoiceInformation("C:\\Users\\temp\\已用\\所有类型发票", "output_test_invoice.xlsx");