1. 之前做过一些 PDF 发票的解析。PDF 发票的类型比较多,来源也很多:有正常生成的,有图片转成 PDF 的,也有由其他类型文件转换而来的。这里记录一下之前开发的一些代码片段。
2. 在 pom.xml 中引入依赖(注意:下文 ImageToPdf 导入的 com.itextpdf.text.* 属于 iText 5 的 API,对应的构件是 com.itextpdf:itextpdf;itext7-core 中并不包含这个包,使用时请按实际依赖调整):
<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>3.0.2</version>
</dependency>
<dependency>
    <groupId>com.itextpdf</groupId>
    <artifactId>itext7-core</artifactId>
    <version>7.1.1</version>
    <type>pom</type>
</dependency>
3. PDF 和图片的相互转换
import com.alibaba.fastjson.JSONObject;
import com.itextpdf.text.Document;
import com.itextpdf.text.DocumentException;
import com.itextpdf.text.Image;
import com.itextpdf.text.PageSize;
import com.itextpdf.text.pdf.PdfWriter;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageContentStream;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.pdmodel.graphics.image.LosslessFactory;
import org.apache.pdfbox.pdmodel.graphics.image.PDImageXObject;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.ArrayList;
/**
 * Builds a single PDF file out of one or more image files.
 *
 * <p>Two implementations are provided: {@link #PdfBox} (PDFBox, one page per image sized to the
 * image) and {@link #Pdf} (iText, images scaled onto A4 pages).
 */
public class ImageToPdf {

    /** Demo entry point: merges a few hard-coded images into one PDF. */
    public static void main(String[] args) {
        imageToPdf("E:\\Desktop\\pdf\\bb.pdf",
                "E:\\Desktop\\99.jpg,E:\\Desktop\\3_2.jpg,E:\\Desktop\\6.jpg,E:\\Desktop\\测试2.jpg");
        System.out.println("200!ok");
    }

    /**
     * Merges the given images into a PDF written to {@code filepath}.
     *
     * @param filepath path of the PDF file to create
     * @param imgUrl   image file paths; several paths are separated by {@code ","}
     * @return a JSON object with {@code code} and {@code msg}; on success it also carries
     *         {@code fileSize} (the PDF length in bytes, as a string)
     */
    public static JSONObject imageToPdf(String filepath, String imgUrl) {
        JSONObject returnResult = new JSONObject();
        if (filepath == null || imgUrl == null) {
            returnResult.put("code", ResultCode.ERROR.getCode());
            returnResult.put("msg", "PDF合成失败");
            return returnResult;
        }

        ArrayList<String> imageUrllist = new ArrayList<String>();
        for (String url : imgUrl.split(",")) {
            imageUrllist.add(url);
        }

        // PdfBox returns null when the PDF could not be produced; report that as an error
        // instead of dereferencing the null result.
        File file = PdfBox(imageUrllist, filepath);
        if (file == null) {
            returnResult.put("code", ResultCode.ERROR.getCode());
            returnResult.put("msg", "PDF合成失败");
            return returnResult;
        }

        returnResult.put("code", ResultCode.SUCCESS.getCode());
        returnResult.put("msg", "PDF合成成功");
        returnResult.put("fileSize", String.valueOf(file.length()));
        return returnResult;
    }

    /**
     * Merges several images into one PDF using PDFBox; every image becomes one page whose size
     * equals the image's pixel size.
     *
     * @param imageUrllist       paths of the image files, in page order
     * @param mOutputPdfFileName path of the PDF file to write
     * @return the written PDF file, or {@code null} if any step failed (the failure is printed
     *         to stderr)
     */
    public static File PdfBox(ArrayList<String> imageUrllist, String mOutputPdfFileName) {
        boolean saved = false;
        try (PDDocument document = new PDDocument()) {
            for (String imagePath : imageUrllist) {
                BufferedImage bitmap = ImageIO.read(new File(imagePath));
                if (bitmap == null) {
                    // ImageIO.read yields null when no registered reader understands the file.
                    throw new IOException("Unsupported or unreadable image: " + imagePath);
                }
                int width = bitmap.getWidth();
                int height = bitmap.getHeight();
                PDPage page = new PDPage(new PDRectangle(width, height));
                document.addPage(page);
                PDImageXObject image = LosslessFactory.createFromImage(document, bitmap);
                try (PDPageContentStream contentStream = new PDPageContentStream(
                        document, page, PDPageContentStream.AppendMode.APPEND, false)) {
                    contentStream.drawImage(image, 0, 0, width, height);
                }
            }
            document.save(mOutputPdfFileName);
            saved = true;
        } catch (Exception e) {
            e.printStackTrace();
        }

        // Only report a file that this call actually wrote; a leftover file from an earlier run
        // must not be mistaken for a successful result.
        File mOutputPdfFile = new File(mOutputPdfFileName);
        if (!saved || !mOutputPdfFile.exists()) {
            return null;
        }
        return mOutputPdfFile;
    }

    /**
     * Merges several images into one PDF using iText; images are scaled with
     * {@link #getPercent(float, float)} and added to an A4 document without margins.
     *
     * @param imageUrllist       paths of the image files, in order
     * @param mOutputPdfFileName path of the PDF file to write
     * @return the written PDF file, or {@code null} if writing failed (the failure is printed
     *         to stderr)
     */
    public static File Pdf(ArrayList<String> imageUrllist, String mOutputPdfFileName) {
        Document doc = new Document(PageSize.A4, 0, 0, 0, 0);
        boolean written = false;
        // try-with-resources guarantees the output stream is released even when an image
        // cannot be loaded halfway through.
        try (FileOutputStream out = new FileOutputStream(mOutputPdfFileName)) {
            PdfWriter.getInstance(doc, out);
            doc.open();
            for (int i = 0; i < imageUrllist.size(); i++) {
                // NOTE(review): newPage() is only requested before the first image; later images
                // rely on the document's own pagination - confirm whether one image per page
                // was intended.
                if (i == 0) {
                    doc.newPage();
                }
                Image png1 = Image.getInstance(imageUrllist.get(i));
                float heigth = png1.getHeight();
                float width = png1.getWidth();
                int percent = getPercent(heigth, width);
                png1.setAlignment(Image.MIDDLE);
                // Scale relative to the original image size.
                png1.scalePercent(percent + 3);
                doc.add(png1);
            }
            doc.close();
            written = true;
        } catch (DocumentException | IOException e) {
            e.printStackTrace();
        }

        File mOutputPdfFile = new File(mOutputPdfFileName);
        if (!written || !mOutputPdfFile.exists()) {
            return null;
        }
        return mOutputPdfFile;
    }

    /**
     * Computes a scale percentage: {@code 600 / h * 100} when the image is taller than wide,
     * otherwise {@code 530 / w * 100}, rounded to the nearest integer.
     *
     * @param h image height
     * @param w image width
     * @return the rounded scale percentage
     */
    public static int getPercent(float h, float w) {
        float percent = (h > w) ? 600 / h * 100 : 530 / w * 100;
        return Math.round(percent);
    }

    /**
     * Computes a width-only scale percentage, {@code 530 / w * 100}, rounded to the nearest
     * integer.
     *
     * @param h image height (not used by the calculation)
     * @param w image width
     * @return the rounded scale percentage
     */
    public static int getPercent2(float h, float w) {
        return Math.round(530 / w * 100);
    }
}
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import net.coobird.thumbnailator.Thumbnails;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.rendering.PDFRenderer;
import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
/**
 * Renders pages of a PDF file to image files with PDFBox's {@code PDFRenderer} and, for each
 * rendered page, also writes a thumbnail.
 *
 * <p>Every method opens the PDF in a try-with-resources block so the document is closed when the
 * method returns, and failures are printed to stderr while whatever was produced so far is
 * returned.
 */
public class PdfToImage {

    /** Small demo of stripping the extension from a file name. */
    public static void main(String[] args) throws IOException {
        System.out.println(stripExtension("1.AAAA.pdf"));
    }

    /**
     * Disabled variant: extracted every image embedded in the PDF pages and saved each one.
     * Kept for reference only.
     **/
    /* public static JSONArray pdfToImage(String fileFloder, String file_id, String filename, int indexOfStart, String type) throws IOException {
    filename =filename.substring(0, filename.lastIndexOf("."));
    JSONArray returnObj = new JSONArray();
    PdfDocument pdf = new PdfDocument();
    pdf.loadFromFile(fileFloder+file_id+"_0");
    // fetch the content page by page
    PdfPageCollection pages = pdf.getPages();
    // the current page object
    PdfPageBase page;
    int j=1;
    if (pages.getCount() > 0) {
    for (int i = 0; i < pages.getCount(); i++) {
    page = pages.get(i);
    BufferedImage[] bufferedImages = page.extractImages(true);
    if (ObjectUtils.isNotEmpty(bufferedImages)) {
    for (BufferedImage bufferedImage : bufferedImages) {
    String imageName=filename+"_"+j+"."+type;
    String newFileId = TecrunUtils.createFileId();
    String imagePath=fileFloder+newFileId+"_0";
    String thumbImagePath=fileFloder+newFileId+"_1.jpg";
    ImageIO.write(bufferedImage, type, new File(imagePath));
    ImageIO.write(bufferedImage, type, new File(thumbImagePath));
    JSONObject imageObj =new JSONObject();
    imageObj.put("fileName",imageName);
    imageObj.put("filePath",imagePath);
    imageObj.put("fileId",newFileId);
    returnObj.add(imageObj);
    j++;
    }
    }
    }
    }
    return returnObj;
    }*/

    /**
     * Renders the pages of the PDF stored at {@code fileFloder + file_id + "_0"}, starting at
     * {@code indexOfStart}, to image files in the same folder.
     *
     * @param fileFloder   folder prefix that is concatenated directly with the file ids
     * @param file_id      id of the source PDF
     * @param filename     original file name; its extension is stripped to build image names
     * @param indexOfStart zero-based index of the first page to render
     * @param type         image format name handed to {@code ImageIO.write}
     * @return one entry per rendered page with {@code imageName}, {@code imagePath},
     *         {@code fileId} and an empty {@code fileSize}
     **/
    public static JSONArray pdfToImageByPage(String fileFloder, String file_id, String filename, int indexOfStart, String type) {
        String baseName = stripExtension(filename);
        JSONArray returnObj = new JSONArray();
        File file = new File(fileFloder + file_id + "_0");
        try (PDDocument doc = Loader.loadPDF(file)) {
            PDFRenderer renderer = new PDFRenderer(doc);
            int pageCount = doc.getNumberOfPages();
            // Image names are numbered from 1 regardless of the starting page.
            int sequence = 1;
            for (int i = indexOfStart; i < pageCount; i++) {
                BufferedImage image = renderer.renderImageWithDPI(i, 144);
                String imageName = baseName + "_" + sequence + "." + type;
                String newFileId = TecrunUtils.createFileId();
                String imagePath = fileFloder + newFileId + "_0";
                String thumbImagePath = fileFloder + newFileId + "_1.jpg";
                ImageIO.write(image, type, new File(imagePath));
                ThumbUtil.zoom(imagePath, thumbImagePath, "png");
                JSONObject imageObj = new JSONObject();
                imageObj.put("imageName", imageName);
                imageObj.put("imagePath", imagePath);
                imageObj.put("fileId", newFileId);
                imageObj.put("fileSize", "");
                returnObj.add(imageObj);
                sequence++;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return returnObj;
    }

    /**
     * Renders every page of the PDF to an image plus thumbnail.
     *
     * @param rootPath root path handed to {@code TecrunUtils.getFilePathFromBatchIdAndFileId}
     * @param file_id  id of the source PDF
     * @param batchId  batch id used to resolve file paths
     * @param filename original file name; its extension is stripped to build image names
     * @param type     suffix used in the reported file name and {@code file_suffix}
     * @return one entry per rendered page (see {@link #renderPageToFiles})
     */
    public static JSONArray PdfToImageByPage(String rootPath, String file_id, String batchId, String filename, String type) {
        return renderFrom(rootPath, file_id, batchId, filename, type, 0);
    }

    /**
     * Renders only the first page of the PDF to an image plus thumbnail.
     *
     * @return the entry describing the rendered page, or an empty object if rendering failed
     */
    public static JSONObject PdfToImageByPageOne(String rootPath, String file_id, String batchId, String filename, String type) {
        return renderSingle(rootPath, file_id, batchId, filename, type, 0, 144);
    }

    /**
     * Renders the pages of the PDF from zero-based page {@code Index} to the last page.
     *
     * @param Index zero-based index of the first page to render
     * @return one entry per rendered page (see {@link #renderPageToFiles})
     */
    public static JSONArray pdfToImageByPageByIndex(String rootPath, String file_id, String batchId, String filename, String type,int Index) {
        return renderFrom(rootPath, file_id, batchId, filename, type, Index);
    }

    /**
     * Renders the single zero-based page {@code PageNum} of the PDF at 200 DPI.
     *
     * @param PageNum zero-based index of the page to render
     * @return the entry describing the rendered page, or an empty object if rendering failed
     */
    public static JSONObject PdfToImageByPageNum(String rootPath, String file_id, String batchId, String filename, String type,int PageNum) {
        return renderSingle(rootPath, file_id, batchId, filename, type, PageNum, 200);
    }

    /** Renders pages {@code firstPage..last} at 144 DPI; keeps partial results on failure. */
    private static JSONArray renderFrom(String rootPath, String file_id, String batchId, String filename, String type, int firstPage) {
        String baseName = stripExtension(filename);
        JSONArray returnObj = new JSONArray();
        String filepath = TecrunUtils.getFilePathFromBatchIdAndFileId(rootPath, batchId, file_id, 0);
        try (PDDocument doc = Loader.loadPDF(new File(filepath))) {
            PDFRenderer renderer = new PDFRenderer(doc);
            int pageCount = doc.getNumberOfPages();
            for (int i = firstPage; i < pageCount; i++) {
                returnObj.add(renderPageToFiles(renderer, i, 144, rootPath, batchId, baseName, type));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return returnObj;
    }

    /** Renders one page at the given DPI; returns an empty object on failure. */
    private static JSONObject renderSingle(String rootPath, String file_id, String batchId, String filename, String type, int pageIndex, float dpi) {
        String baseName = stripExtension(filename);
        String filepath = TecrunUtils.getFilePathFromBatchIdAndFileId(rootPath, batchId, file_id, 0);
        try (PDDocument doc = Loader.loadPDF(new File(filepath))) {
            return renderPageToFiles(new PDFRenderer(doc), pageIndex, dpi, rootPath, batchId, baseName, type);
        } catch (Exception e) {
            e.printStackTrace();
            return new JSONObject();
        }
    }

    /**
     * Renders one page, stores it as a JPEG file, writes a thumbnail bounded by 200x200 and
     * describes the result.
     *
     * <p>NOTE(review): the image is always encoded as JPEG while {@code type} is only used for
     * the reported name and suffix - confirm that callers expect this. The trailing 0 / 1
     * argument of {@code getFilePathFromBatchIdAndFileId} presumably selects the full image vs.
     * the thumbnail path - verify against TecrunUtils.
     *
     * @return an object with {@code file_id}, {@code imagePath}, {@code thumbImagePath},
     *         {@code file_name}, {@code file_size} and {@code file_suffix}
     */
    private static JSONObject renderPageToFiles(PDFRenderer renderer, int pageIndex, float dpi,
            String rootPath, String batchId, String baseName, String type) throws IOException {
        BufferedImage image = renderer.renderImageWithDPI(pageIndex, dpi);
        String newFileId = TecrunUtils.createFileId();
        String imageName = baseName + "_" + pageIndex + "." + type;
        String imagePath = TecrunUtils.getFilePathFromBatchIdAndFileId(rootPath, batchId, newFileId, 0);
        String thumbImagePath = TecrunUtils.getFilePathFromBatchIdAndFileId(rootPath, batchId, newFileId, 1);
        ImageIO.write(image, "JPEG", new File(imagePath));
        Thumbnails.of(imagePath).size(200, 200)
                .outputFormat("jpg").toFile(thumbImagePath);

        JSONObject ofdimage = new JSONObject();
        ofdimage.put("file_id", newFileId);
        ofdimage.put("imagePath", imagePath);
        ofdimage.put("thumbImagePath", thumbImagePath);
        ofdimage.put("file_name", imageName);
        ofdimage.put("file_size", new File(imagePath).length());
        ofdimage.put("file_suffix", type);
        return ofdimage;
    }

    /** Returns {@code filename} without its last extension; names without a dot are returned as is. */
    private static String stripExtension(String filename) {
        int dot = filename.lastIndexOf(".");
        return dot < 0 ? filename : filename.substring(0, dot);
    }
}
4.pdf文件解析
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Text stripper that records the coordinates at which given keywords occur on the first page of
 * a document.
 *
 * <p>A keyword matches when consecutive text positions of one {@code writeString} call start
 * with the keyword's characters, one character per text position. The recorded coordinate is
 * that of the text position holding the keyword's first character.
 */
public class PDFKeyWordPosition extends PDFTextStripper {
    private List<String> keywordList;
    private Map<String, List<Position>> positionListMap;

    public PDFKeyWordPosition() throws IOException {
        super();
    }

    /**
     * Scans page 1 of {@code document} (start and end page are both fixed to 1) and collects the
     * positions of every keyword.
     *
     * @param keywordList keywords to look for
     * @param document    document to scan
     * @return a map from each keyword to the positions found; every keyword of
     *         {@code keywordList} has an entry, which is an empty list when nothing matched
     * @throws IOException if the text extraction fails
     */
    public Map<String, List<Position>> getCoordinate(List<String> keywordList, PDDocument document) throws IOException {
        super.setSortByPosition(true);
        this.keywordList = keywordList;
        this.positionListMap = new HashMap<>();
        // Register every keyword up front so callers get an empty list, not null, when the page
        // yields no text at all.
        for (String keyword : keywordList) {
            positionListMap.computeIfAbsent(keyword, k -> new ArrayList<>());
        }
        super.setStartPage(1);
        super.setEndPage(1);
        // The extracted text itself is not needed; only the writeString callbacks matter.
        try (Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream())) {
            super.writeText(document, dummy);
        }
        return positionListMap;
    }

    /**
     * Searches the given run of text positions for every keyword and records each occurrence.
     * The text itself is not forwarded to the output writer.
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
        for (String keyword : keywordList) {
            if (keyword == null || keyword.isEmpty()) {
                continue;
            }
            List<Position> positionList = positionListMap.computeIfAbsent(keyword, k -> new ArrayList<>());
            int lastStart = textPositions.size() - keyword.length();
            // Every start index is tested independently, so a failed partial match cannot
            // influence later candidates.
            for (int i = 0; i <= lastStart; i++) {
                if (matchesAt(textPositions, i, keyword)) {
                    TextPosition first = textPositions.get(i);
                    positionList.add(new Position(first.getX(), first.getY()));
                }
            }
        }
    }

    /** Tells whether the text positions starting at {@code start} spell out {@code keyword}. */
    private static boolean matchesAt(List<TextPosition> textPositions, int start, String keyword) {
        for (int k = 0; k < keyword.length(); k++) {
            String glyph = textPositions.get(start + k).getUnicode();
            if (glyph == null || glyph.isEmpty() || glyph.charAt(0) != keyword.charAt(k)) {
                return false;
            }
        }
        return true;
    }
}
class Position {
public Position() {
}
public Position(float x, float y) {
super();
this.x = x;
this.y = y;
}
float x;
float y;
/**
* @return the x
*/
public float getX() {
return x;
}
/**
* @param x
* the x to set
*/
public void setX(float x) {
this.x = x;
}
/**
* @return the y
*/
public float getY() {
return y;
}
/**
* @param y
* the y to set
*/
public void setY(float y) {
this.y = y;
}
@Override
public String toString() {
return "Position [x=" + x + ", y=" + y + "]";
}
}
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * Text stripper that records the coordinates at which given keywords occur on the first page of
 * a document.
 *
 * <p>A keyword matches when consecutive text positions of one {@code writeString} call start
 * with the keyword's characters, one character per text position. The recorded coordinate is
 * that of the text position holding the keyword's first character.
 */
public class PDFKeyWordPosition extends PDFTextStripper {
    private List<String> keywordList;
    private Map<String, List<Position>> positionListMap;

    public PDFKeyWordPosition() throws IOException {
        super();
    }

    /**
     * Scans page 1 of {@code document} (start and end page are both fixed to 1) and collects the
     * positions of every keyword.
     *
     * @param keywordList keywords to look for
     * @param document    document to scan
     * @return a map from each keyword to the positions found; every keyword of
     *         {@code keywordList} has an entry, which is an empty list when nothing matched
     * @throws IOException if the text extraction fails
     */
    public Map<String, List<Position>> getCoordinate(List<String> keywordList, PDDocument document) throws IOException {
        super.setSortByPosition(true);
        this.keywordList = keywordList;
        this.positionListMap = new HashMap<>();
        // Register every keyword up front so callers get an empty list, not null, when the page
        // yields no text at all.
        for (String keyword : keywordList) {
            positionListMap.computeIfAbsent(keyword, k -> new ArrayList<>());
        }
        super.setStartPage(1);
        super.setEndPage(1);
        // The extracted text itself is not needed; only the writeString callbacks matter.
        try (Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream())) {
            super.writeText(document, dummy);
        }
        return positionListMap;
    }

    /**
     * Searches the given run of text positions for every keyword and records each occurrence.
     * The text itself is not forwarded to the output writer.
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
        for (String keyword : keywordList) {
            if (keyword == null || keyword.isEmpty()) {
                continue;
            }
            List<Position> positionList = positionListMap.computeIfAbsent(keyword, k -> new ArrayList<>());
            int lastStart = textPositions.size() - keyword.length();
            // Every start index is tested independently, so a failed partial match cannot
            // influence later candidates.
            for (int i = 0; i <= lastStart; i++) {
                if (matchesAt(textPositions, i, keyword)) {
                    TextPosition first = textPositions.get(i);
                    positionList.add(new Position(first.getX(), first.getY()));
                }
            }
        }
    }

    /** Tells whether the text positions starting at {@code start} spell out {@code keyword}. */
    private static boolean matchesAt(List<TextPosition> textPositions, int start, String keyword) {
        for (int k = 0; k < keyword.length(); k++) {
            String glyph = textPositions.get(start + k).getUnicode();
            if (glyph == null || glyph.isEmpty() || glyph.charAt(0) != keyword.charAt(k)) {
                return false;
            }
        }
        return true;
    }
}
class Position {
public Position() {
}
public Position(float x, float y) {
super();
this.x = x;
this.y = y;
}
float x;
float y;
/**
* @return the x
*/
public float getX() {
return x;
}
/**
* @param x
* the x to set
*/
public void setX(float x) {
this.x = x;
}
/**
* @return the y
*/
public float getY() {
return y;
}
/**
* @param y
* the y to set
*/
public void setY(float y) {
this.y = y;
}
@Override
public String toString() {
return "Position [x=" + x + ", y=" + y + "]";
}
}
import org.apache.commons.lang3.StringUtils;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.PDFTextStripperByArea;
import java.awt.*;
import java.io.File;
import java.io.IOException;
import java.math.BigDecimal;
import java.util.*;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
* 专用于处理电子发票识别的类
*
* @author arthurlee
*
*/
public class PdfInvoiceExtractor {
/**
 * Parses every page of a multi-page PDF, producing one invoice per page.
 *
 * <p>The document is opened here only to read the page count and is closed again before the
 * pages are parsed; each page is then handed to {@code extracts}, which loads the file itself.
 *
 * @param file the PDF file to parse
 * @return the invoices in page order
 * @throws IOException if the file cannot be loaded or a page cannot be parsed
 */
public static List<Invoice> extractList(File file) throws IOException{
    int pageCount;
    try (PDDocument doc = Loader.loadPDF(file)) {
        pageCount = doc.getNumberOfPages();
    }
    ArrayList<Invoice> invoices = new ArrayList<>(pageCount);
    // extracts expects 1-based page numbers.
    for (int pageNum = 1; pageNum <= pageCount; pageNum++) {
        invoices.add(extracts(pageNum, file));
    }
    return invoices;
}
/**
* 解析pdf单页
* @param pageNum 页码(从1开始)
* @param file 文件地址
* @return
* @throws IOException
*/
public static Invoice extracts(int pageNum,File file) throws IOException{
Invoice invoice = new Invoice();
PDDocument doc = Loader.loadPDF(file);
PDPage firstPage = doc.getPage(pageNum-1);
int pageWidth = Math.round(firstPage.getCropBox().getWidth());
PDFTextStripper textStripper = new PDFTextStripper();
textStripper.setStartPage(pageNum);
textStripper.setEndPage(pageNum);
textStripper.setSortByPosition(true);
String fullText = textStripper.getText(doc);
if (firstPage.getRotation() != 0) {
pageWidth = Math.round(firstPage.getCropBox().getHeight());
}
String allText = replace(fullText).replaceAll("(", "(").replaceAll(")", ")").replaceAll("¥", "¥");
Pattern type = Pattern.compile("(?<p>\\S*)电.发票");
Matcher matcher1 = type.matcher(allText);
Pattern typedm = Pattern.compile("(?<p>\\S*)发票代码");
Matcher matcherdm = typedm.matcher(allText);
if (matcher1.find() && !matcherdm.find()) {电子发票
invoice.setTitle(matcher1.group());
{
Pattern type00Pattern = Pattern.compile("(?<p>\\S*)通发票");
Matcher m00 = type00Pattern.matcher(allText);
if (m00.find()) {
invoice.setTitle(m00.group("p")+ "通发票");
if (null == invoice.getType()) {
invoice.setType("普通发票");
invoice.setFormat("302");
}
} else {
Pattern type01Pattern = Pattern.compile("(?<p>\\S*)用发票");
Matcher m01 = type01Pattern.matcher(allText);
if (m01.find()) {
invoice.setTitle(m01.group("p")+ "用发票");
if (null == invoice.getType()) {
invoice.setType("专用发票");
invoice.setFormat("301");
}
}
}
}
{
String reg = "发票号码:(?<number>\\d{20})|:(?<date>\\d{4}年\\d{2}月\\d{2}日)";
Pattern pattern = Pattern.compile(reg);
Matcher matcher = pattern.matcher(allText);
while (matcher.find()) {
if (matcher.group("number") != null) {
invoice.setNumber(matcher.group("number"));
} else if (matcher.group("date") != null) {
invoice.setDate(matcher.group("date"));
}
}
}
{
// String reg = "合计¥?(?<amount>[^ \\f\\n\\r\\t\\v\\*]\\S*)(¥?(?<taxAmount>\\S*)|\\*+)\\s"; ///合计¥245.28¥14.72
String reg = "合计¥?(?<amount>[^ \\f\\n\\r\\t\\v\\*]\\S*)¥";
String reg1 = "合计¥[\\s\\S]*?¥(?<taxAmount>\\S*)";
Pattern pattern = Pattern.compile(reg);
Matcher matcher = pattern.matcher(allText);
Pattern pattern1 = Pattern.compile(reg1);
Matcher matcher2 = pattern1.matcher(allText);
if (matcher.find()) {
try {
invoice.setAmount(matcher.group("amount"));
} catch (Exception e) {
}
}
if (matcher2.find()) {
try {
invoice.setTaxAmount(matcher2.group("taxAmount"));
} catch (Exception e) {
invoice.setTaxAmount("0");
}
}
}
if (null == invoice.getAmount()) {
String reg = "合\\u0020*计\\u0020*¥?(?<amount>[^ ]*)\\u0020+¥?(?:(?<taxAmount>\\S*)|\\*+)\\s";
Pattern pattern = Pattern.compile(reg);
Matcher matcher = pattern.matcher(fullText);
if (matcher.find()) {
try {
invoice.setAmount(matcher.group("amount"));
} catch (Exception e) {
invoice.setAmount("0");
}
try {
invoice.setTaxAmount(matcher.group("taxAmount"));
} catch (Exception e) {
invoice.setTaxAmount("0");
}
}
}
{
String reg = "价税合计\\u0028大写\\u0029(?<amountString>\\S*)\\u0028小写\\u0029¥?(?<amount>\\S*)\\s";
Pattern pattern = Pattern.compile(reg);
Matcher matcher = pattern.matcher(allText);
if (matcher.find()) {
invoice.setTotalAmountString(matcher.group("amountString"));
try {
invoice.setTotalAmount(matcher.group("amount"));
} catch (Exception e) {
invoice.setTotalAmount("0");
}
}
if (StringUtils.isEmpty(invoice.getTotalAmount())) {
//
String regx = "价税合计\\u0028⼤写\\u0029(?<amountString>\\S*)\\u0028小写\\u0029¥?(?<amount>\\S*)\\s";
Pattern patternx = Pattern.compile(regx);
Matcher matcherx = patternx.matcher(allText);
if (matcherx.find()) {
invoice.setTotalAmountString(matcherx.group("amountString"));
try {
invoice.setTotalAmount(matcherx.group("amount"));
} catch (Exception e) {
invoice.setTotalAmount("0");
}
}
}
}
{
String reg = "收款人:(?<payee>);\\S*";
String reg1 = "复核:(?<reviewer>\\S*)|复核人:(?<reviewer1>\\S*)";
String reg2 = "开票人:(?<drawer>\\S*)";
Pattern pattern = Pattern.compile(reg);
Matcher matcher = pattern.matcher(allText);
if (matcher.find()) {
invoice.setPayee(matcher.group("payee"));
}
Pattern pattern1 = Pattern.compile(reg1);
Matcher matchere = pattern1.matcher(allText);
if (matchere.find()) {
invoice.setReviewer(TecrunUtils.nullToStr(matchere.group("reviewer")) +TecrunUtils.nullToStr(matchere.group("reviewer1")));
}
Pattern pattern2 = Pattern.compile(reg2);
Matcher matcher2 = pattern2.matcher(allText);
if (matcher2.find()) {
invoice.setDrawer(matcher2.group("drawer"));
}
if (allText.indexOf("通行费") > 0 && allText.indexOf("车牌号") > 0) {
invoice.setType("通行费");
}
}
{
PDFKeyWordPosition kwp = new PDFKeyWordPosition();
Map<String, List<Position>> positionListMap = kwp
.getCoordinate(Arrays.asList("机器编号", "税率", "价税合计", "合计", "开票日期","项目名称", "规格型号", "车牌号", "开户行及账号","统一社会信用代码","税额","单位", "密", "码", "区"), doc);
PDFTextStripperByArea stripper = new PDFTextStripperByArea();
stripper.setSortByPosition(true);
PDFTextStripperByArea detailStripper = new PDFTextStripperByArea();
detailStripper.setSortByPosition(true);
{
Position machineNumber;
if (positionListMap.get("机器编号").size() > 0) {
machineNumber = positionListMap.get("机器编号").get(0);
} else {
machineNumber = positionListMap.get("开票日期").get(0);
machineNumber.setY(machineNumber.getY() + 30);
}
Position taxRate = positionListMap.get("税率").get(0);
Position taxRateAmount = null;
if (CollectionUtils.isNotEmpty(positionListMap.get("税额"))) {
taxRateAmount = positionListMap.get("税额").get(0);
}
Position totalAmount = positionListMap.get("价税合计").get(0);
Position amount = positionListMap.get("合计").get(0);
Position model = null;
if (!positionListMap.get("项目名称").isEmpty()) {
model = positionListMap.get("项目名称").get(0);
} else {
model = positionListMap.get("车牌号").get(0);
model.setX(model.getX() - 15);
}
List<Position> account = positionListMap.get("统一社会信用代码");
Position buyer;
Position seller;
if (account.size() < 2) {
buyer = new Position(51, 122);
seller = new Position(351, 122);
} else {
buyer = account.get(0);
seller = account.get(1);
}
/*int maqX = 370;
List<Position> mi = positionListMap.get("密");
List<Position> ma = positionListMap.get("码");
List<Position> qu = positionListMap.get("区");
for (int i = 0; i < mi.size(); i++) {
float x1 = mi.get(i).getX();
for (int j = 0; j < ma.size(); j++) {
float x2 = ma.get(j).getX();
if (Math.abs(x1 - x2) < 5) {
for (int k = 0; k < qu.size(); k++) {
float x3 = qu.get(k).getX();
if (Math.abs(x2 - x3) < 5) {
maqX = Math.round((x1 + x2 + x3) / 3);
}
}
}
}
}*/
{
int x = Math.round(model.getX()+100);
int y = Math.round(taxRateAmount!=null?taxRateAmount.getY():0) + 5; // 用税额的y坐标作参考
int h = Math.round(amount.getY()) - Math.round(taxRate.getY()) -20; // 价税合计的y坐标减去税率的y坐标
detailStripper.addRegion("detail", new Rectangle(0, y, pageWidth, h));
stripper.addRegion("detailName", new Rectangle(0, y, x, h));
stripper.addRegion("detailPrice", new Rectangle(x, y, pageWidth, h));
}
{
// int x = maqX + 10;
// int y = Math.round(machineNumber.getY()) + 10;
// int w = pageWidth - maqX - 10;
// int h = Math.round(taxRate.getY() - 5) - y;
// stripper.addRegion("password", new Rectangle(x, y, w, h));
// int x = Math.round(buyer.getX()) - 15; // 开户行及账号的x为参考
// int y = Math.round(machineNumber.getY()) + 10; // 机器编号的y坐标为参考
// int w = maqX - x - 5; // 密码区x坐标为参考
// int h = Math.round(buyer.getY()) - y + 20; // 开户行及账号的y坐标为参考
// stripper.addRegion("buyer", new Rectangle(x, y, w, h));
// int x = Math.round(seller.getX()) - 15; // 开户行及账号为x参考
// int y = Math.round(totalAmount.getY()) + 10; // 价税合计的y坐标为参考
// int w = maqX - x - 5; // 密码区的x为参考
// int h = Math.round(seller.getY()) - y + 20; // 开户行及账号的y为参考
// stripper.addRegion("seller", new Rectangle(x, y, w, h));
}
{
int x = Math.round(buyer.getX()) - 30; // 买方 统一社会信用代码及账号的x为参考
int y = Math.round(buyer.getY()) - 60; // 买方 统一社会信用代码及账号的y为参考
int w = Math.round(seller.getX() - x - 5); //
int h = Math.round(buyer.getY()) - y + 60; // 买方 统一社会信用代码 开户行及账号的y坐标为参考
stripper.addRegion("buyer", new Rectangle(x, y, w, h));
}
{
int x = Math.round(seller.getX()) -30; // 购方 统一社会信用代码 开户行及账号为x参考
int y = Math.round(seller.getY()) - 60; // 购方 统一社会信用代码 的y坐标为参考
int w = pageWidth - x - 5; //
int h = Math.round(seller.getY()) - y + 60; // 开户行及账号的y为参考
stripper.addRegion("seller", new Rectangle(x, y, w, h));
}
}
stripper.extractRegions(firstPage);
detailStripper.extractRegions(firstPage);
doc.close();
// invoice.setPassword(StringUtils.trim(stripper.getTextForRegion("password")));
String reg = "名称:(?<name>\\S*)|纳税人识别号:(?<code>\\S*)|地址、电话:(?<address>\\S*)|开户行及账号:(?<account>\\S*)|电子支付标识:(?<account2>\\S*)|统一社会信用代码[\\s\\S]*?号:(?<account3>\\S*)";
{
String buyer = replace(stripper.getTextForRegion("buyer"));
Pattern pattern = Pattern.compile(reg);
Matcher matcher = pattern.matcher(buyer);
while (matcher.find()) {
if (matcher.group("name") != null) {
invoice.setBuyerName(matcher.group("name"));
} else if (matcher.group("code") != null) {
invoice.setBuyerCode(matcher.group("code"));
} else if (matcher.group("address") != null) {
invoice.setBuyerAddress(matcher.group("address"));
} else if (matcher.group("account") != null) {
invoice.setBuyerAccount(matcher.group("account"));
} else if (matcher.group("account2") != null) {
invoice.setBuyerAccount(matcher.group("account2"));
}else if (matcher.group("account3") != null) {
invoice.setBuyerCode(matcher.group("account3"));
}
}
}
{
String seller = replace(stripper.getTextForRegion("seller"));
Pattern pattern = Pattern.compile(reg);
Matcher matcher = pattern.matcher(seller);
while (matcher.find()) {
if (matcher.group("name") != null) {
invoice.setSellerName(matcher.group("name"));
} else if (matcher.group("code") != null) {
invoice.setSellerCode(matcher.group("code"));
} else if (matcher.group("address") != null) {
invoice.setSellerAddress(matcher.group("address"));
} else if (matcher.group("account") != null) {
invoice.setSellerAccount(matcher.group("account"));
}else if (matcher.group("account3") != null) {
invoice.setSellerCode(matcher.group("account3"));
}
}
}
{
List<String> skipList = new ArrayList<>();
List<Detail> detailList = new ArrayList<>();
String[] detailPriceStringArray = stripper.getTextForRegion("detailPrice").replaceAll(" ", " ").replaceAll(" ", " ")
.replaceAll("\r", "").split("\\n");
for (String detailString : detailPriceStringArray) {
Detail detail = new Detail();
detail.setName("");
String[] itemArray = StringUtils.split(detailString, " ");
if (2 == itemArray.length) {
detail.setAmount(itemArray[0]);
detail.setTaxAmount(itemArray[1]);
detailList.add(detail);
} else if (2 < itemArray.length) {
detail.setAmount(itemArray[itemArray.length - 3]);
String taxRate = itemArray[itemArray.length - 2];
if (taxRate.indexOf("免税") > 0 || taxRate.indexOf("不征税") > 0 || taxRate.indexOf("出口零税率") > 0
|| taxRate.indexOf("普通零税率") > 0 || taxRate.indexOf("%") < 0) {
detail.setTaxRate("0");
detail.setTaxAmount("0");
} else {
BigDecimal rate = new BigDecimal(Integer.parseInt(taxRate.replaceAll("%", "")));
detail.setTaxRate(String.valueOf(rate.divide(new BigDecimal(100))));
detail.setTaxAmount( itemArray[itemArray.length - 1]);
}
for (int j = 0; j < itemArray.length - 3; j++) {
if (itemArray[j].matches("^(-?\\d+)(\\.\\d+)?$")) {
if (null == detail.getCount()) {
detail.setCount(itemArray[j]);
} else {
detail.setPrice(itemArray[j]);
}
} else {
if (itemArray.length >= j + 1 && !itemArray[j + 1].matches("^(-?\\d+)(\\.\\d+)?$")) {
detail.setUnit(itemArray[j + 1]);
detail.setModel(itemArray[j]);
j++;
} else if (itemArray[j].length() > 2) {
detail.setModel(itemArray[j]);
} else {
detail.setUnit(itemArray[j]);
}
}
}
detailList.add(detail);
} else {
skipList.add(detailString);
}
}
String[] detailNameStringArray = stripper.getTextForRegion("detailName").replaceAll(" ", " ").replaceAll(" ", " ")
.replaceAll("\r", "").split("\\n");
String[] detailStringArray = replace(detailStripper.getTextForRegion("detail")).replaceAll("\r", "").split("\\n");
int i = 0, j = 0, h = 0, m = 0;
Detail lastDetail = null;
for (String detailString : detailStringArray) {
if (m < detailNameStringArray.length) {
if (detailString.matches("\\S+\\d*(%|免税|不征税|出口零税率|普通零税率)\\S*")
&& !detailString.matches("^ *\\d*(%|免税|不征税|出口零税率|普通零税率)\\S*")
&& detailString.matches("\\S+\\d+%[\\-\\d]+\\S*")
|| detailStringArray.length > i + 1
&& detailStringArray[i + 1].matches("^ *\\d*(%|免税|不征税|出口零税率|普通零税率)\\S*")) {
if (j < detailList.size()) {
lastDetail = detailList.get(j);
lastDetail.setName(detailNameStringArray[m]);
}
j++;
} else if (null != lastDetail && StringUtils.isNotBlank(detailNameStringArray[m])) {
if (skipList.size() > h) {
String skip = skipList.get(h);
if (detailString.endsWith(skip)) {
if (detailString.equals(skip)) {
m--;
} else {
lastDetail.setName(lastDetail.getName() + detailNameStringArray[m]);
}
lastDetail.setModel(lastDetail.getModel() + skip);
h++;
} else {
lastDetail.setName(lastDetail.getName() + detailNameStringArray[m]);
}
} else {
lastDetail.setName(lastDetail.getName() + detailNameStringArray[m]);
}
}
}
i++;
m++;
}
invoice.setDetailList(detailList);
}
}
} else { ///电子普通发票 电子专用发票
Pattern typetong = Pattern.compile("(?<p>\\S*)通发票");
Matcher matchertong = typetong.matcher(allText);
Pattern typeyong = Pattern.compile("(?<p>\\S*)用发票");
Matcher matcheryong = typeyong.matcher(allText);
boolean flag=false;
if(matchertong.find() ){///通发票
flag=true;
}
if(matcheryong.find() ){///用发票
flag=true;
}
if (!flag){
return invoice;
}
{
String reg = "机器编号:(?<machineNumber>\\d{12})|发票代码:(?<code>\\d{12})|发票号码:(?<number>\\d{8})|:(?<date>\\d{4}年\\d{2}月\\d{2}日)"
+ "|校验码:(?<checksum>\\d{20}|\\S{4,})";
Pattern pattern = Pattern.compile(reg);
Matcher matcher = pattern.matcher(allText);
while (matcher.find()) {
if (matcher.group("machineNumber") != null) {
invoice.setMachineNumber(matcher.group("machineNumber"));
} else if (matcher.group("code") != null) {
invoice.setCode(matcher.group("code"));
} else if (matcher.group("number") != null) {
invoice.setNumber(matcher.group("number"));
} else if (matcher.group("date") != null) {
invoice.setDate(matcher.group("date"));
} else if (matcher.group("checksum") != null) {
invoice.setChecksum(matcher.group("checksum"));
}
}
if (StringUtils.isBlank(invoice.getDate())) {
String kprqGrex = "开票日期:(\\d.*)日";
Pattern compile = Pattern.compile(kprqGrex);
Matcher matcher2 = compile.matcher(allText);
if (matcher2.find()) {
try {
invoice.setDate(matcher2.group(1));
} catch (Exception e) {
}
}
}
}
{
String reg = "合计¥?(?<amount>[^ \\f\\n\\r\\t\\v\\*]*)(?:¥?(?<taxAmount>\\S*)|\\*+)\\s";
Pattern pattern = Pattern.compile(reg);
Matcher matcher = pattern.matcher(allText);
if (matcher.find()) {
try {
invoice.setAmount(matcher.group("amount"));
} catch (Exception e) {
}
try {
invoice.setTaxAmount(matcher.group("taxAmount"));
} catch (Exception e) {
invoice.setTaxAmount("0");
}
}
}
try {
String amountTest = "^\\d+(\\.\\d+)?$";
Pattern pattern = Pattern.compile(amountTest);
Matcher matcher = pattern.matcher(invoice.getAmount());
if (!matcher.find()) {
invoice.setAmount(null);
}
} catch (Exception e) {
}
if (null == invoice.getAmount()) {
String reg = "合\\u0020*计\\u0020*¥?(?<amount>[^ ]*)\\u0020+¥?(?:(?<taxAmount>\\S*)|\\*+)\\s";
Pattern pattern = Pattern.compile(reg);
Matcher matcher = pattern.matcher(fullText);
if (matcher.find()) {
try {
invoice.setAmount(matcher.group("amount"));
if (StringUtils.isNotBlank(invoice.getAmount())) {
String grex = "¥(-?\\d*.?\\d+)";
Pattern pattern1 = Pattern.compile(grex);
Matcher matcher2 = pattern1.matcher(invoice.getAmount());
if (matcher2.find()) {
invoice.setAmount(matcher2.group(1));
}
}
} catch (Exception e) {
invoice.setAmount("0");
}
try {
invoice.setTaxAmount(matcher.group("taxAmount"));
if (StringUtils.isNotBlank(invoice.getTaxAmount())) {
String grex = "¥(-?\\d*.?\\d+)";
Pattern pattern1 = Pattern.compile(grex);
Matcher matcher2 = pattern1.matcher(invoice.getTaxAmount());
if (matcher2.find()) {
invoice.setTaxAmount(matcher2.group(1));
}
}
} catch (Exception e) {
invoice.setTaxAmount("0");
}
}
}
{
String reg = "价税合计\\u0028大写\\u0029(?<amountString>\\S*)\\u0028小写\\u0029¥?(?<amount>\\S*)\\s";
Pattern pattern = Pattern.compile(reg);
Matcher matcher = pattern.matcher(allText);
if (matcher.find()) {
invoice.setTotalAmountString(matcher.group("amountString"));
try {
invoice.setTotalAmount(matcher.group("amount"));
} catch (Exception e) {
invoice.setTotalAmount("0");
}
}
}
{
String reg = "收款人:(?<payee>\\S*)复核:(?<reviewer>\\S*)开票人:(?<drawer>\\S*)销售方";
Pattern pattern = Pattern.compile(reg);
Matcher matcher = pattern.matcher(allText);
if (matcher.find()) {
invoice.setPayee(matcher.group("payee"));
invoice.setReviewer(matcher.group("reviewer"));
invoice.setDrawer(matcher.group("drawer"));
}
if (allText.indexOf("通行费") > 0 && allText.indexOf("车牌号") > 0) {
invoice.setType("通行费");
}
Pattern type00Pattern = Pattern.compile("(?<p>\\S*)通发票");
Matcher m00 = type00Pattern.matcher(allText);
if (m00.find()) {
invoice.setTitle(m00.group("p").replaceAll("(?:国|统|一|发|票|监|制)", "") + "通发票");
if (null == invoice.getType()) {
invoice.setType("普通发票");
invoice.setFormat("202");
}
} else {
Pattern type01Pattern = Pattern.compile("(?<p>\\S*)用发票");
Matcher m01 = type01Pattern.matcher(allText);
if (m01.find()) {
invoice.setTitle(m01.group("p").replaceAll("(?:国|统|一|发|票|监|制)", "") + "用发票");
if (null == invoice.getType()) {
invoice.setType("专用发票");
invoice.setFormat("201");
}
}
}
}
PDFKeyWordPosition kwp = new PDFKeyWordPosition();
Map<String, List<Position>> positionListMap = kwp
.getCoordinate(Arrays.asList("机器编号", "税率", "价税合计", "合计", "开票日期", "规格型号", "车牌号", "开户行及账号", "密", "码", "区"), doc);
PDFTextStripperByArea stripper = new PDFTextStripperByArea();
stripper.setSortByPosition(true);
PDFTextStripperByArea detailStripper = new PDFTextStripperByArea();
detailStripper.setSortByPosition(true);
{
Position machineNumber;
if (positionListMap.get("机器编号").size() > 0) {
machineNumber = positionListMap.get("机器编号").get(0);
} else {
machineNumber = positionListMap.get("开票日期").get(0);
machineNumber.setY(machineNumber.getY() + 30);
}
Position taxRate = positionListMap.get("税率").get(0);
Position totalAmount = positionListMap.get("价税合计").get(0);
Position amount = positionListMap.get("合计").get(0);
Position model = null;
if (!positionListMap.get("规格型号").isEmpty()) {
model = positionListMap.get("规格型号").get(0);
} else {
model = positionListMap.get("车牌号").get(0);
model.setX(model.getX() - 15);
}
List<Position> account = positionListMap.get("开户行及账号");
Position buyer;
Position seller;
if (account.size() < 2) {
buyer = new Position(51, 122);
seller = new Position(51, 341);
} else {
buyer = account.get(0);
seller = account.get(1);
}
int maqX = 370;
List<Position> mi = positionListMap.get("密");
List<Position> ma = positionListMap.get("码");
List<Position> qu = positionListMap.get("区");
for (int i = 0; i < mi.size(); i++) {
float x1 = mi.get(i).getX();
for (int j = 0; j < ma.size(); j++) {
float x2 = ma.get(j).getX();
if (Math.abs(x1 - x2) < 5) {
for (int k = 0; k < qu.size(); k++) {
float x3 = qu.get(k).getX();
if (Math.abs(x2 - x3) < 5) {
maqX = Math.round((x1 + x2 + x3) / 3);
}
}
}
}
}
{
int x = Math.round(model.getX()) - 13;
int y = Math.round(taxRate.getY()) + 5; // 用税率的y坐标作参考
int h = Math.round(amount.getY()) - Math.round(taxRate.getY()) - 25; // 价税合计的y坐标减去税率的y坐标
detailStripper.addRegion("detail", new Rectangle(0, y, pageWidth, h));
stripper.addRegion("detailName", new Rectangle(0, y, x, h));
stripper.addRegion("detailPrice", new Rectangle(x, y, pageWidth, h));
}
{
int x = maqX + 10;
int y = Math.round(machineNumber.getY()) + 10;
int w = pageWidth - maqX - 10;
int h = Math.round(taxRate.getY() - 5) - y;
stripper.addRegion("password", new Rectangle(x, y, w, h));
}
{
int x = Math.round(buyer.getX()) - 15; // 开户行及账号的x为参考
int y = Math.round(machineNumber.getY()) + 10; // 机器编号的y坐标为参考
int w = maqX - x - 5; // 密码区x坐标为参考
int h = Math.round(buyer.getY()) - y + 20; // 开户行及账号的y坐标为参考
stripper.addRegion("buyer", new Rectangle(x, y, w, h));
}
{
int x = Math.round(seller.getX()) - 15; // 开户行及账号为x参考
int y = Math.round(totalAmount.getY()) + 10; // 价税合计的y坐标为参考
int w = maqX - x - 5; // 密码区的x为参考
int h = Math.round(seller.getY()) - y + 20; // 开户行及账号的y为参考
stripper.addRegion("seller", new Rectangle(x, y, w, h));
}
}
stripper.extractRegions(firstPage);
detailStripper.extractRegions(firstPage);
doc.close();
invoice.setPassword(StringUtils.trim(stripper.getTextForRegion("password")));
String reg = "名称:(?<name>\\S*)|纳税人识别号:(?<code>\\S*)|地址、电话:(?<address>\\S*)|开户行及账号:(?<account>\\S*)|电子支付标识:(?<account2>\\S*)";
{
String buyer = replace(stripper.getTextForRegion("buyer"));
Pattern pattern = Pattern.compile(reg);
Matcher matcher = pattern.matcher(buyer);
while (matcher.find()) {
if (matcher.group("name") != null) {
invoice.setBuyerName(matcher.group("name"));
} else if (matcher.group("code") != null) {
invoice.setBuyerCode(matcher.group("code"));
} else if (matcher.group("address") != null) {
invoice.setBuyerAddress(matcher.group("address"));
} else if (matcher.group("account") != null) {
invoice.setBuyerAccount(matcher.group("account"));
} else if (matcher.group("account2") != null) {
invoice.setBuyerAccount(matcher.group("account2"));
}
}
}
{
String seller = replace(stripper.getTextForRegion("seller"));
Pattern pattern = Pattern.compile(reg);
Matcher matcher = pattern.matcher(seller);
while (matcher.find()) {
if (matcher.group("name") != null) {
invoice.setSellerName(matcher.group("name"));
} else if (matcher.group("code") != null) {
invoice.setSellerCode(matcher.group("code"));
} else if (matcher.group("address") != null) {
invoice.setSellerAddress(matcher.group("address"));
} else if (matcher.group("account") != null) {
invoice.setSellerAccount(matcher.group("account"));
}
}
}
{
List<String> skipList = new ArrayList<>();
List<Detail> detailList = new ArrayList<>();
String[] detailPriceStringArray = stripper.getTextForRegion("detailPrice").replaceAll(" ", " ").replaceAll(" ", " ")
.replaceAll("\r", "").split("\\n");
for (String detailString : detailPriceStringArray) {
Detail detail = new Detail();
detail.setName("");
String[] itemArray = StringUtils.split(detailString, " ");
if (2 == itemArray.length) {
detail.setAmount(itemArray[0]);
detail.setTaxAmount(itemArray[1]);
detailList.add(detail);
} else if (2 < itemArray.length) {
detail.setAmount(itemArray[itemArray.length - 3]);
String taxRate = itemArray[itemArray.length - 2];
if (taxRate.indexOf("免税") > 0 || taxRate.indexOf("不征税") > 0 || taxRate.indexOf("出口零税率") > 0
|| taxRate.indexOf("普通零税率") > 0 || taxRate.indexOf("%") < 0) {
detail.setTaxRate("0");
detail.setTaxAmount("0");
} else {
BigDecimal rate = new BigDecimal(Integer.parseInt(taxRate.replaceAll("%", "")));
detail.setTaxRate(String.valueOf(rate.divide(new BigDecimal(100))));
detail.setTaxAmount( itemArray[itemArray.length - 1]);
}
for (int j = 0; j < itemArray.length - 3; j++) {
if (itemArray[j].matches("^(-?\\d+)(\\.\\d+)?$")) {
if (null == detail.getCount()) {
detail.setCount(itemArray[j]);
} else {
detail.setPrice(itemArray[j]);
}
} else {
if (itemArray.length >= j + 1 && !itemArray[j + 1].matches("^(-?\\d+)(\\.\\d+)?$")) {
detail.setUnit(itemArray[j + 1]);
detail.setModel(itemArray[j]);
j++;
} else if (itemArray[j].length() > 2) {
detail.setModel(itemArray[j]);
} else {
detail.setUnit(itemArray[j]);
}
}
}
detailList.add(detail);
} else {
skipList.add(detailString);
}
}
String[] detailNameStringArray = stripper.getTextForRegion("detailName").replaceAll(" ", " ").replaceAll(" ", " ")
.replaceAll("\r", "").split("\\n");
String[] detailStringArray = replace(detailStripper.getTextForRegion("detail")).replaceAll("\r", "").split("\\n");
int i = 0, j = 0, h = 0, m = 0;
Detail lastDetail = null;
for (String detailString : detailStringArray) {
if (m < detailNameStringArray.length) {
if (detailString.matches("\\S+\\d*(%|免税|不征税|出口零税率|普通零税率)\\S*")
&& !detailString.matches("^ *\\d*(%|免税|不征税|出口零税率|普通零税率)\\S*")
&& detailString.matches("\\S+\\d+%[\\-\\d]+\\S*")
|| detailStringArray.length > i + 1
&& detailStringArray[i + 1].matches("^ *\\d*(%|免税|不征税|出口零税率|普通零税率)\\S*")) {
if (j < detailList.size()) {
lastDetail = detailList.get(j);
lastDetail.setName(detailNameStringArray[m]);
}
j++;
} else if (null != lastDetail && StringUtils.isNotBlank(detailNameStringArray[m])) {
if (skipList.size() > h) {
String skip = skipList.get(h);
if (detailString.endsWith(skip)) {
if (detailString.equals(skip)) {
m--;
} else {
lastDetail.setName(lastDetail.getName() + detailNameStringArray[m]);
}
lastDetail.setModel(lastDetail.getModel() + skip);
h++;
} else {
lastDetail.setName(lastDetail.getName() + detailNameStringArray[m]);
}
} else {
lastDetail.setName(lastDetail.getName() + detailNameStringArray[m]);
}
}
}
i++;
m++;
}
invoice.setDetailList(detailList);
}
}
return invoice;
}
/**
 * Fallback parser used when the invoice number has not been filled in yet.
 *
 * <p>When {@code allText} begins with at least 12 digits, the leading 12 digits are stored
 * as the invoice code and the leading 8 digits as the invoice number. If the invoice date
 * is still blank, the first {@code yyyy年MM月dd日} occurrence in the text is stored as the
 * date. If the text begins with 32 digits, everything after the first 12 of them is stored
 * as the checksum.
 *
 * @param invoice the invoice to complete; returned unchanged when {@code null} or when it
 *                already carries a number
 * @param allText the text extracted from the PDF; when {@code null} nothing is parsed
 * @return the same {@code invoice} instance that was passed in
 */
private static Invoice parseGenerayInvoiceHandle(Invoice invoice, String allText) {
    // Nothing to complete or nothing to read from: hand the argument back untouched.
    // A null invoice used to slip into the parsing branch and fail on the first setter.
    if (invoice == null || allText == null || StringUtils.isNotBlank(invoice.getNumber())) {
        return invoice;
    }

    // NOTE(review): both patterns are anchored at the very start of the text, so the number
    // is always the first 8 digits of the 12-digit code. Kept as in the original; confirm
    // whether a different anchor (e.g. per-line matching) was intended.
    Matcher codeMatcher = Pattern.compile("^\\d{12}").matcher(allText);
    Matcher numberMatcher = Pattern.compile("^\\d{8}").matcher(allText);
    if (!codeMatcher.find() || !numberMatcher.find()) {
        return invoice;
    }
    invoice.setNumber(numberMatcher.group());
    invoice.setCode(codeMatcher.group());

    if (StringUtils.isBlank(invoice.getDate())) {
        Matcher dateMatcher = Pattern.compile("\\d{4}年\\d{2}月\\d{2}日").matcher(allText);
        if (dateMatcher.find()) {
            // The pattern has no capturing group, so the whole match is the date.
            // The previous group(1) call always threw and the date was silently dropped.
            invoice.setDate(dateMatcher.group());
        }
    }

    Matcher checksumMatcher = Pattern.compile("^\\d{32}").matcher(allText);
    if (checksumMatcher.find()) {
        // Skip the 12 leading digits; the remaining 20 digits form the checksum.
        invoice.setChecksum(checksumMatcher.group().substring(12));
    }
    return invoice;
}
/**
 * Parses the first page of a PDF.
 *
 * <p>Delegates to {@code extracts}, passing {@code 1} as its first argument.
 *
 * @param file the PDF file to parse
 * @return the invoice returned by {@code extracts}
 * @throws IOException if {@code extracts} throws it
 */
public static Invoice extract(File file) throws IOException {
    final int firstPageNo = 1;
    Invoice parsed = extracts(firstPageNo, file);
    return parsed;
}
/**
 * Strips space characters from the text and normalizes the full-width colon.
 *
 * <p>Removes the ASCII space (U+0020), the no-break space (U+00A0) and the ideographic
 * space (U+3000), and converts the full-width colon (U+FF1A) to an ASCII {@code ':'}.
 * The characters are written as code-point escapes so they cannot be flattened into
 * look-alike ASCII characters when the source file is re-encoded or copy-pasted.
 * Other whitespace, such as tabs and line breaks, is left in place.
 *
 * @param str the text to clean; must not be {@code null}
 * @return the text without the listed space characters and with ASCII colons
 */
public static String replace(String str) {
    return str.replaceAll("[\\u0020\\u00A0\\u3000]", "").replace('\uFF1A', ':');
}
/**
 * Manual check against a hard-coded local PDF: prints every invoice returned by
 * {@code PdfInvoiceExtractor.extractList} followed by the result of {@code extracts(1, file)},
 * all on standard error.
 *
 * @param args ignored
 */
public static void main(String[] args) {
    try {
        File file = new File("D:\\Desktop\\16.1.pdf");
        List<Invoice> parsedInvoices = PdfInvoiceExtractor.extractList(file);
        for (Invoice parsed : parsedInvoices) {
            System.err.println(parsed);
            System.err.println("---------");
        }
        System.err.println("--------------");
        Invoice firstPageInvoice = extracts(1, file);
        System.err.println(firstPageInvoice);
    } catch (IOException e) {
        e.printStackTrace();
    }
}