前提
公司其它部门同事有一批PDF文件,希望能从这批PDF文件中提取出图片和表格。
PDF文件示例:
提取出的图片:
提取出的表格:
实现分析
图片的提取,可以使用spire.pdf(官网:https://www.e-iceblue.cn);测试下来,效果OK。
表格的提取没有现成的工具。但这批PDF中表格的字段和布局比较规范,因此可以通过文本的坐标(x,y)来识别并提取;提取过程中捕捉到的错列、错行等异常,需要人工处理。
主要Maven配置
<!-- Extra Maven repository operated by e-iceblue; presumably required to resolve the
     spire.pdf.free artifact declared in the dependencies section (confirm it is not on Maven Central). -->
<repositories>
<repository>
<id>com.e-iceblue</id>
<name>e-iceblue</name>
<url>https://repo.e-iceblue.cn/repository/maven-public/</url>
</repository>
</repositories>
<!-- Only the two PDF libraries are listed here. The Java code in this document also references
     SXSSFWorkbook, FileUtils and LoggerFactory, which presumably come from Apache POI,
     commons-io and SLF4J; those dependencies are not shown in this snippet. -->
<dependencies>
<!-- iText 5: presumably supplies PdfReader / PdfTextExtractor used for table text extraction. -->
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itextpdf</artifactId>
<version>5.5.1</version>
</dependency>
<!-- Free edition of Spire.PDF: presumably supplies PdfDocument / PdfPageBase used for image extraction. -->
<dependency>
<groupId>e-iceblue</groupId>
<artifactId>spire.pdf.free</artifactId>
<version>5.1.0</version>
</dependency>
</dependencies>
核心代码
public class PDF2DataTest {
/** Class-wide logger; the extraction methods use it to report misaligned rows and out-of-range text. */
public static final Logger LOGGER = LoggerFactory.getLogger(PDF2DataTest.class);
/**
 * Saves the first image that can be extracted from a PDF as
 * {@code <imageFileFolder>/<pdfName>.png}.
 *
 * <p>Nothing is done when the target PNG already exists. Pages are scanned in document
 * order and the method returns as soon as one image has been written. A page whose
 * extraction fails is skipped so that later pages still get a chance.
 *
 * @param pdfName         base name (without extension) used for the output PNG
 * @param pdfFilePath     path of the PDF to read
 * @param imageFileFolder folder that receives the PNG; created when missing
 * @throws Exception if the PDF cannot be loaded or the output folder cannot be created
 */
public void extractImages(String pdfName, String pdfFilePath, String imageFileFolder) throws Exception {
    String imageFilePath = imageFileFolder + File.separator + pdfName + ".png";
    File imageFile = new File(imageFilePath);
    if (imageFile.exists()) {
        return;
    }
    // Without the folder every ImageIO.write would fail and be hidden by the per-page catch below.
    File outputFolder = imageFile.getParentFile();
    if (outputFolder != null && !outputFolder.exists() && !outputFolder.mkdirs()) {
        throw new IllegalStateException("Cannot create image folder: " + outputFolder);
    }
    PdfDocument pdfDocument = new PdfDocument();
    try {
        pdfDocument.loadFromFile(pdfFilePath);
        for (PdfPageBase pdfPageBase : (Iterable<PdfPageBase>) pdfDocument.getPages()) {
            try {
                for (BufferedImage imageBuffer : pdfPageBase.extractImages()) {
                    // Only the first image of the document is kept.
                    ImageIO.write(imageBuffer, "PNG", imageFile);
                    return;
                }
            } catch (Exception ex) {
                // Best effort: a page that cannot be processed must not abort the scan,
                // but leave a trace instead of swallowing the failure silently.
                LOGGER.debug("Skipping page while extracting image, PDF: {}", pdfName, ex);
            }
        }
    } finally {
        // Release the document on every exit path (early return, failure, no image found).
        pdfDocument.close();
    }
}
/** Concatenated table header (spaces removed); a page is processed only if its text contains it. */
public static String EXCEL_TITLE_STR = "图号起始年终止年正选装负选装配件名称配件号左/右单量";

/**
 * Reads every page of a PDF and, for each page whose text contains the table header
 * {@link #EXCEL_TITLE_STR}, collects the positioned text chunks and hands them to
 * {@code extractText(List, String, List)} to be turned into data rows.
 *
 * @param dataStrsList accumulator that receives the extracted rows
 * @param pdfName      base name of the PDF, stored in each row and used in log messages
 * @param pdfFilePath  path of the PDF to read
 * @throws Exception if the PDF cannot be opened or parsed
 */
public void extractText(List<String[]> dataStrsList, String pdfName, String pdfFilePath) throws Exception {
    PdfReader pdfReader = new PdfReader(pdfFilePath);
    try {
        pdfReader.setAppendable(false);
        // One parser serves all pages of the same reader.
        PdfReaderContentParser parser = new PdfReaderContentParser(pdfReader);
        int pageCount = pdfReader.getNumberOfPages();
        for (int i = 1; i <= pageCount; i++) {
            String excelText = PdfTextExtractor.getTextFromPage(pdfReader, i);
            // Spaces are stripped because the extracted header may contain them between columns.
            if (!excelText.replace(" ", "").contains(EXCEL_TITLE_STR)) {
                continue;
            }
            CustomRenderListener renderListener = new CustomRenderListener();
            parser.processContent(i, renderListener);
            List<Coordinate> coordinateList = renderListener.getCoordinateList();
            extractText(dataStrsList, pdfName, coordinateList);
        }
    } finally {
        // Close the reader even when a page fails to parse.
        pdfReader.close();
    }
}
/** Column titles of the exported sheet; each data row is expected to supply one value per title. */
public static String[] EXCEL_TITLE = new String[]{"PDF", "IMAGE", "图号", "起始年", "终止年", "正选装", "负选装", "配件名称", "配件号", "左/右", "单量"};

/**
 * Writes the extracted rows to {@code <excelFileFolder>/<excelName>.xlsx} in a sheet named
 * {@code data}: row 0 holds {@link #EXCEL_TITLE}, each following row holds one element of
 * {@code dataStrsList}. An existing file is overwritten.
 *
 * @param dataStrsList    rows to export, one {@code String[]} per sheet row
 * @param excelName       output file name without extension
 * @param excelFileFolder folder that receives the workbook
 * @throws Exception if the workbook cannot be written
 */
public void exportData(List<String[]> dataStrsList, String excelName, String excelFileFolder) throws Exception {
    SXSSFWorkbook workbook = new SXSSFWorkbook();
    try {
        workbook.setCompressTempFiles(true);
        Sheet sheet = workbook.createSheet("data");
        Row headerRow = sheet.createRow(0);
        for (int i = 0; i < EXCEL_TITLE.length; i++) {
            Cell cell = headerRow.createCell(i);
            cell.setCellValue(EXCEL_TITLE[i]);
        }
        for (int i = 0; i < dataStrsList.size(); i++) {
            String[] dataStrs = dataStrsList.get(i);
            Row dataRow = sheet.createRow(i + 1);
            for (int x = 0; x < EXCEL_TITLE.length; x++) {
                Cell cell = dataRow.createCell(x);
                // A row shorter than the header yields empty cells instead of an index error.
                cell.setCellValue(x < dataStrs.length ? dataStrs[x] : "");
            }
        }
        File file = new File(excelFileFolder + File.separator + excelName + ".xlsx");
        // openOutputStream creates missing parent folders and truncates an existing file,
        // so no explicit delete()/createNewFile() is needed.
        try (FileOutputStream stream = FileUtils.openOutputStream(file)) {
            workbook.write(stream);
        }
    } finally {
        // Always remove the temporary files backing the streaming workbook.
        workbook.dispose();
    }
}
/**
 * Converts the positioned text chunks of one page into table rows and appends them to
 * {@code dataStrsList}.
 *
 * <p>Chunks are grouped into lines by identical y coordinate and the lines are visited
 * from the largest y to the smallest. Lines before the header line (the one starting with
 * 图号 / 起始年 / 终止年) are ignored; the header line fixes the x position of each column.
 * After that, a line whose first chunk sits within 10 units of the first column's x is an
 * "anchor" line that starts a new row. Other lines are buffered as wrapped cell text:
 * lines buffered before an anchor are merged into that anchor's row together with the same
 * number of lines following it; lines that cannot be tied to a following anchor are
 * appended to the most recently added row. Lines less than 11 units apart in y are
 * considered to belong together.
 *
 * @param dataStrsList      accumulator that receives the rows; also the return value
 * @param pdfName           base name of the PDF, stored in each row and used in log messages
 * @param coordinateAllList positioned text chunks of a single page
 * @return {@code dataStrsList}
 * @throws Exception declared for interface compatibility
 */
public List<String[]> extractText(List<String[]> dataStrsList, String pdfName, List<Coordinate> coordinateAllList) throws Exception {
    // Group chunks into lines keyed by exact y; TreeMap keeps the keys in ascending order.
    Map<Float, List<Coordinate>> coordinateMap = new TreeMap<>();
    for (Coordinate coordinate : coordinateAllList) {
        if (!coordinateMap.containsKey(coordinate.getY())) {
            coordinateMap.put(coordinate.getY(), new ArrayList<>());
        }
        coordinateMap.get(coordinate.getY()).add(coordinate);
    }
    List<Float> keyList = new ArrayList<>(coordinateMap.keySet());
    float[] fieldX = new float[9];
    boolean isNextData = false;
    // y of the most recently buffered non-anchor line (0 = nothing buffered yet).
    float yAllInit = 0;
    // Number of non-anchor lines buffered since the last emitted row.
    int dataFloor = 0;
    List<Coordinate> coordinateTmpList = new ArrayList<>();
    // Walk from the largest y downwards; i is also advanced inside the body when look-ahead lines are consumed.
    for (int i = keyList.size() - 1; i >= 0; ) {
        List<Coordinate> coordinateOneList = coordinateMap.get(keyList.get(i));
        i = i - 1;
        if ((coordinateOneList.size() == 9 || coordinateOneList.size() == 10)
                && "图号".equals(coordinateOneList.get(0).getText())
                && "起始年".equals(coordinateOneList.get(1).getText())
                && "终止年".equals(coordinateOneList.get(2).getText())) {
            // Header line: remember the x position of each column title.
            for (int column = 0; column < 8; column++) {
                fieldX[column] = coordinateOneList.get(column).getX();
            }
            // The last title is always the final chunk (index 8 of 9, or index 9 of 10).
            fieldX[8] = coordinateOneList.get(coordinateOneList.size() - 1).getX();
            isNextData = true;
        } else if (isNextData) {
            if (coordinateOneList.get(0).getX() >= fieldX[0] - 10
                    && coordinateOneList.get(0).getX() <= fieldX[0] + 10) {
                // Anchor line: starts a new row.
                coordinateTmpList.addAll(coordinateOneList);
                String[] dataFinalStrs = new String[]{pdfName + ".pdf", pdfName + ".png", "", "", "", "", "", "", "", "", ""};
                if (dataFloor > 0) {
                    // Lines were buffered before the anchor; consume the same number of lines after it.
                    float yInit = coordinateOneList.get(0).getY();
                    // "i >= 0" stops the look-ahead at the end of the page instead of indexing keyList with -1.
                    for (int x = 0; x < dataFloor && i >= 0; x++) {
                        List<Coordinate> coordinateOneTmpList = coordinateMap.get(keyList.get(i));
                        if (yInit - coordinateOneTmpList.get(0).getY() < 11.0) {
                            yInit = coordinateOneTmpList.get(0).getY();
                            i = i - 1;
                            // A line starting with "h" near the left margin is dropped
                            // (presumably a URL footer -- confirm against sample PDFs).
                            if (coordinateOneTmpList.get(0).getX() > 15 && coordinateOneTmpList.get(0).getX() < 35
                                    && coordinateOneTmpList.get(0).getText().startsWith("h")) {
                                coordinateOneTmpList = new ArrayList<>();
                            }
                            coordinateTmpList.addAll(coordinateOneTmpList);
                        } else {
                            LOGGER.warn("!!!!!!!PDF出现错行!PDF名称:{}", pdfName);
                            break;
                        }
                    }
                }
                coordinate2Data(pdfName, dataFinalStrs, fieldX, coordinateTmpList);
                dataStrsList.add(dataFinalStrs);
                yAllInit = 0;
                dataFloor = 0;
                coordinateTmpList = new ArrayList<>();
            } else {
                if (coordinateOneList.get(0).getX() > 15 && coordinateOneList.get(0).getX() < 35
                        && coordinateOneList.get(0).getText().startsWith("h")) {
                    // Same "h" line as above: treated as the end of the table on this page.
                    break;
                } else {
                    boolean continuesBuffer = coordinateTmpList.size() <= 0 || yAllInit == 0
                            || yAllInit - coordinateOneList.get(0).getY() < 11.0;
                    if (!continuesBuffer) {
                        // The buffered lines are too far from this one: they trail the previous row.
                        appendToLastRow(pdfName, dataStrsList, fieldX, coordinateTmpList);
                        dataFloor = 0;
                        coordinateTmpList = new ArrayList<>();
                    }
                    yAllInit = coordinateOneList.get(0).getY();
                    dataFloor = dataFloor + 1;
                    coordinateTmpList.addAll(coordinateOneList);
                }
            }
        }
    }
    if (coordinateTmpList.size() > 0) {
        // Lines left over at the end of the page trail the last row.
        appendToLastRow(pdfName, dataStrsList, fieldX, coordinateTmpList);
    }
    return dataStrsList;
}

/** Merges buffered text into the most recently added row, or logs and drops it when no row exists yet. */
private void appendToLastRow(String pdfName, List<String[]> dataStrsList, float[] fieldX, List<Coordinate> fragments) {
    if (dataStrsList.isEmpty()) {
        LOGGER.warn("!!!!PDF存在无法归属到任何数据行的文本,已忽略!PDF名称:{}", pdfName);
        return;
    }
    coordinate2Data(pdfName, dataStrsList.get(dataStrsList.size() - 1), fieldX, fragments);
}
/**
 * Distributes positioned text chunks over the columns of one output row.
 *
 * <p>For every chunk the x coordinate is compared with the header positions in
 * {@code fieldX}; the first matching range (checked in the order below) decides which
 * slot of {@code dataFinalStrs} the chunk's text is appended to. Slots 0 and 1 (PDF and
 * image name) are never touched. A chunk that matches no range is only logged.
 *
 * @param pdfName        base name of the PDF, used in the log message
 * @param dataFinalStrs  row being filled; modified in place
 * @param fieldX         x positions of the nine header titles
 * @param coordinateList chunks to distribute
 */
public void coordinate2Data(String pdfName, String[] dataFinalStrs, float[] fieldX, List<Coordinate> coordinateList) {
    for (Coordinate chunk : coordinateList) {
        final float posX = chunk.getX();
        final int slot;
        if (posX > fieldX[0] - 10 && posX < fieldX[0] + 20) {
            slot = 2;   // header column 0
        } else if (posX > fieldX[1] - 10 && posX < fieldX[1] + 30) {
            slot = 3;   // header column 1
        } else if (posX > fieldX[2] - 10 && posX < fieldX[2] + 30) {
            slot = 4;   // header column 2
        } else if (posX > fieldX[6] - 20 && posX < fieldX[6] + 40) {
            slot = 8;   // header column 6
        } else if (posX > fieldX[7] - 10 && posX < fieldX[7] + 20) {
            slot = 9;   // header column 7
        } else if (posX > fieldX[8] - 10 && posX < fieldX[8] + 20) {
            slot = 10;  // header column 8
        } else if (posX > fieldX[4] - 40 && posX < fieldX[4] + 30) {
            slot = 6;   // header column 4
        } else if (posX > fieldX[4] + 30 && posX < fieldX[6] - 20) {
            slot = 7;   // header column 5: the gap between columns 4 and 6
        } else if (posX > fieldX[2] + 30 && posX < fieldX[4] - 40) {
            slot = 5;   // header column 3: the gap between columns 2 and 4
        } else {
            slot = -1;  // outside every known range
        }

        if (slot < 0) {
            LOGGER.warn("!!!!XY轴溢出!PDF名称:{},溢出内容:{}", pdfName, chunk.toString());
            continue;
        }
        dataFinalStrs[slot] = dataFinalStrs[slot] + chunk.getText();
    }
}
/**
 * Render listener that records every text chunk of a page together with the x/y of the
 * bounding rectangle of its baseline. Images and text-block boundaries are ignored.
 */
class CustomRenderListener implements RenderListener {

    /** Chunks in the order they were reported by the parser. */
    List<Coordinate> coordinateList = new ArrayList<>();

    @Override
    public void renderText(TextRenderInfo textRenderInfo) {
        // "getBoundingRectange" is the library's own spelling of the method name.
        Rectangle2D.Float baselineBox = textRenderInfo.getBaseline().getBoundingRectange();
        Coordinate chunk = new Coordinate(textRenderInfo.getText(), baselineBox.x, baselineBox.y);
        coordinateList.add(chunk);
    }

    @Override
    public void renderImage(ImageRenderInfo imageRenderInfo) {
        // Images are not needed for table extraction.
    }

    @Override
    public void beginTextBlock() {
        // No action required.
    }

    @Override
    public void endTextBlock() {
        // No action required.
    }

    /** Returns the live list of chunks collected so far. */
    public List<Coordinate> getCoordinateList() {
        return coordinateList;
    }
}
/** A piece of text together with the x/y position at which it was found. */
class Coordinate {

    String text;
    float x;
    float y;

    /**
     * @param text the text content
     * @param x    horizontal position
     * @param y    vertical position
     */
    Coordinate(String text, float x, float y) {
        this.text = text;
        this.x = x;
        this.y = y;
    }

    /** Returns the text content. */
    public String getText() {
        return this.text;
    }

    /** Returns the horizontal position. */
    public float getX() {
        return this.x;
    }

    /** Returns the vertical position. */
    public float getY() {
        return this.y;
    }

    /** Renders the value as {@code Coordinate{text='...', x=..., y=...}}. */
    @Override
    public String toString() {
        StringBuilder rendered = new StringBuilder("Coordinate{");
        rendered.append("text='").append(text).append('\'');
        rendered.append(", x=").append(x);
        rendered.append(", y=").append(y);
        rendered.append('}');
        return rendered.toString();
    }
}
/**
 * Batch entry point: for every PDF file in the source folder, extracts its first image and
 * its table rows, then writes all rows to a single workbook.
 *
 * <p>Only regular files whose name ends with {@code .pdf} (any letter case) are processed;
 * the extension is stripped from the end of the name only.
 *
 * @param args unused
 * @throws Exception if the source folder cannot be listed or any extraction/export step fails
 */
public static void main(String[] args) throws Exception {
    String pdfFileFolder = "D:\\fan.zhou\\可行性研究\\正时\\Tmp-PDF提取数据\\别克\\插电式混动别克微蓝6\\配件信息截图";
    String imageFileFolder = "D:\\fan.zhou\\可行性研究\\正时\\Tmp-PDF提取数据\\别克\\插电式混动别克微蓝6\\Extract\\Image";
    String excelName = "插电式混动别克微蓝6";
    String excelFileFolder = "D:\\fan.zhou\\可行性研究\\正时\\Tmp-PDF提取数据\\别克\\插电式混动别克微蓝6\\Extract";
    PDF2DataTest pdf2DataTest = new PDF2DataTest();
    File[] pdfFiles = new File(pdfFileFolder).listFiles();
    if (pdfFiles == null) {
        // listFiles() returns null when the path is not a directory or cannot be read.
        throw new IllegalStateException("PDF folder does not exist or cannot be listed: " + pdfFileFolder);
    }
    List<String[]> dataStrsList = new ArrayList<>();
    for (File pdfFile : pdfFiles) {
        String fileName = pdfFile.getName();
        int suffixStart = fileName.length() - 4;
        // Skip sub-folders, non-PDF files and a file named just ".pdf".
        if (!pdfFile.isFile() || suffixStart <= 0 || !fileName.regionMatches(true, suffixStart, ".pdf", 0, 4)) {
            continue;
        }
        String pdfName = fileName.substring(0, suffixStart);
        String pdfAbsolutePath = pdfFile.getAbsolutePath();
        LOGGER.debug("当前处理的文件:{}", pdfName);
        pdf2DataTest.extractImages(pdfName, pdfAbsolutePath, imageFileFolder);
        pdf2DataTest.extractText(dataStrsList, pdfName, pdfAbsolutePath);
    }
    pdf2DataTest.exportData(dataStrsList, excelName, excelFileFolder);
}
}