java pdf 识别表格内容 识别空格
maven 依赖
只有 2.0.0 以上版本的 PDFBox 才与此版本的 PDFLayoutTextStripper.java 兼容。
<dependency>
    <groupId>io.github.jonathanlink</groupId>
    <artifactId>PDFLayoutTextStripper</artifactId>
    <version>2.2.3</version>
</dependency>
package pdf;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDRectangle;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.apache.pdfbox.text.TextPositionComparator;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
public class PDFLayoutTextStripper extends PDFTextStripper {
// Debug switch; not read anywhere in the visible part of this file.
// NOTE(review): presumably toggles diagnostic output further down — confirm.
public static final boolean DEBUG = false;
// Width, in points, attributed to one output space character (per the name).
// NOTE(review): its use is outside the visible part of this file — confirm.
public static final int OUTPUT_SPACE_CHARACTER_WIDTH_IN_PT = 4;
// Width of the page being processed; processPage() passes the media box
// width to setCurrentPageWidth(), which presumably stores it here.
private double currentPageWidth;
// Starts as null and is reset to null after each page is processed.
private TextPosition previousTextPosition;
// Text lines for the current page; replaced with a fresh empty list after
// each page is processed.
private List<TextLine> textLineList;
/**
 * Creates a stripper that starts with no previous text position recorded
 * and an empty list of text lines.
 *
 * @throws IOException if the {@link PDFTextStripper} constructor fails
 */
public PDFLayoutTextStripper() throws IOException {
    // The superclass no-arg constructor is invoked implicitly.
    previousTextPosition = null;
    textLineList = new ArrayList<TextLine>();
}
/**
 * Processes a single page: records the width of its media box, delegates
 * the extraction to {@link PDFTextStripper#processPage(PDPage)}, and then
 * clears the per-page state held by this stripper.
 *
 * <p>A page whose media box is {@code null} is skipped entirely.
 *
 * @param page page to parse
 * @throws IOException if the superclass fails while processing the page
 */
@Override
public void processPage(PDPage page) throws IOException {
    PDRectangle pageRectangle = page.getMediaBox();
    if (pageRectangle == null) {
        return;
    }
    this.setCurrentPageWidth(pageRectangle.getWidth());
    try {
        super.processPage(page);
    } finally {
        // Reset the per-page state even when processing fails; otherwise
        // lines collected for the failed page would still be in the list
        // when the next page is processed on this instance.
        this.previousTextPosition = null;
        this.textLineList = new ArrayList<TextLine>();
    }
}
/**
 * Sorts the characters of every article, feeds them to
 * {@code iterateThroughTextList}, and finally writes the text lines
 * returned by {@code getTextLineList()} to the output.
 *
 * @throws IOException if writing the lines to the output fails
 */
@Override
protected void writePage() throws IOException {
    for (List<TextPosition> articleCharacters : super.getCharactersByArticle()) {
        try {
            this.sortTextPositionList(articleCharacters);
        } catch (IllegalArgumentException sortFailure) {
            // Best effort: report the failed sort and carry on with the
            // characters in whatever order the list was left in.
            System.err.println(sortFailure);
        }
        this.iterateThroughTextList(articleCharacters.iterator());
    }
    this.writeToOutputStream(this.getTextLineList());
}
/**
 * Writes each text line to the stripper's output, followed by a newline
 * character, flushing the output after every line.
 *
 * @param textLineList lines to write, in order
 * @throws IOException if writing to or flushing the output fails
 */
private void writeToOutputStream(final List<TextLine> textLineList) throws IOException {
    final Iterator<TextLine> remainingLines = textLineList.iterator();
    while (remainingLines.hasNext()) {
        super.getOutput().write(remainingLines.next().getLine().toCharArray());
        super.getOutput().write('\n');
        super.getOutput().flush();
    }
}
/*
 * Sorts the given text positions in place using a TextPositionComparator.
 *
 * The unchecked warning is suppressed because the compiler otherwise warns
 * that TextPositionComparator should implement Comparator<TextPosition>
 * instead of the raw Comparator.
 */
@SuppressWarnings("unchecked")
private void sortTextPositionList(final List<TextPosition> textList) {
    Collections.sort(textList, new TextPositionComparator());
}
private void writeLine(final List<TextPosition> textPositionList) {
if ( textPositionList.size() > 0 ) {
TextLine textLine = this.addNewLine();
boolean firstCharacterOfLineFound = false;
for (TextPosition textPosition : textPositionList ) {
CharacterFactory characterFactory = new CharacterFactory(firstCharacterOfLineFound);
Character character = c