Parsing a PDF document in Java

1. Maven dependencies:

<dependency>

              <groupId>org.apache.pdfbox</groupId>

              <artifactId>pdfbox</artifactId>

              <version>2.0.7</version>

          </dependency>

          <dependency>

              <groupId>com.itextpdf</groupId>

              <artifactId>itextpdf</artifactId>

              <version>5.5.10</version>

          </dependency>

2. Write a custom text extraction strategy (adapted from iText's LocationTextExtractionStrategy):

package operatePdf;


import java.util.ArrayList;
import java.util.Collections;
import java.util.List;


import com.itextpdf.text.pdf.parser.FilteredRenderListener;
import com.itextpdf.text.pdf.parser.ImageRenderInfo;
import com.itextpdf.text.pdf.parser.LineSegment;
import com.itextpdf.text.pdf.parser.LocationTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.Matrix;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextRenderInfo;
import com.itextpdf.text.pdf.parser.Vector;


public class MyLocationTextExtractionStrategy implements TextExtractionStrategy {


/**
 * Debug switch. When true, {@link #getResultantText(TextChunkFilter)} prints
 * every collected chunk to standard output before it builds its result.
 */
static boolean DUMP_STATE = false;


/**
 * Every chunk received through {@link #renderText(TextRenderInfo)}. Note that
 * {@link #getResultantText(TextChunkFilter)} sorts this very list in place
 * when it is called with a null filter.
 */
private final List<TextChunk> locationalResult = new ArrayList<TextChunk>();


/** Factory used by {@link #renderText(TextRenderInfo)} to build the location of each chunk. */
private final TextChunkLocationStrategy tclStrat;


/**
 * Creates a text extraction renderer that uses the default location strategy:
 * every chunk location is a {@link TextChunkLocationDefaultImp} built from the
 * start and end points of the baseline and from the single-space width
 * reported by the render info.
 */
public MyLocationTextExtractionStrategy() {
    this(new TextChunkLocationStrategy() {
        @Override
        public TextChunkLocation createLocation(TextRenderInfo renderInfo, LineSegment baseline) {
            Vector from = baseline.getStartPoint();
            Vector to = baseline.getEndPoint();
            float spaceWidth = renderInfo.getSingleSpaceWidth();
            return new TextChunkLocationDefaultImp(from, to, spaceWidth);
        }
    });
}


/**
 * Creates a text extraction renderer whose chunk locations are produced by
 * the supplied strategy instead of the default one.
 *
 * @param strat
 *            the strategy that turns a {@link TextRenderInfo} and its baseline
 *            into a {@link TextChunkLocation}; it is stored as-is and invoked
 *            for every rendered text chunk
 */
public MyLocationTextExtractionStrategy(TextChunkLocationStrategy strat) {
    this.tclStrat = strat;
}


/**
 * No-op: this strategy keeps no per-text-block state.
 *
 * @see com.itextpdf.text.pdf.parser.RenderListener#beginTextBlock()
 */
@Override
public void beginTextBlock() {
    // intentionally empty
}


/**
 * No-op: this strategy keeps no per-text-block state.
 *
 * @see com.itextpdf.text.pdf.parser.RenderListener#endTextBlock()
 */
@Override
public void endTextBlock() {
    // intentionally empty
}


/**
 * Tells whether the first character of a string is a plain space (' ').
 *
 * @param str
 *            the string to inspect; must not be null
 * @return true if the string is non-empty and its first character is a
 *         space, false otherwise
 */
private boolean startsWithSpace(String str) {
    return !str.isEmpty() && str.charAt(0) == ' ';
}


/**
 * Tells whether the last character of a string is a plain space (' ').
 *
 * @param str
 *            the string to inspect; must not be null
 * @return true if the string is non-empty and its last character is a space,
 *         false otherwise
 */
private boolean endsWithSpace(String str) {
    int lastIndex = str.length() - 1;
    return lastIndex >= 0 && str.charAt(lastIndex) == ' ';
}


/**
 * Applies a filter to a list of chunks.
 *
 * @param textChunks
 *            the chunks collected by this strategy so far
 * @param filter
 *            the filter to apply; when null no filtering happens and the
 *            input list itself (not a copy) is returned
 * @return a new list holding the accepted chunks in their original order, or
 *         {@code textChunks} itself when {@code filter} is null
 * @since 5.3.3
 */
private List<TextChunk> filterTextChunks(List<TextChunk> textChunks, TextChunkFilter filter) {
    if (filter != null) {
        List<TextChunk> accepted = new ArrayList<TextChunk>();
        for (TextChunk candidate : textChunks) {
            if (!filter.accept(candidate)) {
                continue;
            }
            accepted.add(candidate);
        }
        return accepted;
    }
    return textChunks;
}


/**
 * Decides whether a separator belongs between the previous chunk and the
 * current one. The decision is delegated to
 * {@link TextChunkLocation#isAtWordBoundary(TextChunkLocation)} on the current
 * chunk's location; the method is protected so subclasses can fine tune the
 * rule.
 * <p>
 * With {@link TextChunkLocationDefaultImp} a boundary is reported when the gap
 * between the end of the previous chunk and the start of the current chunk
 * exceeds half the space-character width, or when the current chunk starts
 * more than one space width before the end of the previous chunk (overlapping
 * text).
 *
 * @param chunk
 *            the chunk being evaluated
 * @param previousChunk
 *            the chunk that comes immediately before it
 * @return true if the two chunks are separate words, false otherwise
 */
protected boolean isChunkAtWordBoundary(TextChunk chunk, TextChunk previousChunk) {
    TextChunkLocation current = chunk.getLocation();
    TextChunkLocation preceding = previousChunk.getLocation();
    return current.isAtWordBoundary(preceding);
}


/**
 * Builds the text of all chunks accepted by the given filter, in sorted
 * order. Chunks on the same line are concatenated, with a '|' inserted at
 * word boundaries; a change of line produces a '\n'.
 * <p>
 * If several extractions are needed for the same page (for instance one per
 * physical region), filtering here is cheaper than using a
 * {@link FilteredRenderListener}, but less powerful because a
 * {@link TextChunk} keeps only a small part of the render state.
 * <p>
 * When {@code chunkFilter} is null the internal chunk list itself is sorted
 * in place.
 *
 * @param chunkFilter
 *            the filter to apply, or null to keep every chunk
 * @return the text collected so far, restricted to the accepted chunks
 */
public String getResultantText(TextChunkFilter chunkFilter) {
    if (DUMP_STATE) {
        dumpState();
    }

    List<TextChunk> ordered = filterTextChunks(locationalResult, chunkFilter);
    Collections.sort(ordered);

    StringBuilder out = new StringBuilder();
    TextChunk previous = null;
    for (TextChunk current : ordered) {
        if (previous != null) {
            if (!current.sameLine(previous)) {
                // different line: start a new output line
                out.append('\n');
            } else if (isChunkAtWordBoundary(current, previous) && !startsWithSpace(current.text)
                    && !endsWithSpace(previous.text)) {
                // same line, new word, and neither side already carries a
                // space: emit the '|' separator
                out.append('|');
            }
        }
        out.append(current.text);
        previous = current;
    }

    return out.toString();
}


/**
 * Returns the text collected so far without any filtering; equivalent to
 * calling {@link #getResultantText(TextChunkFilter)} with a null filter.
 *
 * @return a String with the resulting text
 */
@Override
public String getResultantText() {
    final TextChunkFilter noFilter = null;
    return getResultantText(noFilter);
}


/**
 * Debugging aid: prints the diagnostics of every collected chunk to standard
 * output, each followed by an empty line.
 */
private void dumpState() {
    for (TextChunk collected : locationalResult) {
        collected.printDiagnostics();
        System.out.println();
    }
}


/**
 * Records one rendered piece of text together with its location.
 * <p>
 * When the text has a non-zero rise (super/subscript), the rise is removed
 * from the baseline first, so that the chunk is positioned on the baseline of
 * the text it is relative to.
 *
 * @param renderInfo
 *            the render information of the text being drawn
 * @see com.itextpdf.text.pdf.parser.RenderListener#renderText(com.itextpdf.text.pdf.parser.TextRenderInfo)
 */
@Override
public void renderText(TextRenderInfo renderInfo) {
    LineSegment baseline = renderInfo.getBaseline();
    float rise = renderInfo.getRise();
    if (rise != 0) {
        // shift the baseline back by the rise
        baseline = baseline.transformBy(new Matrix(0, -rise));
    }
    String content = renderInfo.getText();
    TextChunkLocation where = tclStrat.createLocation(renderInfo, baseline);
    locationalResult.add(new TextChunk(content, where));
}


/**
 * Factory for the {@link TextChunkLocation} attached to each extracted chunk.
 */
public interface TextChunkLocationStrategy {
    /**
     * @param renderInfo
     *            the render information of the text being drawn
     * @param baseline
     *            the baseline of that text (already corrected for rise by
     *            the caller in this class)
     * @return the location to store with the chunk
     */
    TextChunkLocation createLocation(TextRenderInfo renderInfo, LineSegment baseline);
}


/**
 * Position and orientation of a chunk of text, comparable so that chunks can
 * be sorted into reading order. The descriptions below follow what
 * {@link TextChunkLocationDefaultImp} computes.
 */
public interface TextChunkLocation extends Comparable<TextChunkLocation> {

    /** @return the starting location of the chunk */
    Vector getStartLocation();

    /** @return the ending location of the chunk */
    Vector getEndLocation();

    /** @return the width of a single space character in the font of the chunk */
    float getCharSpaceWidth();

    /** @return the orientation of the chunk as a scalar, used for sorting */
    int orientationMagnitude();

    /**
     * @return the perpendicular distance to the orientation vector (the Y
     *         position in an unrotated coordinate system), as an integer
     */
    int distPerpendicular();

    /**
     * @return the distance of the start of the chunk parallel to the
     *         orientation vector (the X position in an unrotated coordinate
     *         system)
     */
    float distParallelStart();

    /**
     * @return the distance of the end of the chunk parallel to the
     *         orientation vector
     */
    float distParallelEnd();

    /**
     * @param as
     *            the location to compare to
     * @return true if this location is on the same line as the other
     */
    boolean sameLine(TextChunkLocation as);

    /**
     * @param other
     *            the location that precedes this one
     * @return the distance between the end of {@code other} and the start of
     *         this location
     */
    float distanceFromEndOf(TextChunkLocation other);

    /**
     * @param previous
     *            the location that precedes this one
     * @return true if a word boundary lies between {@code previous} and this
     *         location
     */
    boolean isAtWordBoundary(TextChunkLocation previous);
}


/**
 * Default {@link TextChunkLocation}: derives orientation, perpendicular
 * distance and parallel distances from the start and end points of a chunk.
 */
public static class TextChunkLocationDefaultImp implements TextChunkLocation {

    /** where the chunk begins */
    private final Vector startLocation;
    /** where the chunk ends */
    private final Vector endLocation;
    /** unit vector pointing from start to end */
    private final Vector orientationVector;
    /** the orientation as a scalar (angle in radians times 1000, cast to int) for quick sorting */
    private final int orientationMagnitude;
    /**
     * perpendicular distance to the orientation unit vector (i.e. the Y
     * position in an unrotated coordinate system); the value is cast to an
     * int, which truncates it, to cope with the fuzziness of comparing floats
     */
    private final int distPerpendicular;
    /**
     * distance of the start of the chunk along the orientation unit vector
     * (i.e. the X position in an unrotated coordinate system)
     */
    private final float distParallelStart;
    /**
     * distance of the end of the chunk along the orientation unit vector
     * (i.e. the X position in an unrotated coordinate system)
     */
    private final float distParallelEnd;
    /** the width of a single space character in the font of the chunk */
    private final float charSpaceWidth;

    /**
     * @param startLocation
     *            the starting location of the chunk
     * @param endLocation
     *            the ending location of the chunk
     * @param charSpaceWidth
     *            the width of a single space character in the chunk's font
     */
    public TextChunkLocationDefaultImp(Vector startLocation, Vector endLocation, float charSpaceWidth) {
        this.startLocation = startLocation;
        this.endLocation = endLocation;
        this.charSpaceWidth = charSpaceWidth;

        // A zero-length chunk has no direction of its own; treat it as
        // running along the positive X axis.
        Vector direction = endLocation.subtract(startLocation);
        if (direction.length() == 0) {
            direction = new Vector(1, 0, 0);
        }
        this.orientationVector = direction.normalize();

        double angle = Math.atan2(orientationVector.get(Vector.I2), orientationVector.get(Vector.I1));
        this.orientationMagnitude = (int) (angle * 1000);

        // Point-line distance, see
        // http://mathworld.wolfram.com/Point-LineDistance2-Dimensional.html
        // Both vectors being crossed lie in the same plane, so the result
        // points purely along the z axis and only its I3 component matters.
        Vector origin = new Vector(0, 0, 1);
        Vector fromOrigin = startLocation.subtract(origin);
        this.distPerpendicular = (int) fromOrigin.cross(orientationVector).get(Vector.I3);

        this.distParallelStart = orientationVector.dot(startLocation);
        this.distParallelEnd = orientationVector.dot(endLocation);
    }

    /** @return the orientation of the chunk as a scalar */
    @Override
    public int orientationMagnitude() {
        return this.orientationMagnitude;
    }

    /** @return the truncated perpendicular distance to the orientation vector */
    @Override
    public int distPerpendicular() {
        return this.distPerpendicular;
    }

    /** @return the distance of the chunk start along the orientation vector */
    @Override
    public float distParallelStart() {
        return this.distParallelStart;
    }

    /** @return the distance of the chunk end along the orientation vector */
    @Override
    public float distParallelEnd() {
        return this.distParallelEnd;
    }

    /**
     * @return the start location of the text
     */
    @Override
    public Vector getStartLocation() {
        return this.startLocation;
    }

    /**
     * @return the end location of the text
     */
    @Override
    public Vector getEndLocation() {
        return this.endLocation;
    }

    /**
     * @return the width of a single space character as rendered by this
     *         chunk
     */
    @Override
    public float getCharSpaceWidth() {
        return this.charSpaceWidth;
    }

    /**
     * @param as
     *            the location to compare to
     * @return true if both locations have the same orientation magnitude and
     *         the same perpendicular distance, i.e. are on the same line
     */
    @Override
    public boolean sameLine(TextChunkLocation as) {
        if (orientationMagnitude() != as.orientationMagnitude()) {
            return false;
        }
        return distPerpendicular() == as.distPerpendicular();
    }

    /**
     * Computes the distance between the end of 'other' and the beginning of
     * this chunk along the orientation vector. Calling this for chunks that
     * are not on the same line and orientation is a bad idea, but for
     * performance reasons that condition is not checked.
     *
     * @param other
     *            the location that precedes this one
     * @return this chunk's parallel start distance minus the parallel end
     *         distance of 'other'
     */
    @Override
    public float distanceFromEndOf(TextChunkLocation other) {
        return distParallelStart() - other.distParallelEnd();
    }

    /**
     * @param previous
     *            the location that precedes this one
     * @return true if the gap to {@code previous} is wider than half a space,
     *         or if this chunk starts more than one space width before the
     *         end of {@code previous}; always false when the space width is
     *         below 0.1
     */
    @Override
    public boolean isAtWordBoundary(TextChunkLocation previous) {
        /*
         * Special case seen in PDFs such as:
         * -.232 Tc [(
         * P)-226.2(r)-231.8(e)-230.8(f)-238(a)-238.9(c)-228.9(e)]TJ
         * The font's space width is 0.232 and is cancelled out by a character
         * spacing of -0.232, so the resulting charSpaceWidth arrives here as
         * 0. Every chunk would then count as a word boundary. A space width
         * equal (or close) to zero is therefore treated as "no space".
         */
        float spaceWidth = getCharSpaceWidth();
        if (spaceWidth < 0.1f) {
            return false;
        }

        float gap = distanceFromEndOf(previous);
        boolean overlapsPrevious = gap < -spaceWidth;
        return overlapsPrevious || gap > spaceWidth / 2.0f;
    }

    /**
     * Orders locations by orientation, then by perpendicular distance, then
     * by parallel start distance.
     */
    @Override
    public int compareTo(TextChunkLocation other) {
        if (this == other) {
            return 0; // not really needed, but just in case
        }

        int order = compareInts(orientationMagnitude(), other.orientationMagnitude());
        if (order == 0) {
            order = compareInts(distPerpendicular(), other.distPerpendicular());
        }
        if (order == 0) {
            order = Float.compare(distParallelStart(), other.distParallelStart());
        }
        return order;
    }
}


/**
 * A piece of extracted text paired with its {@link TextChunkLocation}
 * (orientation and position relative to the orientation vector).
 */
public static class TextChunk implements Comparable<TextChunk> {
    /** the characters of the chunk */
    private final String text;
    /** where the chunk is located */
    private final TextChunkLocation location;

    /**
     * Creates a chunk whose location is a {@link TextChunkLocationDefaultImp}.
     *
     * @param string
     *            the text of the chunk
     * @param startLocation
     *            the starting location of the chunk
     * @param endLocation
     *            the ending location of the chunk
     * @param charSpaceWidth
     *            the width of a single space character in the chunk's font
     */
    public TextChunk(String string, Vector startLocation, Vector endLocation, float charSpaceWidth) {
        this(string, new TextChunkLocationDefaultImp(startLocation, endLocation, charSpaceWidth));
    }

    /**
     * @param string
     *            the text of the chunk
     * @param loc
     *            the location of the chunk
     */
    public TextChunk(String string, TextChunkLocation loc) {
        text = string;
        location = loc;
    }

    /**
     * @return the text captured by this chunk
     */
    public String getText() {
        return this.text;
    }

    /**
     * @return the object holding the location data of this chunk
     */
    public TextChunkLocation getLocation() {
        return this.location;
    }

    /**
     * @return the start location of the text
     */
    public Vector getStartLocation() {
        return this.location.getStartLocation();
    }

    /**
     * @return the end location of the text
     */
    public Vector getEndLocation() {
        return this.location.getEndLocation();
    }

    /**
     * @return the width of a single space character as rendered by this
     *         chunk
     */
    public float getCharSpaceWidth() {
        return this.location.getCharSpaceWidth();
    }

    /**
     * Computes the distance between the end of 'other' and the beginning of
     * this chunk by delegating to the two chunks' locations. Calling this for
     * chunks that are not on the same line and orientation is a bad idea, but
     * for performance reasons that condition is not checked.
     *
     * @param other
     *            the other {@link TextChunk}
     * @return the distance reported by this chunk's location for the location
     *         of 'other'
     */
    public float distanceFromEndOf(TextChunk other) {
        TextChunkLocation theirs = other.location;
        return this.location.distanceFromEndOf(theirs);
    }

    /** Prints the text and the location figures of this chunk to standard output. */
    private void printDiagnostics() {
        StringBuilder header = new StringBuilder();
        header.append("Text (@").append(location.getStartLocation());
        header.append(" -> ").append(location.getEndLocation());
        header.append("): ").append(text);
        System.out.println(header.toString());
        System.out.println("orientationMagnitude: " + location.orientationMagnitude());
        System.out.println("distPerpendicular: " + location.distPerpendicular());
        System.out.println("distParallel: " + location.distParallelStart());
    }

    /**
     * Compares two chunks by comparing their locations.
     *
     * @param rhs
     *            the other chunk
     * @see java.lang.Comparable#compareTo(java.lang.Object)
     */
    @Override
    public int compareTo(TextChunk rhs) {
        TextChunkLocation theirs = rhs.location;
        return this.location.compareTo(theirs);
    }

    /**
     * @param lastChunk
     *            the chunk to compare to
     * @return true if the locations of both chunks are on the same line
     */
    private boolean sameLine(TextChunk lastChunk) {
        TextChunkLocation mine = getLocation();
        return mine.sameLine(lastChunk.getLocation());
    }
}


/**
 * Three-way comparison of two ints.
 *
 * @param int1
 *            the first value
 * @param int2
 *            the second value
 * @return -1 if {@code int1} is smaller, 1 if it is larger, 0 if both are
 *         equal
 */
private static int compareInts(int int1, int int2) {
    if (int1 < int2) {
        return -1;
    }
    if (int1 > int2) {
        return 1;
    }
    return 0;
}


/**
 * Ignores image events; this renderer only collects text.
 *
 * @param renderInfo
 *            the image render information (unused)
 * @see com.itextpdf.text.pdf.parser.RenderListener#renderImage(com.itextpdf.text.pdf.parser.ImageRenderInfo)
 * @since 5.0.1
 */
@Override
public void renderImage(ImageRenderInfo renderInfo) {
    // images are deliberately ignored
}


/**
* Specifies a filter for filtering {@link TextChunk} objects during text
* extraction

* @see LocationTextExtractionStrategy#getResultantText(TextChunkFilter)
* @since 5.3.3
*/
public static interface TextChunkFilter {
/**
* @param textChunk
*            the chunk to check
* @return true if the chunk should be allowed
*/
public boolean accept(TextChunk textChunk);
}
}

3. Usage example:

package operatePdf;


import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;


import org.apache.pdfbox.io.RandomAccessFile;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;


import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfTextExtractor;


/**
 * Small demo that prints the text of a PDF file, either with iText (using
 * {@link MyLocationTextExtractionStrategy}) or with PDFBox.
 */
public class OperatePdf {
    /** Location of the sample PDF read by {@link #main(String[])}. */
    static final String PATH = "I:/Test/test data/ICG Tier 1 Limit Report - 20170814 (3).pdf";

    /**
     * Runs the iText example on {@link #PATH}.
     *
     * @param args
     *            ignored
     */
    public static void main(String[] args) {
        pdfReadExample(PATH);
        // Swap in the line below to try the PDFBox variant instead.
        // pdfboxReadExample(PATH);
    }

    /**
     * Extracts the text of page 4 of the given PDF with iText and prints it
     * to standard output. I/O failures are reported on standard error.
     *
     * @param path
     *            the path of the PDF file to read
     */
    public static void pdfReadExample(String path) {
        PdfReader reader = null;
        try {
            reader = new PdfReader(path);

            // Default strategy, for comparison:
            // String textFromPage = PdfTextExtractor.getTextFromPage(reader, 2);
            String textFromPage = PdfTextExtractor.getTextFromPage(reader, 4, new MyLocationTextExtractionStrategy());

            System.out.println(textFromPage);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close in finally so the reader is released even when the
            // extraction fails part-way through.
            if (reader != null) {
                reader.close();
            }
        }
    }

    /**
     * Extracts the text of the whole PDF with PDFBox and prints it to
     * standard output. I/O failures are reported on standard error.
     *
     * @param path
     *            the path of the PDF file to read
     */
    public static void pdfboxReadExample(String path) {
        RandomAccessFile is = null;
        PDDocument document = null;
        try {
            is = new RandomAccessFile(new File(path), "r");
            PDFParser parser = new PDFParser(is);
            parser.parse();
            document = parser.getPDDocument();
            PDFTextStripper stripper = new PDFTextStripper();
            System.out.println(stripper.getText(document));
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close the document before the file it was parsed from.
            if (document != null) {
                try {
                    document.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
            // Still close the file explicitly: if parsing failed, no
            // document exists to take care of it.
            if (is != null) {
                try {
                    is.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值