// Open the PDF located at srcFilePath.
// NOTE(review): no pdf.close() is visible in this snippet — confirm the document is closed later
// (or wrap its use in try-with-resources), otherwise the underlying file handle leaks.
PDDocument pdf = PDDocument.load(new File(srcFilePath));
// The page tree is read here only to obtain the number of pages.
PDPageTree pageTree = pdf.getPages();
int count = pageTree.getCount();
// Allocate one String slot per page; the slots are not filled in the visible code.
returnList = new String[count];
// Text stripper that prefixes the first positioned chunk of every output line with
// "[x,y]", the direction-adjusted coordinates of that chunk's first TextPosition.
PDFTextStripper stripper = new PDFTextStripper() {
    /** True until the coordinate prefix for the current line has been written. */
    private boolean startOfLine = true;

    /** Re-arms the coordinate prefix at the start of every page. */
    @Override
    protected void startPage(PDPage page) throws IOException {
        startOfLine = true;
        super.startPage(page);
    }

    /** Re-arms the coordinate prefix after every line separator. */
    @Override
    protected void writeLineSeparator() throws IOException {
        startOfLine = true;
        super.writeLineSeparator();
    }

    /**
     * Writes one chunk of text, preceded by "[x,y]" when it is the first positioned
     * chunk of a line.
     *
     * @param text          the text to write
     * @param textPositions the positions backing {@code text}; may be null or empty
     * @throws IOException if writing to the output fails
     */
    @Override
    protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
        // A chunk without positions has no coordinate to report. Skip the prefix but keep
        // startOfLine set, so the next positioned chunk on this line still gets it.
        // (The original called get(0) unconditionally and would throw on an empty list.)
        if (startOfLine && textPositions != null && !textPositions.isEmpty()) {
            TextPosition firstPosition = textPositions.get(0);
            writeString(String.format("[%s,%s]", firstPosition.getXDirAdj(), firstPosition.getYDirAdj()));
            startOfLine = false;
        }
        super.writeString(text, textPositions);
    }
};
// Position-based sorting is explicitly turned off for this extraction.
stripper.setSortByPosition(false);
// Extract from page 1 through page `count` (the document's page count), i.e. every page.
stripper.setStartPage(1);
stripper.setEndPage(count);
// Run the extraction; `content` receives the text of the whole page range as a single String.
String content = stripper.getText(pdf);
使用 PDFBox 提取 PDF 中的文字
于 2022-06-21 12:32:16 首次发布