1、引入maven依赖
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>fontbox</artifactId>
<version>2.0.17</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.17</version>
</dependency>
2、代码:
package com.example.test.pdf.pdfbox;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;import lombok.extern.slf4j.Slf4j;
/**
 * Demo utility that extracts the text of a PDF with PDFBox and prefixes each
 * output line with the coordinates of the first text position on that line.
 */
@Slf4j
public class PdfText {

    /**
     * Extracts the text of a hard-coded sample PDF and prints it to standard output.
     *
     * @param args command-line arguments (unused)
     * @throws FileNotFoundException if the sample PDF cannot be opened
     */
    public static void main(String[] args) throws FileNotFoundException {
        InputStream fileStream = new FileInputStream("C:\\Users\\Administrator\\Desktop\\temp\\PO-0006235643 1-12.pdf");
        try {
            String content = getTextFromPdf(fileStream, true);
            System.out.println(content);
        } finally {
            // getTextFromPdf leaves the caller's stream open, so release it here.
            try {
                fileStream.close();
            } catch (IOException e) {
                log.warn("Failed to close PDF input stream", e);
            }
        }
    }

    /**
     * Extracts the text of every page of a PDF.
     *
     * <p>Each line of the result starts with {@code [x,y]}, where x and y are the
     * {@code getXDirAdj()} / {@code getYDirAdj()} values of the first text position
     * written on that line.
     *
     * @param fileStream stream holding the PDF content; it is not closed by this method
     * @param sort value passed to {@code PDFTextStripper.setSortByPosition}
     * @return the extracted text, or {@code null} if loading or extraction failed
     *         (the failure is logged, not rethrown)
     */
    public static String getTextFromPdf(InputStream fileStream, boolean sort) {
        int startPage = 1;
        String content = null;
        // try-with-resources guarantees the loaded document is closed on every path.
        try (PDDocument document = PDDocument.load(fileStream)) {
            int endPage = document.getNumberOfPages();
            PDFTextStripper stripper = new PDFTextStripper() {
                /** True while the next string written is the first one on its line. */
                private boolean startOfLine = true;

                @Override
                protected void startPage(PDPage page) throws IOException {
                    startOfLine = true;
                    super.startPage(page);
                }

                @Override
                protected void writeLineSeparator() throws IOException {
                    startOfLine = true;
                    super.writeLineSeparator();
                }

                @Override
                protected void writeString(String text, List<TextPosition> textPositions) throws IOException {
                    // Without any text position there are no coordinates to print; the
                    // flag stays set so the prefix goes to the next string that has them.
                    if (startOfLine && textPositions != null && !textPositions.isEmpty()) {
                        TextPosition firstPosition = textPositions.get(0);
                        writeString(String.format("[%s,%s]", firstPosition.getXDirAdj(), firstPosition.getYDirAdj()));
                        startOfLine = false;
                    }
                    super.writeString(text, textPositions);
                }
            };
            stripper.setSortByPosition(sort);
            stripper.setStartPage(startPage);
            stripper.setEndPage(endPage);
            content = stripper.getText(document);
            log.info(content);
        } catch (Exception e) {
            // Best-effort contract: log the failure and return whatever was extracted (or null).
            log.error(e.getMessage(), e);
        }
        return content;
    }
}
3、输出内容(每行开头的 [x,y] 为该行第一个文本位置的坐标):
[284.7,44.700012]测试内容
[290.7,53.099976]7 Times Square
[289.55,61.5]文字内容
[271.75,69.900024]文字内容文字内容
[455.05,81.650024]文字内容文字内容
[455.05,93.0]文字内容文字内容