通过 Maven 引入 PDFBox 的 jar 包;其中 fontbox 是 PDFBox 使用的字体处理库,解析中文等字体时需要用到。
pom.xml
<!-- Apache PDFBox core library: PDF loading and text extraction. -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.4</version>
</dependency>
<!-- Apache FontBox: the font-handling library used by PDFBox. -->
<!-- NOTE(review): this version (2.0.8) differs from the pdfbox version (2.0.4) declared above; -->
<!-- the two artifacts are normally released and used at the same version. Confirm and align them. -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>fontbox</artifactId>
<version>2.0.8</version>
</dependency>
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.TextPosition;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
/**
 * Prints the body text of every page of the sample PDF at {@code D:/test.pdf}.
 *
 * <p>Kept for backward compatibility; delegates to {@link #getTextFromPdf(File)}.
 *
 * @throws IOException if the PDF cannot be loaded or parsed
 */
public static void getTextFromPdf() throws IOException {
    getTextFromPdf(new File("D:/test.pdf"));
}

/**
 * Prints the body text of every page of the given PDF to standard output.
 *
 * <p>Each page is extracted on its own. The characters are read from the second
 * per-article list returned by the stripper, which the original author uses as the
 * page body (index 0 being the header and index 2 the page-number area). When a page
 * yields fewer than two lists, the first list is used instead of failing with an
 * {@link IndexOutOfBoundsException}; a page that yields no list at all is skipped.
 *
 * @param pdfFile the PDF file to read
 * @throws IOException if the PDF cannot be loaded or parsed
 */
public static void getTextFromPdf(File pdfFile) throws IOException {
    // try-with-resources guarantees the document is closed even if extraction fails.
    try (PDDocument document = PDDocument.load(pdfFile)) {
        int pages = document.getNumberOfPages();
        System.out.println("总页:" + pages);

        PDFTextStripper2 stripper2 = new PDFTextStripper2();
        for (int i = 1; i <= pages; i++) {
            // Restrict the stripper to a single page so the per-article lists
            // describe only that page.
            stripper2.setStartPage(i);
            stripper2.setEndPage(i);
            // The returned text is not needed; the call populates the character lists.
            stripper2.getText(document);

            List<List<TextPosition>> articles = stripper2.getCharactersByArticle();
            if (articles.isEmpty()) {
                continue;
            }
            // NOTE(review): presumably a page without article beads yields a single list
            // holding all of its text; confirm against the PDFBox version in use.
            List<TextPosition> body = articles.size() > 1 ? articles.get(1) : articles.get(0);
            printBody(body);
        }
    }
}

/**
 * Prints the given characters line by line, stopping at the first table caption.
 *
 * <p>Characters sharing the same Y coordinate are buffered as one line. A blank line
 * is emitted when the vertical jump to the next line is larger than 11 or smaller
 * than -10. Extraction stops as soon as a line starts with {@code "表格 "} or
 * {@code "Table "}, and that caption marker itself is not printed.
 */
private static void printBody(List<TextPosition> glyphs) {
    float y = 0;
    StringBuilder sentence = new StringBuilder();
    for (TextPosition tp : glyphs) {
        String c = tp.getUnicode();
        // A change in Y means a new line started: flush what was buffered so far.
        if (y != tp.getY()) {
            System.out.print(sentence.toString());
            int lineGap = (int) (tp.getY() - y); // vertical distance to the previous line
            if (lineGap > 11 || lineGap < -10) {
                System.out.println();
            }
            y = tp.getY();
            sentence.setLength(0);
        }
        sentence.append(c);
        // Keep a space after bullet characters so list items stay readable.
        if ("•".equals(c)) {
            sentence.append(" ");
        }
        // Stop at a table caption and drop the buffered marker, so the trailing
        // flush below does not print a stray "表格 " / "Table ".
        String line = sentence.toString();
        if (line.startsWith("表格 ") || line.startsWith("Table ")) {
            sentence.setLength(0);
            break;
        }
    }
    if (sentence.length() > 0) {
        System.out.print(sentence.toString());
    }
}
PDFTextStripper2.java
/**
 * Text stripper subclass whose only purpose is to let callers read the
 * {@link TextPosition} data collected during extraction.
 *
 * <p>It re-declares {@link #getCharactersByArticle()} as {@code public}
 * (presumably the superclass method is not publicly accessible; confirm against
 * the PDFBox version in use) and otherwise behaves exactly like its parent.
 */
public class PDFTextStripper2 extends PDFTextStripper {

    /**
     * Creates a stripper with the superclass defaults.
     *
     * @throws IOException if the superclass constructor fails
     */
    public PDFTextStripper2() throws IOException {
        super();
    }

    /**
     * Returns the per-article character lists gathered by the superclass.
     *
     * @return the lists exactly as returned by the superclass implementation
     */
    @Override
    public List<List<TextPosition>> getCharactersByArticle() {
        final List<List<TextPosition>> charactersByArticle = super.getCharactersByArticle();
        return charactersByArticle;
    }
}