我正在使用PDFbox来提取PDF文档中的单词/字符串的坐标,并且迄今已经成功地确定了单个字符的位置.这是迄今为止的代码,从PDFbox doc:
package printtextlocations;
import java.io.*;
import org.apache.pdfbox.exceptions.InvalidPasswordException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.common.PDStream;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.pdfbox.util.TextPosition;
import java.io.IOException;
import java.util.List;
public class PrintTextLocations extends PDFTextStripper {
public PrintTextLocations() throws IOException {
super.setSortByPosition(true);
}
public static void main(String[] args) throws Exception {
PDDocument document = null;
try {
File input = new File("C:\\path\\to\\PDF.pdf");