1. 包引用
<!-- Apache PDFBox: supplies the org.apache.pdfbox classes (PDDocument, PDFTextStripper) imported by the Java code. -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.24</version>
</dependency>
2. Demo
package pdf;
import java.io.File;
import java.io.FileOutputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.Scanner;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
/**
 * Command-line utility that extracts the text of every file found in
 * {@code <base_path>pdfs} and writes it to a same-named {@code .txt} file in
 * {@code <base_path>txts}, using PDFBox's {@link PDFTextStripper}.
 *
 * <p>Processing is best-effort: a file that cannot be parsed or written is
 * reported and skipped, and the remaining files are still processed.
 */
public class Pdf2Txt {
    /**
     * Prefix prepended verbatim to the {@code pdfs} and {@code txts} directory
     * names. {@link #main(String[])} sets it to the user-supplied directory plus
     * a trailing {@code "/"}; while empty, paths are relative to the working
     * directory.
     */
    public static String base_path = "";

    /**
     * Prompts for the directory that contains the {@code pdfs} folder, stores it
     * in {@link #base_path} and converts every file found there.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // try-with-resources closes the scanner even on the early return below.
        try (Scanner sc = new Scanner(System.in)) {
            System.out.println("请输入pdf文件夹所在的目录:");
            // Guard against closed/empty stdin; nextLine() would otherwise throw
            // NoSuchElementException.
            if (!sc.hasNextLine()) {
                System.err.println("未读取到目录输入...");
                return;
            }
            base_path = sc.nextLine() + "/";
        }
        getFiles();
    }

    /**
     * Lists {@code <base_path>pdfs} and calls {@link #getTxt(String)} for each
     * regular file in it. Prints a hint and returns when the directory is
     * missing, unreadable or empty.
     */
    public static void getFiles() {
        File[] entries = new File(base_path + "pdfs").listFiles();
        if (entries == null || entries.length == 0) {
            System.out.print("未查找到pdf文件,请检查目录是否正确...");
            // Return instead of System.exit(0): nothing is left to do, and a
            // callee should not terminate the JVM on behalf of its caller.
            return;
        }
        for (File entry : entries) {
            // Sub-directories cannot be loaded as documents; skip them quietly.
            if (!entry.isFile()) {
                continue;
            }
            getTxt(entry.getName());
        }
    }

    /**
     * Loads {@code <base_path>pdfs/<name>}, extracts its text and hands it to
     * {@link #writedText(String, String)}. Encrypted documents are reported and
     * skipped. Any failure is reported on stderr and does not propagate.
     *
     * @param name file name (not a path) inside the {@code pdfs} directory
     */
    public static void getTxt(String name) {
        // The document is closed on every exit path, including the encrypted
        // early return, so file handles are not leaked across the batch.
        try (PDDocument doc = PDDocument.load(new File(base_path + "pdfs/" + name))) {
            if (doc.isEncrypted()) {
                System.out.println(name + "文档被加密,无法解析....");
                return;
            }
            PDFTextStripper stripper = new PDFTextStripper();
            writedText(stripper.getText(doc), name);
        } catch (Exception e) {
            // Broad catch on purpose: this is the per-file boundary, and one
            // malformed input must not abort the rest of the batch.
            System.err.println(name + " : 解析失败...");
            e.printStackTrace();
        }
    }

    /**
     * Writes {@code result} as UTF-8 to {@code <base_path>txts/<stem>.txt}, where
     * {@code <stem>} is {@code name} up to its last {@code '.'} (or all of
     * {@code name} when it contains no dot). The {@code txts} directory is
     * created if it does not exist; an existing target file is overwritten.
     * Failures are reported on stderr and do not propagate.
     *
     * @param result text to write
     * @param name   source file name used to derive the output file name
     */
    public static void writedText(String result, String name) {
        // lastIndexOf returns -1 for names without an extension; substring(0, -1)
        // would throw, so fall back to the full name in that case.
        int dot = name.lastIndexOf('.');
        String stem = dot < 0 ? name : name.substring(0, dot);

        File outDir = new File(base_path + "txts");
        // mkdirs() returns false when the directory already exists, hence the
        // second check before treating it as a failure.
        if (!outDir.mkdirs() && !outDir.isDirectory()) {
            System.err.println("无法创建输出目录: " + outDir.getPath());
            return;
        }

        File target = new File(base_path + "txts/" + stem + ".txt");
        // Explicit UTF-8 so the output does not depend on the platform default
        // charset; both streams are closed even if write() fails.
        try (OutputStream os = new FileOutputStream(target);
             OutputStreamWriter writer = new OutputStreamWriter(os, StandardCharsets.UTF_8)) {
            writer.write(result);
        } catch (Exception e) {
            System.err.println(name + " : 写入失败...");
            e.printStackTrace();
            return;
        }
        System.out.println(name + " : 提取完成...");
    }
}