Java读取PDF文件中的文本(字符串)
方法:PDFBox
1、加入依赖
<!-- https://mvnrepository.com/artifact/pdfbox/pdfbox -->
<dependency>
<groupId>pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>0.7.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/FontBox/FontBox -->
<dependency>
<groupId>FontBox</groupId>
<artifactId>FontBox</artifactId>
<version>0.1.0-dev</version>
</dependency>
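2、测试代码(需导入 java.io.FileInputStream、java.io.FileNotFoundException、java.io.IOException,以及 org.pdfbox.pdfparser.PDFParser、org.pdfbox.pdmodel.PDDocument、org.pdfbox.util.PDFTextStripper)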
/**
 * Reads a PDF file and returns the text that {@code PDFTextStripper} extracts from it.
 *
 * <p>The file is opened as a {@code FileInputStream}, parsed with {@code PDFParser},
 * and the resulting {@code PDDocument} is handed to a {@code PDFTextStripper}.
 * Both the stream and the document are closed before the method returns.
 *
 * @param pdfFilePath path of the PDF file to read
 * @return the extracted text, or {@code null} if an {@code IOException}
 *         (including {@code FileNotFoundException}) occurred; in that case the
 *         stack trace is printed and no exception is propagated
 */
public static String getTextFromPDF(String pdfFilePath)
{
    FileInputStream input = null;
    PDDocument pdf = null;
    String text = null;
    try {
        input = new FileInputStream(pdfFilePath);
        PDFParser pdfParser = new PDFParser(input);
        pdfParser.parse();
        pdf = pdfParser.getPDDocument();
        text = new PDFTextStripper().getText(pdf);
    } catch (IOException e) {
        // FileNotFoundException is a subclass of IOException, so this single
        // handler covers a missing file as well as read/parse failures.
        e.printStackTrace();
    } finally {
        // Close the stream first, then the document; a failure while closing
        // one is reported and does not prevent closing the other.
        if (input != null) {
            try {
                input.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        if (pdf != null) {
            try {
                pdf.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return text;
}
/**
 * Demo entry point: extracts the text of a sample PDF and prints it to standard output.
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args)
{
    // Prints "null" when getTextFromPDF could not read the file.
    System.out.println(getTextFromPDF("D:\\XXX.pdf"));
}