<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>1.8.13</version>
</dependency>
java代码:
/**
*
* @Title: getTextFromPdf
* @Description: 读取pdf文件内容
* @param filePath
* @return: 读出的pdf的内容
*/
public static String getTextFromPdf(String filePath) {
String result = null;
FileInputStream is = null;
PDDocument document = null;
try {
is = new FileInputStream(filePath);
PDFParser parser = new PDFParser(is);
parser.parse();
document = parser.getPDDocument();
PDFTextStripper stripper = new PDFTextStripper();
result = stripper.getText(document);
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
} finally {
if (is != null) {
try {
is.close();
} catch (IOException e) {
e.printStackTrace();
}
}
if (document != null) {
try {
document.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
return result; //返回pdf所有文字内容
}
excel,word解析见java解析excel,word,pdf