需要先在 pom.xml 中引入对应的依赖(jar 包)。注意:下面的 poi 依赖没有写 <version>,需要由父 POM 或 dependencyManagement 统一管理版本,否则请自行补充版本号。
<!-- 获取pdf文件内容 -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.26</version>
</dependency>
<!-- 读取 docx(Word)文件内容;poi-ooxml 同时也支持 excel(xlsx) -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
</dependency>
import org.apache.pdfbox.io.RandomAccessBufferedFileInputStream;
import org.apache.pdfbox.io.RandomAccessRead;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import java.io.*;
/**
 * Small demo that extracts the plain text of a .docx file (Apache POI)
 * and of a .pdf file (Apache PDFBox 2.x) and prints both.
 */
public class Test {

    /**
     * Reads two hard-coded sample files and prints their extracted text
     * to standard output.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        File docx = new File("F:/Java/docx.docx");
        File pdf = new File("F:/Java/pdf.pdf");
        String docxContent = readDocxContent(docx);
        String pdfContent = readPdfContent(pdf);
        System.out.println("docx=" + docxContent);
        System.out.println("pdf=" + pdfContent);
    }

    /**
     * Reads the text content of a .docx file.
     *
     * <p>Failures are reported on standard error and are not propagated:
     * the method returns an empty string instead.
     *
     * @param file the .docx file to read
     * @return the text returned by the extractor, or {@code ""} if the file
     *         could not be opened or parsed
     */
    public static String readDocxContent(File file) {
        // try-with-resources closes the extractor first and then the file
        // stream, on success and on failure alike. The document is created
        // inline so that it is released only through the extractor's close(),
        // exactly as the explicit extractor.close() call did before.
        try (FileInputStream inputStream = new FileInputStream(file);
             XWPFWordExtractor extractor =
                     new XWPFWordExtractor(new XWPFDocument(inputStream))) {
            return extractor.getText();
        } catch (IOException | RuntimeException e) {
            // Best-effort contract kept from the original: report and return "".
            // RuntimeException is included because a malformed or non-OOXML
            // file may be rejected with an unchecked exception.
            System.err.println("Failed to read docx file: " + file);
            e.printStackTrace();
            return "";
        }
    }

    /**
     * Reads the text content of a .pdf file.
     *
     * <p>Failures are reported on standard error and are not propagated:
     * the method returns an empty string instead.
     *
     * @param file the .pdf file to read
     * @return the text returned by the stripper, or {@code ""} if the file
     *         could not be opened or parsed
     */
    public static String readPdfContent(File file) {
        // The document is loaded from the File itself instead of from a
        // manually opened FileInputStream, so the only resource this method
        // has to release is the PDDocument, which try-with-resources closes
        // on every path.
        try (PDDocument pdDocument = PDDocument.load(file)) {
            PDFTextStripper stripper = new PDFTextStripper();
            return stripper.getText(pdDocument);
        } catch (IOException | RuntimeException e) {
            // Best-effort contract kept from the original: report and return "".
            System.err.println("Failed to read pdf file: " + file);
            e.printStackTrace();
            return "";
        }
    }
}