jar包:
<!-- Dependencies for reading doc/docx/pdf (and Excel, via EasyExcel) - start -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>easyexcel</artifactId>
<version>2.1.4</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>3.17</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>3.17</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.17</version>
</dependency>
<!-- NOTE(review): ooxml-schemas and poi-ooxml-schemas (above) look like overlapping schema jars; confirm both are really needed. -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>ooxml-schemas</artifactId>
<version>1.3</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.15</version>
</dependency>
<!-- Dependencies for reading doc/docx/pdf (and Excel, via EasyExcel) - end -->
工具类:
import com.alibaba.excel.EasyExcel;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import org.apache.pdfbox.io.RandomAccessBuffer;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
/**
 * Utility for extracting the text content of Word (.doc/.docx), PDF and
 * Excel (.xls/.xlsx) files.
 */
public class ReadFile {

    /**
     * Reads the content of {@code file}, choosing the parser from the file
     * extension (compared case-insensitively).
     *
     * <ul>
     *   <li>{@code .doc} / {@code .docx}: the extracted text split on {@code "\n"},
     *       one list element per line.</li>
     *   <li>{@code .pdf}: the extracted text with every {@code "\r"} removed, then
     *       split on {@code "\n"}.</li>
     *   <li>{@code .xls} / {@code .xlsx}: a single list element holding the rows of
     *       the sheet serialized as one JSON array string.</li>
     * </ul>
     *
     * <p>This method is best-effort: it never throws. A {@code null} file, a name
     * without an extension, or an unsupported extension yields an empty list; if
     * reading or parsing fails, the stack trace is printed and whatever has been
     * collected so far (normally nothing) is returned.
     *
     * @param file the file to read; may be {@code null}
     * @return a mutable list with the extracted content, never {@code null}
     */
    public static List<String> readDoc(File file) {
        List<String> list = new ArrayList<>();
        if (file == null) {
            return list;
        }

        String fileName = file.getName();
        int dotIndex = fileName.lastIndexOf('.');
        if (dotIndex < 0) {
            // No extension: nothing tells us how to parse the file.
            return list;
        }
        String extension = fileName.substring(dotIndex);

        try {
            if (".doc".equalsIgnoreCase(extension)) {
                try (InputStream in = new FileInputStream(file);
                        WordExtractor wordExtractor = new WordExtractor(in)) {
                    list.addAll(Arrays.asList(wordExtractor.getText().split("\n")));
                }
            } else if (".docx".equalsIgnoreCase(extension)) {
                // Only the extractor is registered as a resource, mirroring the
                // original cleanup, which closed the extractor but never the
                // XWPFDocument directly.
                try (InputStream in = new FileInputStream(file);
                        XWPFWordExtractor docxExtractor = new XWPFWordExtractor(new XWPFDocument(in))) {
                    list.addAll(Arrays.asList(docxExtractor.getText().split("\n")));
                }
            } else if (".pdf".equalsIgnoreCase(extension)) {
                // PDDocument.load parses the file and hands back a document that
                // try-with-resources closes even when text extraction fails.
                try (PDDocument pdfDocument = PDDocument.load(file)) {
                    PDFTextStripper stripper = new PDFTextStripper();
                    String text = stripper.getText(pdfDocument).replace("\r", "");
                    list.addAll(Arrays.asList(text.split("\n")));
                }
            } else if (".xlsx".equalsIgnoreCase(extension) || ".xls".equalsIgnoreCase(extension)) {
                // Exact extension match: a substring test against ".xlsx .xls"
                // would also accept ".", ".x" or ".xl".
                List<Map> excelList = EasyExcel.read(file.getPath()).head(Map.class).sheet().doReadSync();
                // Round-trip through JSON, as before, so the rows end up as plain maps.
                List<Map> excels = JSONArray.parseArray(JSONObject.toJSONString(excelList), Map.class);
                list.add(JSON.toJSONString(excels));
            }
        } catch (Exception e) {
            // Deliberate best-effort boundary: callers rely on getting a list
            // back instead of an exception, so report the failure and continue.
            e.printStackTrace();
        }
        return list;
    }
}