1、所需依赖jar包
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.1.2</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.itextpdf/itext-core -->
<dependency>
<groupId>com.itextpdf</groupId>
<artifactId>itext-core</artifactId>
<version>8.0.4</version>
<type>pom</type>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.1.2</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.2</version>
</dependency>
2、定义解析文件类型枚举类
/**
 * File types recognised by the content parser.
 *
 * <p>Each constant carries the file extension it is matched against (without the
 * leading dot) and a numeric key.
 */
public enum FileType {
    DOCX("docx", 1),
    DOC("doc", 2),
    PDF("pdf", 3),
    TXT("txt", 4),
    XLS("xls", 5),
    XLSX("xlsx", 6),
    PPT("ppt", 7),
    PPTX("pptx", 8),
    ;

    /** File extension, without the dot, that identifies this type. */
    private final String desc;

    /** Numeric key associated with this type. */
    private final int key;

    FileType(String d, int k) {
        desc = d;
        key = k;
    }

    /**
     * @return the file extension (without the dot) for this type
     */
    public String getDesc() {
        return desc;
    }

    /**
     * @return the numeric key for this type
     */
    public int getKey() {
        return key;
    }
}
3、解析文件
/**
 * Reads the sample file {@code D:\test.txt} and returns its extracted text.
 *
 * @return the text produced by {@code asyncExecutorService.getFileInfoContent}
 * @throws IOException if the file cannot be opened or parsed
 */
public String getFileContent() throws IOException {
    File file = new File("D:\\test.txt");
    String ext = "txt";
    // try-with-resources guarantees the file handle is released even when
    // parsing throws; closing an already-closed FileInputStream is a no-op.
    try (InputStream inputStream = new FileInputStream(file)) {
        return asyncExecutorService.getFileInfoContent(inputStream, ext);
    }
}
/**
 * Extracts the text content of a small file.
 *
 * <p>The parser is selected by comparing {@code ext} (case-sensitively) with the
 * extensions declared in {@link FileType}. An unrecognised or {@code null}
 * extension yields an empty string. The supplied stream is closed before this
 * method returns, whether parsing succeeds or fails.
 *
 * @param inputStream stream over the file's bytes; closed by this method
 * @param ext file extension without the dot, e.g. {@code "docx"}
 * @return the extracted text, or an empty string for an unsupported extension
 * @throws IOException if the stream cannot be read or parsed
 */
@Override
public String getFileInfoContent(InputStream inputStream, String ext) throws IOException {
    String result = "";
    // Owning the stream in try-with-resources closes it on every exit path.
    try (InputStream in = inputStream) {
        // Constant-first equals() keeps a null extension from throwing.
        if (FileType.DOCX.getDesc().equals(ext)) {
            result = handlerDocx(in);
        } else if (FileType.DOC.getDesc().equals(ext)) {
            try (HWPFDocument document = new HWPFDocument(in)) {
                result = document.getText().toString();
            }
        } else if (FileType.TXT.getDesc().equals(ext)) {
            // NOTE(review): decodes with the platform default charset, as before —
            // confirm whether a fixed charset (e.g. UTF-8) is wanted.
            BufferedReader reader = new BufferedReader(new InputStreamReader(in));
            StringBuilder stringBuilder = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                stringBuilder.append(line.trim()).append('\n');
            }
            result = stringBuilder.toString();
        } else if (FileType.XLS.getDesc().equals(ext)) {
            try (Workbook workbook = new HSSFWorkbook(in)) {
                result = handlerExcel(workbook);
            }
        } else if (FileType.XLSX.getDesc().equals(ext)) {
            try (Workbook workbook = new XSSFWorkbook(in)) {
                result = handlerExcel(workbook);
            }
        } else if (FileType.PDF.getDesc().equals(ext)) {
            try (PdfReader pdfReader = new PdfReader(in);
                 PdfDocument pdfDocument = new PdfDocument(pdfReader)) {
                // Extract every page (page numbers are 1-based), not just the first,
                // separating consecutive pages with a newline.
                StringBuilder pdfText = new StringBuilder();
                int pageCount = pdfDocument.getNumberOfPages();
                for (int pageNo = 1; pageNo <= pageCount; pageNo++) {
                    if (pageNo > 1) {
                        pdfText.append('\n');
                    }
                    pdfText.append(PdfTextExtractor.getTextFromPage(pdfDocument.getPage(pageNo)));
                }
                result = pdfText.toString();
            }
        } else if (FileType.PPT.getDesc().equals(ext)) {
            HSLFSlideShow hslfSlideShow = new HSLFSlideShow(in);
            // Same close() call as before, now also executed when extraction throws.
            try (SlideShowExtractor slideShowExtractor = new SlideShowExtractor(hslfSlideShow)) {
                StringBuilder content = new StringBuilder();
                List<HSLFSlide> slides = hslfSlideShow.getSlides();
                for (HSLFSlide slide : slides) {
                    content.append(slideShowExtractor.getText(slide));
                }
                result = content.toString();
            }
        } else if (FileType.PPTX.getDesc().equals(ext)) {
            result = handlerPPTX(in);
        }
    }
    return result;
}
/**
 * Extracts the text of a DOCX document.
 *
 * <p>Body elements are visited in document order. Paragraphs contribute the
 * text produced by {@code getText(XWPFParagraph)} and tables contribute
 * {@code XWPFTable.getText()}; each is followed by a newline. Other element
 * kinds are skipped.
 *
 * @param inputStream stream over the DOCX bytes
 * @return the concatenated paragraph and table text
 * @throws IOException if the document cannot be read
 */
private String handlerDocx(InputStream inputStream) throws IOException {
    // The document holds an open package; release it once the text is collected.
    try (XWPFDocument doc = new XWPFDocument(inputStream)) {
        StringBuilder result = new StringBuilder();
        for (IBodyElement element : doc.getBodyElements()) {
            if (element instanceof XWPFParagraph) {
                // Plain paragraph text
                result.append(getText((XWPFParagraph) element)).append('\n');
            } else if (element instanceof XWPFTable) {
                result.append(((XWPFTable) element).getText()).append('\n');
            }
        }
        return result.toString();
    }
}
/**
 * Joins the text of every run in a paragraph, trimming each run's
 * leading and trailing whitespace before it is appended.
 *
 * @param paragraph the paragraph whose runs are read
 * @return the concatenated, per-run-trimmed text
 */
private static String getText(XWPFParagraph paragraph) {
    StringBuilder joined = new StringBuilder();
    // Walk the paragraph's runs in order and collect their trimmed text.
    for (XWPFRun piece : paragraph.getRuns()) {
        String trimmed = piece.text().trim();
        joined.append(trimmed);
    }
    return joined.toString();
}
/**
 * Extracts the content of the first sheet of a workbook as tab-separated text.
 *
 * <p>Every cell the row iterator yields is followed by a tab and every row by
 * a newline. String, numeric and boolean cells contribute their value; date
 * formatted numeric cells are rendered as {@code yyyy-MM-dd}. Cells of any
 * other type contribute an empty value, so the tab is still emitted and the
 * following cells keep their position.
 *
 * @param workbook the workbook to read; only the sheet at index 0 is processed
 * @return the sheet content, or an empty string if the workbook has no sheets
 */
private String handlerExcel(Workbook workbook) {
    StringBuilder result = new StringBuilder();
    if (workbook.getNumberOfSheets() == 0) {
        return result.toString();
    }
    Sheet sheet = workbook.getSheetAt(0);
    // Formatters are loop-invariant: create them once rather than once per cell.
    SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
    DataFormatter dataFormatter = new DataFormatter();
    for (Row row : sheet) {
        for (Cell cell : row) {
            String text;
            switch (cell.getCellType()) {
                case STRING:
                    text = cell.getStringCellValue();
                    break;
                case NUMERIC:
                    if (DateUtil.isCellDateFormatted(cell)) {
                        text = dateFormat.format(cell.getDateCellValue());
                    } else {
                        text = dataFormatter.formatCellValue(cell);
                    }
                    break;
                case BOOLEAN:
                    text = String.valueOf(cell.getBooleanCellValue());
                    break;
                default:
                    // Previously the tab for these cells went to stdout instead of
                    // the result; emit an empty value so the column separator is kept.
                    text = "";
            }
            result.append(text).append('\t');
        }
        result.append('\n');
    }
    return result.toString();
}
/**
 * Extracts the text of a PPTX presentation.
 *
 * <p>For each slide, the shapes listed directly in the slide's shape tree are
 * visited; every regular text run found in a shape's text body is appended,
 * followed by a newline. Shapes without a text body are skipped.
 *
 * <p>Extraction is best-effort: if parsing fails, the stack trace is printed
 * and whatever text was collected up to that point is returned.
 *
 * @param inputStream stream over the PPTX bytes
 * @return the collected text; possibly partial or empty if parsing failed
 */
private String handlerPPTX(InputStream inputStream) {
    StringBuilder content = new StringBuilder();
    // try-with-resources closes the slide show even when parsing throws part-way.
    try (XMLSlideShow xmlSlideShow = new XMLSlideShow(inputStream)) {
        // Iterate over the slides
        for (XSLFSlide slide : xmlSlideShow.getSlides()) {
            CTSlide rawSlide = slide.getXmlObject();
            CTGroupShape spTree = rawSlide.getCSld().getSpTree();
            for (CTShape shape : spTree.getSpList()) {
                CTTextBody txBody = shape.getTxBody();
                if (txBody == null) {
                    continue;
                }
                for (CTTextParagraph textParagraph : txBody.getPList()) {
                    for (CTRegularTextRun textRun : textParagraph.getRList()) {
                        content.append(textRun.getT()).append('\n');
                    }
                }
            }
        }
    } catch (Exception e) {
        // Deliberately non-fatal: report the failure and return what was read.
        e.printStackTrace();
    }
    return content.toString();
}