提取doc文本只需要下面的依赖
<!-- Apache POI scratchpad module: the dependency needed for extracting text from .doc files -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>5.2.2</version>
</dependency>
提取docx文本还需要添加poi-ooxml依赖
<!-- Apache POI OOXML module: additionally required for extracting text from .docx files -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>5.2.2</version>
</dependency>
文档格式类型不能仅根据文件后缀判断，还需要结合文件的魔数（magic number）来确定。判断文档格式类型的示例如下：
if(FileMagic.valueOf(fis)==FileMagic.OLE2){
//doc格式 2003以前
} else if (FileMagic.valueOf(fis)==FileMagic.OOXML) {
//docx格式
}
提取doc文件内容
FileInputStream fis1 = new FileInputStream("D:\\temp\\c0f1c520595b8dde990efdbfbb8ca3eb.docx");
WordExtractor wordExtractor1 = new WordExtractor(fis1);
System.out.println(wordExtractor1.getText());
}
提取docx文件内容
FileInputStream fileInputStream= new FileInputStream("D:\\temp\\c0f1c520595b8dde990efdbfbb8ca3eb.docx");
XWPFWordExtractor docx = new XWPFWordExtractor(new XWPFDocument(fileInputStream));
String text = docx.getText();
System.out.println(text);