POI读取doc,docx文档内容
1.pom依赖:
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.1.2</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.2</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>4.1.2</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.1.2</version>
</dependency>
2.由于人工创建的文档各式各样,有可能doc文件被直接改名为docx,或docx文件被直接改名为doc(仅修改了后缀,并未真正转换格式)。虽然文件后缀变了,但是其原有的格式并未改变,所以解析时需要做兼容处理:先按doc格式解析,失败后再回退按docx格式解析。见代码:
@Slf4j
public class FileUtils {
public static String getDocxWord(MultipartFile multipartFile) {
String docWord = null;
try {
XWPFDocument read = read(multipartFile);
if (read==null){
return null;
}
XWPFWordExtractor docx = new XWPFWordExtractor(read);
docWord = docx.getText().trim();
log.info("解析doc文档成功:{}", docWord);
docx.close();
} catch (NullPointerException e) {
return null;
}catch (IOException e){
e.printStackTrace();
}
return docWord;
}
public static String getDocWord( MultipartFile multipartFile) {
String docxWord = "";
WordExtractor wordExtractor = null;
InputStream is =null;
try {
is = multipartFile.getInputStream();
try {
wordExtractor = new WordExtractor(is);
docxWord = wordExtractor.getText().trim();
log.info("解析docx文档成功:{}", docxWord);
} catch (IllegalArgumentException e) {
docxWord=getDocxWord(multipartFile);
}finally {
if (wordExtractor !=null){
wordExtractor.close();
}
if (is!=null){
is.close();
}
}
} catch (IOException e) {
log.error("解析docx文档失败", e);
}
return docxWord;
}
public static XWPFDocument read(MultipartFile multipartFile) {
InputStream is = null;
if (!multipartFile.isEmpty()) {
try {
is = multipartFile.getInputStream();
XWPFDocument document = null;
try {
document = new XWPFDocument(is);
} catch (NotOfficeXmlFileException e) {
document=null;
}
return document;
} catch (Exception e) {
e.printStackTrace();
}
}
return null;
}
public static void main(String[] args) throws Exception {
File file = new File("C:\\Users\\dcw\\Desktop\\2.doc");
MultipartFile multipartFile = fileToMultipartFile(file);
System.out.println(getDocWord(multipartFile));
}
}