Java 后台系统中常常会遇到读取文档内容的需求。今天把 Java 读取两种格式(.doc 和 .docx)Word 文档的方法写成了一个简单的工具类,附在下面:
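1. 需要添加 Apache POI 的依赖(poi-scratchpad 用于读取 .doc;读取 .docx 用到的 XWPFWordExtractor、POIXMLDocument 等类位于 poi-ooxml 中,还需另外添加同版本的 poi-ooxml 依赖)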
<!-- apache poi-->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>3.8</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.6</version>
</dependency>
2. 工具类代码
package com.lq.file.word;
/**
* <p>Description:POIUtil 工具类</p>
* <p>Copyright: Copyright (c)2019</p>
* <p>Company: Tope</p>
* <P>@version 1.0</P>
*/
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
/**
 * Utility for reading the plain text of Word documents with Apache POI.
 *
 * <p>Supports the binary {@code .doc} format (via {@link WordExtractor}) and the
 * OOXML {@code .docx} format (via {@link XWPFWordExtractor}).</p>
 */
public class POIUtil {

    /** Matches both Windows ({@code \r\n}) and Unix ({@code \n}) line endings. */
    private static final String LINE_BREAK_REGEX = "\\r?\\n";

    /**
     * Reads a Word document and returns its text as a list of trimmed lines.
     *
     * <p>The format is chosen from the file extension, compared case-insensitively:
     * {@code .doc} or {@code .docx}.</p>
     *
     * @param filePath path of the Word file to read; may be {@code null}
     * @return the trimmed lines of the document (an empty list when the document has
     *         no text), or {@code null} when {@code filePath} is {@code null}, the
     *         extension is not supported, or reading the file fails
     * @throws Exception declared for backward compatibility; failures raised while
     *         reading are caught, reported on the console, and turned into a
     *         {@code null} result
     */
    public static List<String> readWord(String filePath) throws Exception {
        // Explicit guard instead of relying on a caught NullPointerException.
        if (filePath == null) {
            return null;
        }
        try {
            String text;
            if (hasExtension(filePath, ".doc")) {
                text = extractDocText(filePath);
            } else if (hasExtension(filePath, ".docx")) {
                text = extractDocxText(filePath);
            } else {
                return null;
            }
            return splitLines(text);
        } catch (Exception e) {
            System.out.print("error---->"+filePath);
            e.printStackTrace();
            return null;
        }
    }

    /** Returns whether {@code path} ends with {@code extension}, ignoring case. */
    private static boolean hasExtension(String path, String extension) {
        int offset = path.length() - extension.length();
        // regionMatches returns false for a negative offset, i.e. a path shorter than the extension.
        return path.regionMatches(true, offset, extension, 0, extension.length());
    }

    /** Extracts the text of a binary {@code .doc} file, always releasing the stream and extractor. */
    private static String extractDocText(String filePath) throws Exception {
        InputStream is = new FileInputStream(new File(filePath));
        try {
            WordExtractor extractor = new WordExtractor(is);
            try {
                return extractor.getText();
            } finally {
                extractor.close();
            }
        } finally {
            // Closed here explicitly so the file handle is released even when extraction fails.
            is.close();
        }
    }

    /** Extracts the text of an OOXML {@code .docx} file, always releasing the opened package. */
    private static String extractDocxText(String filePath) throws Exception {
        OPCPackage opcPackage = POIXMLDocument.openPackage(filePath);
        POIXMLTextExtractor extractor = null;
        try {
            extractor = new XWPFWordExtractor(opcPackage);
            return extractor.getText();
        } finally {
            if (extractor != null) {
                extractor.close();
            } else {
                // The extractor was never created, so release the package directly;
                // revert() closes it without saving anything back to the file.
                opcPackage.revert();
            }
        }
    }

    /** Splits {@code text} on line breaks and trims every resulting line. */
    private static List<String> splitLines(String text) {
        List<String> lines = new ArrayList<String>();
        if (text != null && text.length() > 0) {
            for (String line : text.split(LINE_BREAK_REGEX)) {
                lines.add(line.trim());
            }
        }
        return lines;
    }
}