<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.1.2</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.1.2</version>
</dependency>
<dependency>
<groupId>cn.hutool</groupId>
<artifactId>hutool-all</artifactId>
<version>5.8.16</version>
</dependency>
import cn.hutool.core.io.FileUtil;
import cn.hutool.core.util.ObjectUtil;
import cn.hutool.http.HttpUtil;
import lombok.extern.slf4j.Slf4j;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
@Slf4j
public class Word2ListUtil {
public static List<String> word2List(String filePath) throws IOException {
XWPFWordExtractor extractor = null;
FileInputStream fis = null;
InputStream inputStream = null;
File file = null;
try {
inputStream = HttpUtil.createGet(filePath).execute().bodyStream();
File tempFile = File.createTempFile("tmp", ".doc");
file = FileUtil.writeFromStream(inputStream, tempFile);
log.info("临时文件所在路径: {}", file);
fis = new FileInputStream(file);
XWPFDocument document = new XWPFDocument(fis);
extractor = new XWPFWordExtractor(document);
String text = extractor.getText().replaceAll(":", ":")
.replaceAll("。", ".")
.replaceAll(" ", "")
.replaceAll(";", ";")
.replaceAll(",", ",");
String[] textArray = text.split("\n");
return Arrays.stream(textArray).filter(str -> !str.trim().isEmpty()).collect(Collectors.toList());
} catch (Exception e) {
log.error("word文档提取文字集合失败异常! {}", e.getMessage());
e.printStackTrace();
} finally {
if (ObjectUtil.isNotNull(extractor)) {
extractor.close();
}
if (ObjectUtil.isNotNull(fis)) {
fis.close();
}
if (ObjectUtil.isNotNull(inputStream)) {
inputStream.close();
}
if (ObjectUtil.isNotNull(file)) {
boolean deleteFlag = file.delete();
log.info("临时文件删除状态: {}", deleteFlag);
}
}
return null;
}
}