使用 Java POI 解析带图片的 Word 文档
添加依赖(下面的示例代码还用到了 StringUtils(例如 commons-lang3)和 Spring 的 MockMultipartFile(spring-test),如项目中尚未引入需另行添加)
<!-- word文档解析依赖 -->
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.11.0</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>5.2.3</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.3</version>
</dependency>
待解析的 Word 文档(一、二级标题需要应用 Word 的标题样式,下面的代码按段落样式 ID "1"、"2" 来识别标题)
直接上代码
/**
 * Parses a .docx document with Apache POI, grouping body paragraphs under
 * their level-1 and level-2 headings and uploading embedded pictures through
 * {@code FileService}.
 *
 * <p>Headings are recognised by paragraph style ID ({@code "1"} / {@code "2"}).
 * NOTE(review): style IDs depend on the template/locale the document was
 * authored with (e.g. {@code "Heading1"} in some templates) - confirm against
 * the documents actually being parsed.
 */
public class WordParseServiceImpl {

    @Autowired
    private FileService fileService;

    /**
     * Parses the Word document and prints its structure to {@code System.err}:
     * each level-1 heading, each level-2 heading below it, and for every
     * paragraph under a level-2 heading either the uploaded picture URL or the
     * paragraph text.
     *
     * <p>Only the paragraphs returned by {@code XWPFDocument.getParagraphs()}
     * are visited.
     *
     * @return {@code Boolean.TRUE} when the document was read and processed,
     *         {@code Boolean.FALSE} when an {@link IOException} occurred
     */
    public Boolean wordParse() {
        File file = new File("xxxxx/word.docx");
        // Both the stream and the document hold resources; close them together.
        try (InputStream fis = new FileInputStream(file);
             XWPFDocument document = new XWPFDocument(fis)) {
            List<XWPFParagraph> paragraphs = document.getParagraphs();
            // Level-1 headings mapped to the paragraphs that follow them.
            Map<String, List<XWPFParagraph>> documentStructure = parseHeadDocument(paragraphs, "1");
            for (Map.Entry<String, List<XWPFParagraph>> h1Entry : documentStructure.entrySet()) {
                // Level-1 heading text.
                System.err.println(h1Entry.getKey());
                // Level-2 headings (and their content) inside this level-1 section.
                Map<String, List<XWPFParagraph>> h2Map = parseHeadDocument(h1Entry.getValue(), "2");
                for (Map.Entry<String, List<XWPFParagraph>> h2Entry : h2Map.entrySet()) {
                    // Level-2 heading text.
                    System.err.println(h2Entry.getKey());
                    for (XWPFParagraph xwpfParagraph : h2Entry.getValue()) {
                        printParagraph(xwpfParagraph);
                    }
                }
            }
            return Boolean.TRUE;
        } catch (IOException e) {
            e.printStackTrace();
            return Boolean.FALSE;
        }
    }

    /**
     * Prints one body paragraph: the uploaded picture URL when the paragraph
     * contains a picture, otherwise its non-blank text.
     */
    private void printParagraph(XWPFParagraph xwpfParagraph) {
        String images = containsImages(xwpfParagraph);
        if (StringUtils.isNotBlank(images)) {
            // Preview URL of the uploaded picture.
            System.err.println(images);
        } else if (StringUtils.isNotBlank(xwpfParagraph.getText())) {
            // Plain body text under the level-2 heading.
            System.err.println(xwpfParagraph.getText());
        }
    }

    /**
     * Uploads the first usable embedded picture of the paragraph and returns
     * its preview URL.
     *
     * <p>Pictures without resolvable data or without a file name are skipped.
     * Only the first picture that can be uploaded is handled; any further
     * pictures in the same paragraph are ignored.
     *
     * @param xwpfParagraph paragraph to inspect
     * @return the preview URL reported by {@code FileService.upload}, or
     *         {@code null} when the paragraph has no usable picture
     */
    private String containsImages(XWPFParagraph xwpfParagraph) {
        for (XWPFRun run : xwpfParagraph.getRuns()) {
            for (XWPFPicture picture : run.getEmbeddedPictures()) {
                XWPFPictureData pictureData = picture.getPictureData();
                if (pictureData == null) {
                    // The picture relation could not be resolved; nothing to upload.
                    continue;
                }
                String fileName = pictureData.getFileName();
                if (!StringUtils.isNotBlank(fileName)) {
                    continue;
                }
                byte[] bytes = pictureData.getData();
                // The picture type may be unknown; upload without a content type then.
                String contentType = pictureData.getPictureTypeEnum() != null
                        ? pictureData.getPictureTypeEnum().getContentType()
                        : null;
                // Strip the extension only when there is one and a non-empty base
                // name remains; otherwise substring would throw or yield "".
                int dot = fileName.lastIndexOf(".");
                String name = dot > 0 ? fileName.substring(0, dot) : fileName;
                // NOTE(review): MockMultipartFile comes from Spring's test support;
                // consider a production MultipartFile implementation.
                MockMultipartFile file = new MockMultipartFile(name, fileName, contentType, bytes);
                FileUploadDTO dto = new FileUploadDTO();
                dto.setFile(file);
                dto.setType(FileEntityType.test);
                FileDTO upload = fileService.upload(dto);
                return upload.getPreviewUrl();
            }
        }
        return null;
    }

    /**
     * Groups paragraphs by the heading that precedes them.
     *
     * <p>A paragraph whose style ID equals {@code heading} and whose text is
     * not blank starts a new section. Paragraphs before the first such heading
     * are not part of any section and are dropped. Heading-styled paragraphs
     * with blank text are ignored, so the content after them stays in the
     * current section. Sections that share the same heading text are merged in
     * encounter order instead of overwriting each other.
     *
     * @param paragraphs paragraphs to group, in document order
     * @param heading    style ID that marks a heading of the wanted level
     * @return insertion-ordered map of heading text to the paragraphs below it
     */
    private Map<String, List<XWPFParagraph>> parseHeadDocument(List<XWPFParagraph> paragraphs, String heading) {
        Map<String, List<XWPFParagraph>> map = new LinkedHashMap<>();
        String title = null;
        List<XWPFParagraph> list = new ArrayList<>();
        for (XWPFParagraph paragraph : paragraphs) {
            String style = paragraph.getStyle();
            if (style != null && style.equals(heading)) {
                String text = paragraph.getText();
                if (!StringUtils.isNotBlank(text)) {
                    // Empty heading line: neither a section start nor content.
                    continue;
                }
                if (title != null) {
                    map.computeIfAbsent(title, key -> new ArrayList<>()).addAll(list);
                }
                title = text;
                // Always start a fresh list so earlier paragraphs do not leak in.
                list = new ArrayList<>();
            } else if (title != null) {
                list.add(paragraph);
            }
        }
        if (title != null) {
            map.computeIfAbsent(title, key -> new ArrayList<>()).addAll(list);
        }
        return map;
    }
}
至此结束!