1、pom.xml
<!-- Aspose.Words: loads the uploaded Word document and exports it as HTML. -->
<dependency>
    <groupId>com.aspose</groupId>
    <artifactId>aspose-words</artifactId>
    <version>15.8.0</version>
</dependency>

<!-- jsoup: parses the exported HTML so headings and paragraphs can be walked. -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.14.3</version>
</dependency>
2、代码
package com.nancal.plant.controller; import com.aspose.words.Document; import com.aspose.words.SaveFormat; import com.nancal.plant.dto.WordDTO; import io.swagger.annotations.Api; import io.swagger.annotations.ApiOperation; import lombok.extern.slf4j.Slf4j; import org.apache.commons.collections4.CollectionUtils; import org.apache.commons.io.IOUtils; import org.apache.commons.lang3.StringUtils; import org.jsoup.Jsoup; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import org.springframework.web.bind.annotation.*; import org.springframework.web.multipart.MultipartFile; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.util.*; @RestController @RequestMapping("/dme/api/v1") @Api(tags = "文件的控制器") @Slf4j public class FileController { /** * word文档解析接口 * * @param multipartFile * @return * @throws Exception */ @ApiOperation(value = "word文档解析接口") @PostMapping(value = "/word/upload") public List<WordDTO> commonDmeRequest(@RequestParam("file") MultipartFile multipartFile) throws Exception { InputStream inputStream = multipartFile.getInputStream(); Document document = new Document(inputStream); String tmp = System.getProperty("java.io.tmpdir"); String filename = multipartFile.getOriginalFilename(); if (StringUtils.isNotBlank(filename)) { filename = filename.substring(0, filename.lastIndexOf(".")); } String htmlFilePath = tmp + filename +".html"; document.save(htmlFilePath, SaveFormat.HTML); String htmlStr = getHtmlStrFromFile(htmlFilePath); org.jsoup.nodes.Document htmlDoc = Jsoup.parse(htmlStr); // 解析word List<WordDTO> wordDTOList = parseWord(htmlDoc); return wordDTOList; } /** * 解析word * * @param htmlDoc * @return * @throws Exception */ private List<WordDTO> parseWord(org.jsoup.nodes.Document htmlDoc) throws Exception { Elements elementList = htmlDoc.getElementsByTag("h1"); if (CollectionUtils.isEmpty(elementList)) { throw new Exception("不存在一级标题"); } List<WordDTO> wordDTOList = new 
ArrayList<>(); for (Element element : elementList) { WordDTO wordDTO = new WordDTO(); parseElement(element, wordDTO); wordDTOList.add(wordDTO); } return wordDTOList; } /** * 递归解析单个h1整个结构 * * @param element * @param wordDTO */ private void parseElement(Element element, WordDTO wordDTO) { wordDTO.setTitle(element.toString()); Element nextElement = element.nextElementSibling(); String tagName = nextElement.tagName(); if (!Objects.equals("p", tagName)) { WordDTO word = new WordDTO(); wordDTO.setChild(word); parseElement(nextElement, word); } else { wordDTO.setContent(nextElement.toString()); } } private static String getHtmlStrFromFile(String filePath) throws IOException { FileInputStream in = null; File file = new File(filePath); Long filelength = file.length(); byte[] filecontent = new byte[filelength.intValue()]; try { in = new FileInputStream(file); in.read(filecontent); } catch (IOException e) { e.printStackTrace(); } finally { IOUtils.closeQuietly(in); } return new String(filecontent, "UTF-8"); } }
3、WordDTO
package com.nancal.plant.dto; import lombok.AllArgsConstructor; import lombok.Data; import lombok.NoArgsConstructor; /** * wordDTO * * @since 2023-07-19 * @author zhouwb * */ @NoArgsConstructor @AllArgsConstructor @Data public class WordDTO { // 标题 private String title; // 子节点 private WordDTO child; // 内容 private String content; }