package com.jeesite.modules.common.utils;
import com.jeesite.modules.common.io.FileUtils;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Picture;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFPictureData;
import org.springframework.web.multipart.MultipartFile;
import org.apache.poi.openxml4j.opc.OPCPackage;
import java.io.*;
import java.util.List;
public class ReadWordUtils {
// public static String tmpPath="E://tmp_files/tmp.pdf";
public static StringBuffer readWord(String fileType , MultipartFile file){
StringBuffer content = new StringBuffer("");
InputStream is = null;
if (".doc".equals(fileType)) {
try {
is =file.getInputStream();
WordExtractor extractor = new WordExtractor(is);
String[] paragraphText = extractor.getParagraphText(); // 获取段落,段落缩进无法获取,可以在前添加空格填充
if (paragraphText != null && paragraphText.length > 0) {
for (String paragraph : paragraphText) {
if (!paragraph.startsWith(" ")) {
content.append(" ").append(paragraph.trim()).append("\r\n");
} else {
content.append(paragraph);
}
}
}
} catch (Exception e) {
System.out.println("-----------解析异常-------------
解析Word和PDF文件内容
最新推荐文章于 2024-08-21 17:58:16 发布
本文将探讨如何利用Java编程语言来解析Word和PDF文档的内容,包括读取、提取文本信息等关键步骤,帮助开发者更好地理解和操作这两种常用文档格式。
摘要由CSDN通过智能技术生成