读取word文档表格解析工具类

最新推荐文章于 2024-07-19 17:37:28 发布
楠风_lx
最新推荐文章于 2024-07-19 17:37:28 发布
阅读量269
点赞数
分类专栏：工具 文章标签：java
本文链接：https://blog.csdn.net/abc_lx123/article/details/118487655
版权
工具专栏收录该内容
2 篇文章 0 订阅
订阅专栏
import com.cmft.fhris.app.basic.exception.BusinessException;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.usermodel.*;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xwpf.usermodel.*;

import java.io.*;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import static com.cmft.fhris.app.business.licence.constant.LicneceErrorMsgConstant.LICENCE_IO_ERROR_LOG_MSG;
import static com.cmft.fhris.app.business.licence.constant.LicneceErrorMsgConstant.READ_WORD_ERROR_MSG;

/**
 * Utility for reading the first table of a Word document ({@code .docx} via XWPF, anything
 * else via HWPF) into a list of string arrays, optionally extracting an embedded picture
 * from one fixed cell into a temporary file.
 */
@Slf4j
public class ReadWordUtil {

    /** Matches any run of whitespace (spaces, tabs, carriage returns, line feeds). */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** {@code wordType} value that switches on picture extraction for the picture cell. */
    private static final Integer PICTURE_WORD_TYPE = 2;

    /** Row index of the cell that is read as a picture when picture extraction is on. */
    private static final int PICTURE_ROW_INDEX = 0;

    /** Column index of the cell that is read as a picture when picture extraction is on. */
    private static final int PICTURE_CELL_INDEX = 6;

    /** Extension given to every extracted picture file. */
    private static final String PICTURE_FILE_SUFFIX = ".jpg";

    /**
     * Reads the first table of a Word document from the given stream and converts it to a
     * list of rows, each row being an array of cell texts with all whitespace removed.
     *
     * <p>Only the first table is read; further tables in the document are ignored. When
     * {@code wordType} is 2, the cell at row 0 / column 6 is treated as a picture cell: the
     * picture is written to a temporary file and the cell value is that file's absolute path
     * (or an empty string if the cell holds no picture).
     *
     * <p>The stream is always closed before this method returns.
     *
     * @param in       stream positioned at the start of the Word document
     * @param filePath name or path of the document; a name ending in "docx" (case-insensitive)
     *                 selects the XWPF parser, anything else the HWPF parser
     * @param wordType document flavour; 2 enables picture extraction, any other value
     *                 (including {@code null}) reads every cell as text
     * @return the rows of the first table, empty if the document contains no table
     * @throws BusinessException if the document cannot be read or the stream cannot be closed
     */
    public static List<String[]> getWordTableInputStream(FileInputStream in, String filePath, Integer wordType) {
        // Null-safe: a null wordType means "no picture extraction" instead of an unboxing NPE.
        boolean extractPicture = PICTURE_WORD_TYPE.equals(wordType);
        // try-with-resources closes the stream; a failure while closing is reported through
        // the same catch block instead of being thrown from a finally clause.
        try (FileInputStream stream = in) {
            if (filePath.toLowerCase().endsWith("docx")) {
                return readFirstDocxTable(stream, extractPicture);
            }
            return readFirstDocTable(stream, extractPicture);
        } catch (Exception e) {
            log.error(LICENCE_IO_ERROR_LOG_MSG, e.getMessage(), e);
            throw new BusinessException(READ_WORD_ERROR_MSG);
        }
    }

    /**
     * Reads the first table of a {@code .docx} (Office 2007+) document.
     */
    private static List<String[]> readFirstDocxTable(InputStream in, boolean extractPicture) throws IOException {
        List<String[]> tableRows = new ArrayList<>();
        try (XWPFDocument document = new XWPFDocument(in)) {
            Iterator<XWPFTable> tables = document.getTablesIterator();
            // Consume exactly one table; looping on hasNext() without calling next() for the
            // remaining tables would never terminate.
            if (!tables.hasNext()) {
                return tableRows;
            }
            List<XWPFTableRow> rows = tables.next().getRows();
            for (int rowIdx = 0; rowIdx < rows.size(); rowIdx++) {
                List<XWPFTableCell> cells = rows.get(rowIdx).getTableCells();
                String[] cellArr = new String[cells.size()];
                for (int colIdx = 0; colIdx < cellArr.length; colIdx++) {
                    XWPFTableCell cell = cells.get(colIdx);
                    if (extractPicture && isPictureCell(rowIdx, colIdx)) {
                        cellArr[colIdx] = getCellImageByDocs(cell);
                    } else {
                        String content = "";
                        for (XWPFParagraph paragraph : cell.getParagraphs()) {
                            content = appendTrimmed(content, paragraph.getText());
                        }
                        cellArr[colIdx] = removeWhitespace(content);
                    }
                }
                tableRows.add(cellArr);
            }
        }
        return tableRows;
    }

    /**
     * Reads the first table of a {@code .doc} (Office 2003) document.
     */
    private static List<String[]> readFirstDocTable(InputStream in, boolean extractPicture) throws IOException {
        List<String[]> tableRows = new ArrayList<>();
        try (HWPFDocument document = new HWPFDocument(new POIFSFileSystem(in))) {
            TableIterator tables = new TableIterator(document.getRange());
            // Consume exactly one table (see readFirstDocxTable for the reason).
            if (!tables.hasNext()) {
                return tableRows;
            }
            Table table = tables.next();
            for (int rowIdx = 0; rowIdx < table.numRows(); rowIdx++) {
                TableRow row = table.getRow(rowIdx);
                int numCells = row.numCells();
                String[] cellArr = new String[numCells];
                for (int colIdx = 0; colIdx < numCells; colIdx++) {
                    TableCell cell = row.getCell(colIdx);
                    if (extractPicture && isPictureCell(rowIdx, colIdx)) {
                        cellArr[colIdx] = getCellImageByDoc(cell, document.getPicturesTable());
                    } else {
                        String content = "";
                        for (int paraIdx = 0; paraIdx < cell.numParagraphs(); paraIdx++) {
                            content = appendTrimmed(content, cell.getParagraph(paraIdx).text());
                        }
                        cellArr[colIdx] = removeWhitespace(content);
                    }
                }
                tableRows.add(cellArr);
            }
        }
        return tableRows;
    }

    /** Tells whether the given table position is the fixed picture cell. */
    private static boolean isPictureCell(int rowIdx, int colIdx) {
        return rowIdx == PICTURE_ROW_INDEX && colIdx == PICTURE_CELL_INDEX;
    }

    /** Appends the trimmed paragraph text to the content collected so far for a cell. */
    private static String appendTrimmed(String content, String paragraphText) {
        String trimmed = paragraphText.trim();
        // A blank accumulator is replaced rather than extended, as in the original logic.
        return StringUtils.isBlank(content) ? trimmed : content + trimmed;
    }

    /** Removes every space, tab, carriage return and line feed from the text. */
    private static String removeWhitespace(String text) {
        return WHITESPACE.matcher(text).replaceAll("");
    }

    /** Writes picture bytes to a uniquely named file in the temp directory and returns its absolute path. */
    private static String writeTempPicture(byte[] content) {
        String fileName = UUID.randomUUID().toString() + PICTURE_FILE_SUFFIX;
        // File(parent, child) inserts the separator that plain string concatenation omits
        // when java.io.tmpdir has no trailing separator (e.g. "/tmp").
        File target = new File(System.getProperty("java.io.tmpdir"), fileName);
        return byte2File(content, target.getPath()).getAbsolutePath();
    }

    /**
     * Extracts the first picture found in a {@code .doc} table cell into a temporary file.
     *
     * @param cell          table cell to search; every character run of the cell is checked
     * @param picturesTable pictures table of the document the cell belongs to
     * @return absolute path of the written picture file, or an empty string if the cell
     *         contains no picture
     * @throws BusinessException if the picture file cannot be written
     */
    public static String getCellImageByDoc(TableCell cell, PicturesTable picturesTable) {
        for (int runIdx = 0; runIdx < cell.numCharacterRuns(); runIdx++) {
            CharacterRun run = cell.getCharacterRun(runIdx);
            if (!picturesTable.hasPicture(run)) {
                continue;
            }
            Picture picture = picturesTable.extractPicture(run, true);
            if (picture != null) {
                return writeTempPicture(picture.getContent());
            }
        }
        return "";
    }

    /**
     * Extracts the first embedded picture found in a {@code .docx} table cell into a
     * temporary file.
     *
     * @param cell table cell to search; every run of every paragraph is checked
     * @return absolute path of the written picture file, or an empty string if the cell
     *         contains no picture
     * @throws BusinessException if the picture file cannot be written
     */
    public static String getCellImageByDocs(XWPFTableCell cell) {
        List<XWPFParagraph> paragraphs = cell.getParagraphs();
        if (paragraphs == null) {
            return "";
        }
        for (XWPFParagraph paragraph : paragraphs) {
            List<XWPFRun> runs = paragraph.getRuns();
            if (runs == null) {
                continue;
            }
            for (XWPFRun run : runs) {
                List<XWPFPicture> pictures = run.getEmbeddedPictures();
                if (pictures == null) {
                    continue;
                }
                for (XWPFPicture picture : pictures) {
                    // Skip pictures without resolvable data rather than failing with an NPE.
                    if (picture.getPictureData() != null) {
                        return writeTempPicture(picture.getPictureData().getData());
                    }
                }
            }
        }
        return "";
    }

    /**
     * Writes a byte array to the file at the given path, replacing any existing content.
     *
     * @param bfile    bytes to write
     * @param filePath path of the file to create
     * @return the written file
     * @throws BusinessException if the file cannot be created, written or closed
     */
    public static File byte2File(byte[] bfile, String filePath) {
        try {
            File file = new File(filePath);
            // Closing flushes the buffer; a failure there means the file is incomplete, so it
            // is reported through the catch block instead of being swallowed.
            try (FileOutputStream fos = new FileOutputStream(file);
                 BufferedOutputStream bos = new BufferedOutputStream(fos)) {
                bos.write(bfile);
            }
            return file;
        } catch (Exception e) {
            log.error(LICENCE_IO_ERROR_LOG_MSG, e.getMessage(), e);
            throw new BusinessException(READ_WORD_ERROR_MSG);
        }
    }

}
楠风_lx
关注
0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
读取word文档表格解析工具类

import com.cmft.fhris.app.basic.exception.BusinessException;import lombok.extern.slf4j.Slf4j;import org.apache.commons.lang3.StringUtils;import org.apache.poi.hwpf.HWPFDocument;import org.apache.poi.hwpf.model.PicturesTable;import org.apache.poi.hwpf.
复制链接

扫一扫