import com.cmft.fhris.app.basic.exception.BusinessException;
import lombok.extern.slf4j.Slf4j;
import org.apache.commons.lang3.StringUtils;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.PicturesTable;
import org.apache.poi.hwpf.usermodel.*;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.xwpf.usermodel.*;
import java.io.*;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import static com.cmft.fhris.app.business.licence.constant.LicneceErrorMsgConstant.LICENCE_IO_ERROR_LOG_MSG;
import static com.cmft.fhris.app.business.licence.constant.LicneceErrorMsgConstant.READ_WORD_ERROR_MSG;
/**
 * Utility for reading the first table of a Word document (.doc or .docx)
 * into a list of string arrays, one array per table row.
 */
@Slf4j
public class ReadWordUtil {

    /** Matches runs of whitespace (spaces, tabs, CR, LF); compiled once instead of per cell. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** Value of {@code wordType} for which one cell is read as a picture instead of text. */
    private static final int WORD_TYPE_WITH_IMAGE = 2;

    /** Row index of the cell that is read as a picture when {@code wordType} is 2. */
    private static final int IMAGE_ROW_INDEX = 0;

    /** Column index of the cell that is read as a picture when {@code wordType} is 2. */
    private static final int IMAGE_CELL_INDEX = 6;

    /**
     * Reads the first table of a Word document from the given stream.
     *
     * <p>The format is chosen from {@code filePath}: a name ending in "docx"
     * (case-insensitive) is parsed as OOXML, anything else as the binary .doc
     * format. Only the first table is read; later tables are ignored. Cell text
     * is the concatenation of the cell's trimmed paragraphs with all whitespace
     * removed. When {@code wordType} is 2, the cell at row 0 / column 6 is
     * written to a temporary image file and its absolute path is stored instead
     * of text.
     *
     * <p>The stream is always closed before this method returns.
     *
     * @param in       stream positioned at the start of the document; closed by this method
     * @param filePath name or path of the document, used only to detect the format
     * @param wordType document variant; 2 enables picture extraction, any other value
     *                 (including {@code null}) reads every cell as text
     * @return one {@code String[]} per table row, empty when the document has no table
     * @throws BusinessException if the document cannot be read or parsed
     */
    public static List<String[]> getWordTableInputStream(FileInputStream in, String filePath, Integer wordType) {
        // equals() instead of "2 == wordType" so a null wordType does not fail on unboxing.
        boolean readImageCell = Integer.valueOf(WORD_TYPE_WITH_IMAGE).equals(wordType);
        try {
            // docx: Office 2007 and later; everything else is treated as Office 2003 .doc
            if (filePath.toLowerCase().endsWith("docx")) {
                return readFirstDocxTable(in, readImageCell);
            }
            return readFirstDocTable(in, readImageCell);
        } catch (Exception e) {
            log.error(LICENCE_IO_ERROR_LOG_MSG, e.getMessage(), e);
            throw new BusinessException(READ_WORD_ERROR_MSG);
        } finally {
            closeQuietly(in);
        }
    }

    /** Reads the first table of a .docx document; returns an empty list when there is none. */
    private static List<String[]> readFirstDocxTable(InputStream in, boolean readImageCell) throws IOException {
        List<String[]> tabRowList = new ArrayList<>();
        XWPFDocument document = new XWPFDocument(in);
        Iterator<XWPFTable> tables = document.getTablesIterator();
        // Take exactly one table. Looping on hasNext() without calling next()
        // never advances the iterator when the document has several tables.
        if (!tables.hasNext()) {
            return tabRowList;
        }
        List<XWPFTableRow> rows = tables.next().getRows();
        for (int rowIndex = 0; rowIndex < rows.size(); rowIndex++) {
            List<XWPFTableCell> cells = rows.get(rowIndex).getTableCells();
            String[] cellValues = new String[cells.size()];
            for (int cellIndex = 0; cellIndex < cellValues.length; cellIndex++) {
                XWPFTableCell cell = cells.get(cellIndex);
                if (readImageCell && isImageCell(rowIndex, cellIndex)) {
                    cellValues[cellIndex] = getCellImageByDocs(cell);
                } else {
                    String content = "";
                    for (XWPFParagraph paragraph : cell.getParagraphs()) {
                        content = appendParagraph(content, paragraph.getText());
                    }
                    cellValues[cellIndex] = stripWhitespace(content);
                }
            }
            tabRowList.add(cellValues);
        }
        return tabRowList;
    }

    /** Reads the first table of a binary .doc document; returns an empty list when there is none. */
    private static List<String[]> readFirstDocTable(InputStream in, boolean readImageCell) throws IOException {
        List<String[]> tabRowList = new ArrayList<>();
        HWPFDocument document = new HWPFDocument(new POIFSFileSystem(in));
        TableIterator tables = new TableIterator(document.getRange());
        // Same single-table rule as the docx branch.
        if (!tables.hasNext()) {
            return tabRowList;
        }
        Table table = tables.next();
        for (int rowIndex = 0; rowIndex < table.numRows(); rowIndex++) {
            TableRow row = table.getRow(rowIndex);
            int numCells = row.numCells();
            String[] cellValues = new String[numCells];
            for (int cellIndex = 0; cellIndex < numCells; cellIndex++) {
                TableCell cell = row.getCell(cellIndex);
                if (readImageCell && isImageCell(rowIndex, cellIndex)) {
                    cellValues[cellIndex] = getCellImageByDoc(cell, document.getPicturesTable());
                } else {
                    String content = "";
                    for (int k = 0; k < cell.numParagraphs(); k++) {
                        content = appendParagraph(content, cell.getParagraph(k).text());
                    }
                    cellValues[cellIndex] = stripWhitespace(content);
                }
            }
            tabRowList.add(cellValues);
        }
        return tabRowList;
    }

    /** Tells whether the given position is the cell that is read as a picture. */
    private static boolean isImageCell(int rowIndex, int cellIndex) {
        return rowIndex == IMAGE_ROW_INDEX && cellIndex == IMAGE_CELL_INDEX;
    }

    /** Appends a trimmed paragraph to the text collected so far; blank collected text is replaced. */
    private static String appendParagraph(String collected, String paragraphText) {
        String trimmed = paragraphText.trim();
        return StringUtils.isBlank(collected) ? trimmed : collected + trimmed;
    }

    /** Removes spaces, tabs, carriage returns and line feeds from the text. */
    private static String stripWhitespace(String text) {
        Matcher matcher = WHITESPACE.matcher(text);
        return matcher.replaceAll("");
    }

    /**
     * Closes the stream, logging a failure instead of throwing: throwing from a
     * {@code finally} block would discard a successfully read result or replace
     * the original parse error.
     */
    private static void closeQuietly(InputStream in) {
        if (in == null) {
            return;
        }
        try {
            in.close();
        } catch (IOException e) {
            log.error(LICENCE_IO_ERROR_LOG_MSG, e.getMessage(), e);
        }
    }

    /**
     * Extracts the first picture found in a .doc table cell into a temporary file.
     *
     * <p>All character runs of the cell are inspected, so a picture that is not
     * in the very first run is still found.
     *
     * @param cell          table cell to search
     * @param picturesTable pictures table of the document the cell belongs to
     * @return absolute path of the written image file, or an empty string when
     *         the cell contains no picture
     * @throws BusinessException if the image file cannot be written
     */
    public static String getCellImageByDoc(TableCell cell, PicturesTable picturesTable) {
        for (int runIndex = 0; runIndex < cell.numCharacterRuns(); runIndex++) {
            CharacterRun run = cell.getCharacterRun(runIndex);
            if (picturesTable.hasPicture(run)) {
                Picture picture = picturesTable.extractPicture(run, true);
                return writeTempImage(picture.getContent());
            }
        }
        return "";
    }

    /**
     * Extracts the first embedded picture found in a .docx table cell into a temporary file.
     *
     * @param cell table cell to search
     * @return absolute path of the written image file, an empty string when the
     *         cell contains no picture, or {@code null} when the cell reports no
     *         paragraph list at all
     * @throws BusinessException if the image file cannot be written
     */
    public static String getCellImageByDocs(XWPFTableCell cell) {
        List<XWPFParagraph> paragraphs = cell.getParagraphs();
        if (paragraphs == null) {
            return null;
        }
        for (XWPFParagraph paragraph : paragraphs) {
            List<XWPFRun> runs = paragraph.getRuns();
            if (runs == null) {
                // A paragraph without runs must not stop the search in later paragraphs.
                continue;
            }
            for (XWPFRun run : runs) {
                List<XWPFPicture> pictures = run.getEmbeddedPictures();
                if (pictures == null || pictures.isEmpty()) {
                    continue;
                }
                return writeTempImage(pictures.get(0).getPictureData().getData());
            }
        }
        return "";
    }

    /** Writes image bytes to a uniquely named ".jpg" file in the temp directory and returns its absolute path. */
    private static String writeTempImage(byte[] content) {
        String fileName = UUID.randomUUID().toString() + ".jpg";
        // File(parent, child) inserts the separator; java.io.tmpdir does not
        // necessarily end with one, so plain concatenation can yield a wrong path.
        File target = new File(System.getProperty("java.io.tmpdir"), fileName);
        return byte2File(content, target.getPath()).getAbsolutePath();
    }

    /**
     * Writes a byte array to the file at the given path.
     *
     * @param bfile    bytes to write
     * @param filePath destination path of the file
     * @return the written file
     * @throws BusinessException if the file cannot be created, written or closed
     */
    public static File byte2File(byte[] bfile, String filePath) {
        try {
            File file = new File(filePath);
            // try-with-resources: a failure while flushing/closing is reported
            // instead of being swallowed, so a truncated file is never returned as success.
            try (BufferedOutputStream bos = new BufferedOutputStream(new FileOutputStream(file))) {
                bos.write(bfile);
            }
            return file;
        } catch (Exception e) {
            log.error(LICENCE_IO_ERROR_LOG_MSG, e.getMessage(), e);
            throw new BusinessException(READ_WORD_ERROR_MSG);
        }
    }
}
// Utility class for reading and parsing tables in Word documents
// (page note: latest recommended article published 2024-07-19 17:37:28)