分享一个利用poi读取doc和docx中某文字下最接近的表格数据的方法
- 引入如下包
由于我这边还用到了easyexcel的其他功能,所以直接引用了easyexcel;它会传递引入poi-ooxml(其中包含操作docx的XWPF相关类),因此不需要再单独引入操作docx的包
<!-- 阿里云easyexcel -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>easyexcel</artifactId>
<version>3.1.3</version>
</dependency>
<!-- 操作doc -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>4.1.2</version>
</dependency>
- filePath为文件路径,findStr为需要定位的文字
/**
 * Demo entry point: prints the first table that follows the paragraph
 * containing the keyword in a sample Word file.
 *
 * <p>Alternative sample that can be swapped in: file
 * "C:\Users\zhouchuxiang\Desktop\新建 DOCX 文档.docx" with keyword "11".
 *
 * @param args command-line arguments (unused)
 */
public static void main(String[] args) {
    final String sampleFile = "C:\\Users\\zhouchuxiang\\Desktop\\新建 DOCX 文档 (2).docx";
    final String keyword = "性别";
    testTable(sampleFile, keyword);
}
/**
 * Opens the Word file at {@code filePath} and prints the first table that
 * follows the paragraph containing {@code findStr}.
 *
 * <p>The parser is chosen from the file extension: {@code .docx} is handled by
 * {@code handleDocx}, {@code .doc} by {@code handleDoc}. The extension check is
 * case-insensitive. Any other extension (or a {@code null} path) is reported on
 * standard output and nothing is opened.
 *
 * <p>Exceptions raised while opening or parsing the file are printed to
 * standard error and not rethrown, as before.
 *
 * @param filePath path of the Word file to read
 * @param findStr  text to locate; the table after it is printed
 */
private static void testTable(String filePath, String findStr) {
    // Normalise the path once so ".DOCX" / ".Doc" are accepted as well.
    String lowerPath = filePath == null ? "" : filePath.toLowerCase(java.util.Locale.ROOT);
    boolean isDocx = lowerPath.endsWith(".docx");
    boolean isDoc = !isDocx && lowerPath.endsWith(".doc");
    if (!isDocx && !isDoc) {
        // Previously this case was silently ignored (after needlessly opening the file).
        System.out.println("不支持的文件类型,仅支持.doc和.docx文件: " + filePath);
        return;
    }
    // try-with-resources closes the stream even when parsing throws.
    try (InputStream is = new FileInputStream(filePath)) {
        if (isDocx) {
            handleDocx(is, findStr);
        } else {
            handleDoc(is, findStr);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
- 以下是操作doc的方法
/**
 * Reads a binary Word 97-2003 ({@code .doc}) document and prints the first table
 * that appears after the first non-table paragraph containing {@code findStr}.
 *
 * <p>Each table row is printed on its own line with cells separated by tabs.
 * If no paragraph outside a table contains {@code findStr}, a message saying so
 * is printed instead (the same message {@code handleDocx} prints).
 *
 * @param inputStream stream positioned at the start of the .doc content; it is
 *                    not closed by this method
 * @param findStr     text to locate in a paragraph that is not inside a table
 * @throws Exception if the stream cannot be read or parsed as a .doc document
 */
static void handleDoc(InputStream inputStream, String findStr) throws Exception {
    // Word 2003 format: pictures are not read.
    // try-with-resources releases the parsed document when done.
    try (HWPFDocument hwpf = new HWPFDocument(inputStream)) {
        // Overall readable range of the document.
        Range range = hwpf.getRange();
        // Becomes true once the search text has been seen outside a table.
        boolean flagToFindTable = false;
        int paraNum = range.numParagraphs();
        for (int temp = 0; temp < paraNum; temp++) {
            Paragraph paragraph = range.getParagraph(temp);
            boolean inTable = paragraph.isInTable();
            if (!flagToFindTable) {
                // Only a paragraph outside any table can act as the anchor.
                if (!inTable && paragraph.text().contains(findStr)) {
                    flagToFindTable = true;
                }
                // The anchor paragraph itself is not in a table, so move on either way.
                continue;
            }
            if (!inTable) {
                continue;
            }
            // First in-table paragraph after the anchor: it starts the table we want.
            Table tb = range.getTable(paragraph);
            for (int i = 0; i < tb.numRows(); i++) {
                TableRow tr = tb.getRow(i);
                for (int j = 0; j < tr.numCells(); j++) {
                    TableCell td = tr.getCell(j);
                    // A cell may hold several paragraphs; join their trimmed text.
                    StringBuilder cellText = new StringBuilder();
                    for (int k = 0; k < td.numParagraphs(); k++) {
                        cellText.append(td.getParagraph(k).text().trim());
                    }
                    System.out.print(cellText + "\t");
                }
                System.out.println();
            }
            // Only the nearest table is wanted.
            break;
        }
        if (!flagToFindTable) {
            System.out.println(String.format("该word文档无[%s]文字,请重新输入查找表格", findStr));
        }
    }
}
- 以下是操作docx的方法
/**
 * Reads an OOXML Word ({@code .docx}) document and prints the first top-level
 * table that appears after the first body paragraph containing {@code findStr}.
 *
 * <p>Each table row is printed on its own line with cells separated by tabs.
 * If no body paragraph contains {@code findStr}, a message saying so is printed.
 *
 * <p>The search text is matched against the whole paragraph text, so it is
 * found even when Word has split it across several runs.
 *
 * @param inputStream stream positioned at the start of the .docx content; it is
 *                    not closed by this method
 * @param findStr     text to locate in a body paragraph
 * @throws Exception if the stream cannot be read or parsed as a .docx document
 */
static void handleDocx(InputStream inputStream, String findStr) throws Exception {
    // try-with-resources releases the underlying package when done.
    try (XWPFDocument document = new XWPFDocument(inputStream)) {
        List<IBodyElement> bodyElements = document.getBodyElements();
        // Becomes true once a paragraph containing the search text has been seen.
        boolean flagToFindTable = false;
        for (IBodyElement bodyElement : bodyElements) {
            BodyElementType elementType = bodyElement.getElementType();
            // Walk body elements in order: first locate the anchor paragraph.
            if (elementType == BodyElementType.PARAGRAPH && !flagToFindTable) {
                XWPFParagraph para = (XWPFParagraph) bodyElement;
                // Use the paragraph's combined text instead of checking each run,
                // because one visible phrase is often stored as several runs.
                String paraText = para.getText();
                // Paragraphs without text (e.g. picture-only) never match.
                if (paraText != null && !paraText.isEmpty() && paraText.contains(findStr)) {
                    flagToFindTable = true;
                }
                continue;
            }
            if (elementType == BodyElementType.TABLE && flagToFindTable) {
                XWPFTable table = (XWPFTable) bodyElement;
                for (XWPFTableRow row : table.getRows()) {
                    for (XWPFTableCell cell : row.getTableCells()) {
                        System.out.print(cell.getText() + "\t");
                    }
                    System.out.println();
                }
                // Only the nearest table is wanted.
                break;
            }
        }
        if (!flagToFindTable) {
            System.out.println(String.format("该word文档无[%s]文字,请重新输入查找表格", findStr));
        }
    }
}
注意:handleDoc使用的HWPFDocument只能解析真正的二进制DOC(OLE2格式)文件。如果只是把DOCX文件的扩展名改成.doc,文件内容仍然是OOXML(zip)格式,HWPFDocument无法解析,代码会抛出异常