需求:项目中需要获取Word文档的最终文本,并根据文本内容进行接下来的业务处理;
困难:Word文档中存在修订模式或者审阅模式,直接使用网上能找到的资料中的方法,都会把修订(审阅)的内容一起加载到文本里;
/**
* @param filePath 文件地址
* @return 文本全部内容
*/
public static String getAllText(String filePath) {
String text = “”;
boolean xwpfFail = false;
try {
text = getAllTextByXWPFDocument(filePath);
} catch (Exception e) {
logger.error(“使用XWPFDocument读取word文本出错:”, e);
xwpfFail = true;
}
if (xwpfFail) {
logger.info(“使用HWPFDocument读取word文本开始。。。。”);
try {
text = getAllTextByHWPFDocument(filePath);
} catch (Exception ex) {
throw new RuntimeException(“读取word文本失败”);
}
}
return text;
}
/**
 * Reads a docx file with {@code XWPFDocument} and concatenates the text of
 * the runs found in the document's body paragraphs.
 *
 * <p>Only {@code run.getText(0)} is read for each run, i.e. the text element
 * at index 0. NOTE(review): this is presumably what keeps deleted revision
 * text out of the result, since POI exposes deleted text separately from the
 * regular text elements; confirm against the POI version in use.
 *
 * <p>No separator is inserted between runs or paragraphs, and only the
 * paragraphs returned by {@code getParagraphs()} are visited.
 *
 * @param filePath path of the docx file
 * @return the concatenated run text, or {@code null} when the file is zero-length
 * @throws RuntimeException if the file cannot be opened or parsed; the
 *         original exception is attached as the cause
 */
private static String getAllTextByXWPFDocument(String filePath) {
    FileInputStream inputStream = null;
    XWPFDocument xwpfDocument = null;
    try {
        File file = new File(filePath);
        // Open first so that a missing file is reported as a failure rather than as "empty".
        inputStream = new FileInputStream(file);
        if (file.length() == 0) {
            return null;
        }

        // Hand the stream to POI directly; a single read() into a buffer sized by
        // available() is not guaranteed to fill that buffer.
        xwpfDocument = new XWPFDocument(inputStream);

        StringBuilder sbText = new StringBuilder();
        for (XWPFParagraph paragraph : xwpfDocument.getParagraphs()) {
            for (XWPFRun run : paragraph.getRuns()) {
                String text = run.getText(0);
                // The former `text != "null"` test compared references and was effectively
                // always true; a value comparison would drop genuine document text, so only
                // the empty check is kept.
                if (!StringUtil.isEmpty(text)) {
                    sbText.append(text);
                }
            }
        }
        logger.info("poi使用XWPFDocument解析word文本结束。。。。");
        return sbText.toString();
    } catch (Exception e) {
        logger.error("poi使用XWPFDocument解析word文本失败,错误信息:", e);
        throw new RuntimeException("XWPFDocument解析word文本失败", e);
    } finally {
        // Closing is best-effort: a failure here must not mask the result or the
        // original error, but it should not disappear silently either.
        if (null != xwpfDocument) {
            try {
                xwpfDocument.close();
            } catch (IOException e) {
                logger.warn("Failed to close XWPFDocument", e);
            }
        }
        if (null != inputStream) {
            try {
                inputStream.close();
            } catch (IOException e) {
                logger.warn("Failed to close input stream of word file", e);
            }
        }
    }
}
/**
 * Reads a doc file with {@code HWPFDocument} and concatenates the text of
 * every character run that is not marked as deleted.
 *
 * <p>Sections, paragraphs and character runs are walked in order; runs for
 * which {@code isMarkedDeleted()} is true are skipped. The text of each
 * remaining run is appended as returned by {@code run.text()}.
 *
 * @param filePath path of the doc file
 * @return the concatenated text of the non-deleted runs
 * @throws RuntimeException if the file cannot be opened or parsed; the
 *         original exception is attached as the cause
 */
private static String getAllTextByHWPFDocument(String filePath) {
    FileInputStream in = null;
    HWPFDocument doc = null;
    try {
        in = new FileInputStream(new File(filePath));
        doc = new HWPFDocument(in);
        Range range = doc.getRange();

        StringBuilder sbText = new StringBuilder();
        for (int x = 0; x < range.numSections(); x++) {
            Section section = range.getSection(x);
            for (int y = 0; y < section.numParagraphs(); y++) {
                Paragraph paragraph = section.getParagraph(y);
                for (int z = 0; z < paragraph.numCharacterRuns(); z++) {
                    CharacterRun run = paragraph.getCharacterRun(z);
                    // Skip text that a tracked revision has marked as deleted.
                    if (!run.isMarkedDeleted()) {
                        sbText.append(run.text());
                    }
                }
            }
        }
        logger.info("使用HWPFDocument解析word文本结束。。。。");
        return sbText.toString();
    } catch (Exception e) {
        // Log and chain the cause, matching the XWPF reader's error handling.
        logger.error("poi使用HWPFDocument解析word文本失败,错误信息:", e);
        throw new RuntimeException("读取word文本失败", e);
    } finally {
        // Closing is best-effort: report failures without masking the result
        // or the original error.
        if (null != doc) {
            try {
                doc.close();
            } catch (IOException ex) {
                logger.warn("Failed to close HWPFDocument", ex);
            }
        }
        if (null != in) {
            try {
                in.close();
            } catch (IOException ex) {
                logger.warn("Failed to close input stream of word file", ex);
            }
        }
    }
}