JAVA获取word文档内容
记录:最近在做一些OCR识别方面的项目,在这里把读取Word文档内容的这一部分摘出来稍作记录,以免自己以后忘记,也希望能给有需要的朋友一点帮助。
新建一个名为test.docx的文档,在里面随便输入一些内容:
输出:
具体实现:
package test;
import org.apache.poi.POIXMLDocument;
import org.apache.poi.POIXMLTextExtractor;
import org.apache.poi.hwpf.extractor.WordExtractor;
import org.apache.poi.openxml4j.exceptions.OpenXML4JException;
import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.xwpf.extractor.XWPFWordExtractor;
import org.apache.xmlbeans.XmlException;
import java.io.*;
/**
 * Small command-line demo that extracts the plain text of a Word document
 * with Apache POI and prints it to standard error.
 *
 * <p>Both the legacy binary format ({@code .doc}) and the OOXML format
 * ({@code .docx}) are supported; the format is chosen from the file extension.
 */
public class testWordToText {

    private static final String DOC_EXTENSION = ".doc";
    private static final String DOCX_EXTENSION = ".docx";

    public static void main(String[] args) {
        // Path of the file to read.
        String path = "C:/Users/Administrator/Desktop/test.docx";
        wordToText(path);
    }

    /**
     * Reads the Word document at {@code path} and prints its text to {@code System.err}.
     *
     * <p>The extension is matched case-insensitively. A {@code null} path or a path with
     * any other extension prints an error message instead. Extraction failures are
     * reported via {@code printStackTrace()} and are not propagated to the caller.
     *
     * @param path file system path of a {@code .doc} or {@code .docx} document
     */
    public static void wordToText(String path) {
        // The two formats need different POI extractors, so dispatch on the extension.
        if (path != null && hasExtension(path, DOC_EXTENSION)) {
            try {
                // stderr is used so the output stands out in the console (author's habit).
                System.err.println(extractDocText(path));
            } catch (IOException ioException) {
                ioException.printStackTrace();
            }
        } else if (path != null && hasExtension(path, DOCX_EXTENSION)) {
            try {
                System.err.println(extractDocxText(path));
            } catch (IOException | XmlException | OpenXML4JException e) {
                e.printStackTrace();
            }
        } else {
            System.err.println("请选择一个正确的word文档路径");
        }
    }

    /** Returns whether {@code path} ends with {@code extension}, ignoring case. */
    private static boolean hasExtension(String path, String extension) {
        int length = extension.length();
        // regionMatches returns false (rather than throwing) when path is shorter than extension.
        return path.regionMatches(true, path.length() - length, extension, 0, length);
    }

    /** Extracts the text of a legacy {@code .doc} file, closing the stream and extractor. */
    private static String extractDocText(String path) throws IOException {
        try (InputStream is = new FileInputStream(new File(path));
             WordExtractor we = new WordExtractor(is)) {
            // StringUtils is not imported in this file; presumably a project-local helper
            // in the same package -- TODO confirm.
            return StringUtils.replaceSpecialStr(we.getText());
        }
    }

    /** Extracts the text of a {@code .docx} file, releasing the package on every path. */
    private static String extractDocxText(String path)
            throws IOException, XmlException, OpenXML4JException {
        OPCPackage opcPackage = POIXMLDocument.openPackage(path);
        POIXMLTextExtractor opened;
        try {
            opened = new XWPFWordExtractor(opcPackage);
        } catch (IOException | XmlException | OpenXML4JException | RuntimeException e) {
            // No extractor owns the package yet: release it without writing anything back.
            opcPackage.revert();
            throw e;
        }
        try (POIXMLTextExtractor extractor = opened) {
            return StringUtils.replaceSpecialStr(extractor.getText());
        }
    }
}