1、需要的pom文件依赖
<!-- All Apache POI artifacts must use the same version; mixing releases
     (e.g. poi 4.1.2 with poi-ooxml 4.1.0) can fail at runtime. -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>4.1.2</version>
</dependency>
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>4.1.2</version>
</dependency>
<!-- Required by POI for Word file processing -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>4.1.2</version>
</dependency>
2、Word 文件分为 2003 版本(.doc)和 2007 版本(.docx),两种格式对应的读取方式不一样。
/**
 * Reads the body text of a Word 2007+ (.docx) file.
 *
 * <p>Every paragraph is written on its own line, terminated by "\r\n". A paragraph that does
 * not already begin with a space is trimmed and prefixed with a single space; a paragraph
 * that begins with a space is kept verbatim.
 *
 * @param path file-system path of the .docx file
 * @return a map with two entries: "result" is "0" on success or "1" if an exception
 *         occurred, and "content" is the text collected so far (possibly partial or empty
 *         when "result" is "1")
 */
public Map<String, String> getContentDocx(String path) {
    Map<String, String> map = new HashMap<>();
    StringBuilder content = new StringBuilder();
    String result = "0"; // "0" = extracted normally, "1" = extraction failed
    // try-with-resources closes the document as well as the stream; the document
    // was previously never closed.
    try (InputStream is = new FileInputStream(new File(path));
         XWPFDocument xwpf = new XWPFDocument(is)) { // XWPF handles .docx only
        List<XWPFParagraph> paragraphs = xwpf.getParagraphs();
        if (paragraphs != null) {
            for (XWPFParagraph paragraph : paragraphs) {
                String text = paragraph.getParagraphText();
                if (text.startsWith(" ")) {
                    // Keep the existing indent, but still terminate the line so this
                    // paragraph does not run into the next one.
                    content.append(text).append("\r\n");
                } else {
                    content.append(" ").append(text.trim()).append("\r\n");
                }
            }
        }
    } catch (Exception e) {
        // Pass the exception as the throwable argument so the stack trace is logged.
        logger.error("docx解析正文异常:", e);
        result = "1";
    }
    map.put("result", result);
    map.put("content", content.toString());
    return map;
}
/**
 * Reads the body text of a Word 97-2003 (.doc) file.
 *
 * <p>A paragraph that does not begin with a space is trimmed, prefixed with a single space
 * and terminated with "\r\n"; a paragraph that begins with a space is appended exactly as
 * the extractor returned it.
 *
 * @param path file-system path of the .doc file
 * @return a map with two entries: "result" is "0" on success or "1" if an exception
 *         occurred, and "content" is the text collected so far (possibly partial or empty
 *         when "result" is "1")
 */
public Map<String, String> getContentDoc(String path) {
    Map<String, String> map = new HashMap<>();
    StringBuilder content = new StringBuilder();
    String result = "0"; // "0" = extracted normally, "1" = extraction failed
    // try-with-resources closes the extractor as well as the stream; the extractor
    // was previously never closed.
    try (InputStream is = new FileInputStream(new File(path));
         WordExtractor extractor = new WordExtractor(is)) { // HWPF handles .doc only, not .docx
        // Paragraph indentation is not exposed here, so a leading space is added instead.
        String[] paragraphText = extractor.getParagraphText();
        if (paragraphText != null) {
            for (String paragraph : paragraphText) {
                if (paragraph.startsWith(" ")) {
                    content.append(paragraph);
                } else {
                    content.append(" ").append(paragraph.trim()).append("\r\n");
                }
            }
        }
    } catch (Exception e) {
        // Pass the exception as the throwable argument so the stack trace is logged.
        logger.error("doc解析正文异常:", e);
        result = "1";
    }
    map.put("result", result);
    map.put("content", content.toString());
    return map;
}
/**
 * Reads the body text of a WPS document by opening it as an {@code HWPFDocument}.
 *
 * <p>A paragraph that does not begin with a space is trimmed, prefixed with a single space
 * and terminated with "\r\n"; a paragraph that begins with a space is appended exactly as
 * the extractor returned it.
 *
 * @param path file-system path of the WPS file
 * @return a map with two entries: "result" is "0" on success or "1" if an exception
 *         occurred, and "content" is the text collected so far (possibly partial or empty
 *         when "result" is "1")
 */
public Map<String, String> getContentWps(String path) {
    Map<String, String> map = new HashMap<>();
    StringBuilder content = new StringBuilder();
    String result = "0"; // "0" = extracted normally, "1" = extraction failed
    // try-with-resources closes the document and extractor as well as the stream;
    // neither POI object was previously closed.
    try (InputStream is = new FileInputStream(new File(path));
         HWPFDocument hwpf = new HWPFDocument(is);
         WordExtractor wordExtractor = new WordExtractor(hwpf)) {
        // Text content of the document, one array element per paragraph.
        String[] paragraphText1 = wordExtractor.getParagraphText();
        if (paragraphText1 != null) {
            for (String paragraph : paragraphText1) {
                if (paragraph.startsWith(" ")) {
                    content.append(paragraph);
                } else {
                    content.append(" ").append(paragraph.trim()).append("\r\n");
                }
            }
        }
    } catch (Exception e) {
        // Pass the exception as the throwable argument so the stack trace is logged.
        logger.error("wps解析正文异常:", e);
        result = "1";
    }
    map.put("result", result);
    map.put("content", content.toString());
    return map;
}
3、上面的代码示例获取到的仅是 Word 文件中的文字内容,不包含图片、表格等数据。下方的示例代码演示了如何通过 XWPFDocument 遍历段落;基于同一个 XWPFDocument 对象,还可以进一步获取页眉、页脚、图片、表格等内容并进行操作,具体可以查看源码或自行搜索。
// Open the .docx document and print the text of every paragraph, one per line.
XWPFDocument doc = new XWPFDocument(is);
List<XWPFParagraph> paras = doc.getParagraphs();
if (!CollectionUtils.isEmpty(paras)) {
    // Properties of a paragraph can be read via para.getCTP().getPPr() (type CTPPr).
    paras.stream()
            .map(XWPFParagraph::getText)
            .forEach(System.out::println);
}
学海无涯苦作舟!!!