很多人使用 POI 读取 Word 文档时都会这么写:
1
2
3
|
// Naive approach: let POI's built-in extractor dump the document as text.
XWPFDocument document = new XWPFDocument(inputStream);
XWPFWordExtractor extractor = new XWPFWordExtractor(document);
String extractedText = extractor.getText();
System.out.println(extractedText);
|
但是这种写法其实有不少问题:文本框里面的内容读取不到,换行也有问题。于是我做了一些改进:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
|
/**
 * Reads the text of a Word 2007+ (.docx) file.
 *
 * <p>Content is appended in this order: text-box content found in the runs of
 * the body paragraphs, text-box content found in the runs of table-cell
 * paragraphs, and finally the text of every table-cell paragraph.
 *
 * @param filePath path of the file to read
 * @return the extracted content, or {@code null} when an {@link IOException}
 *         occurs (the error is logged)
 */
private static String read2007(String filePath) {
    InputStream inputStream = null;
    StringBuilder content = new StringBuilder();
    try {
        inputStream = new FileInputStream(new File(filePath));
        XWPFDocument document = new XWPFDocument(inputStream);

        // Text boxes outside of tables.
        // NOTE(review): only text-box content is collected from body paragraphs;
        // their own text (XWPFParagraph#getText) is not appended, unlike the
        // table-cell paragraphs below -- confirm this is intended.
        appendTextBoxContent(content, document.getParagraphs());

        // Text boxes inside table cells.
        for (XWPFTable table : document.getTables()) {
            for (XWPFTableRow row : table.getRows()) {
                for (XWPFTableCell cell : row.getTableCells()) {
                    appendTextBoxContent(content, cell.getParagraphs());
                }
            }
        }

        // Table content itself, one line per cell paragraph.
        for (XWPFTable table : document.getTables()) {
            for (XWPFTableRow row : table.getRows()) {
                for (XWPFTableCell cell : row.getTableCells()) {
                    for (XWPFParagraph paragraph : cell.getParagraphs()) {
                        content.append(paragraph.getText()).append(NEW_LINE);
                    }
                }
            }
        }
        return content.toString();
    } catch (IOException e) {
        logger.error("解析word错误,文件地址:" + filePath, e);
    } finally {
        IOUtils.closeQuietly(inputStream);
    }
    return null;
}

/**
 * Appends, for every run of every given paragraph, the text-box content
 * extracted by {@code getXMLContent} followed by {@code NEW_LINE}.
 *
 * @param content    buffer receiving the extracted text
 * @param paragraphs paragraphs whose runs are inspected
 */
private static void appendTextBoxContent(StringBuilder content, Iterable<XWPFParagraph> paragraphs) {
    for (XWPFParagraph paragraph : paragraphs) {
        for (XWPFRun run : paragraph.getRuns()) {
            String runXml;
            // The cursor is released explicitly once the XML has been read,
            // instead of being left for the garbage collector.
            org.apache.xmlbeans.XmlCursor cursor = run.getCTR().newCursor();
            try {
                runXml = cursor.xmlText();
            } finally {
                cursor.dispose();
            }
            content.append(getXMLContent(runXml)).append(NEW_LINE);
        }
    }
}
/**
 * Extracts text-box text from the XML of a run. (An alternative would be to
 * walk {@code cursor.getDomNode()} recursively.)
 *
 * <p>When the root element declares a namespace whose prefix equals
 * {@code NAMESPANCE_OF_TEXTBOX_IN_TABLE}, the {@code w:t} texts of every
 * {@code w:p} below {@code mc:Fallback} are concatenated, each paragraph
 * followed by {@code NEW_LINE}.
 *
 * @param xml XML text to inspect
 * @return the collected text; an empty string when the prefix is not declared
 *         or the XML cannot be parsed (the parse error is logged)
 */
private static String getXMLContent(String xml) {
    final StringBuilder text = new StringBuilder();
    try {
        final Document parsed = DocumentHelper.parseText(xml);

        // Look for the text-box namespace prefix among the root's declarations.
        boolean prefixDeclared = false;
        for (Object declared : parsed.getRootElement().declaredNamespaces()) {
            final Namespace candidate = (Namespace) declared;
            if (NAMESPANCE_OF_TEXTBOX_IN_TABLE.equals(candidate.getPrefix())) {
                prefixDeclared = true;
                break;
            }
        }

        if (prefixDeclared) {
            for (Object paragraph : parsed.selectNodes("//mc:Fallback//w:p")) {
                for (Object textNode : ((Node) paragraph).selectNodes(".//w:t")) {
                    final String piece = ((Node) textNode).getText();
                    if (StringUtils.isNotEmpty(piece)) {
                        text.append(piece);
                    }
                }
                // One output line per text-box paragraph.
                text.append(NEW_LINE);
            }
        }
    } catch (DocumentException e) {
        logger.error("XML转化错误,内容:" + xml, e);
    }
    return text.toString();
}
|
2003版本简单一些:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
|
/**
 * Reads the text of a Word 2003 (.doc) file.
 *
 * <p>Text-box paragraphs are appended first, followed by the paragraphs of
 * the document range; each appended paragraph is followed by {@code NEW_LINE}.
 *
 * @param filePath path of the file to read
 * @return the extracted content, or {@code null} when an {@link IOException}
 *         (including a missing file) occurs; the error is logged
 */
private static String read2003(String filePath) {
    InputStream inputStream = null;
    StringBuilder content = new StringBuilder();
    try {
        inputStream = new FileInputStream(new File(filePath));
        HWPFDocument document = new HWPFDocument(inputStream);

        // Text boxes. The range is fetched once instead of on every iteration.
        org.apache.poi.hwpf.usermodel.Range textBoxRange = document.getMainTextboxRange();
        int textBoxParagraphs = textBoxRange.numParagraphs();
        for (int i = 0; i < textBoxParagraphs; i++) {
            String text = textBoxRange.getParagraph(i).text();
            if (StringUtils.isNotEmpty(text)) {
                content.append(text).append(NEW_LINE);
            }
        }

        // Everything that is not a text box.
        org.apache.poi.hwpf.usermodel.Range bodyRange = document.getRange();
        int bodyParagraphs = bodyRange.numParagraphs();
        for (int i = 0; i < bodyParagraphs; i++) {
            // Per the original author: without trim() the output contains garbled characters.
            String trimmed = StringUtils.trim(bodyRange.getParagraph(i).text());
            if (StringUtils.isNotEmpty(trimmed)) {
                content.append(trimmed).append(NEW_LINE);
            }
        }
        return content.toString();
    } catch (IOException e) {
        // FileNotFoundException is an IOException, so a missing file is reported here as well.
        logger.error("解析word错误,文件地址:" + filePath, e);
    } finally {
        IOUtils.closeQuietly(inputStream);
    }
    return null;
}
|
注意:以上代码读取的内容包括表格里面的内容、文本框里面的内容,以及直接写在编辑区里面的文本;批注、引用等其他信息可能读取不到,如有需要请自行处理。
比较完整的代码:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
|
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.List;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.log4j.Logger;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;
import org.apache.xmlbeans.XmlCursor;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.DocumentHelper;
import org.dom4j.Namespace;
import org.dom4j.Node;
/**
 * WordReaderUtils - reads the text content of Word files.
 *
 * <p>Files ending in {@code doc} are read through HWPF, files ending in
 * {@code docx} through XWPF; see {@link #read(String)}.
 *
 * @author 500d Team
 * @version 1.0
 */
public class WordReaderUtils {

    /** Extension (lower case, without dot) routed to {@link #read2003(String)}. */
    private static final String WORD_2003 = "doc";

    /** Extension (lower case, without dot) routed to {@link #read2007(String)}. */
    private static final String WORD_2007 = "docx";

    private static final Logger logger = Logger.getLogger(WordReaderUtils.class);

    /** Line separator appended after every extracted paragraph. */
    public static final String NEW_LINE = "\r\n";

    // NOTE(review): NAMESPANCE_OF_TEXTBOX_IN_TABLE (used by getXMLContent) and the
    // Crossover class (used by read) are not declared in this listing; they have
    // to be provided elsewhere for the class to compile.

    /**
     * Reads the content of a Word file, choosing the reader by file extension.
     *
     * @param filePath path of the file to read
     * @return the result of {@code Crossover.handle} applied to the extracted
     *         content (the content is {@code null} for an unsupported extension
     *         or a failed read); {@code null} when the path is empty, is not an
     *         existing regular file, or has no extension
     */
    public static String read(String filePath) {
        if (StringUtils.isEmpty(filePath)) {
            return null;
        }
        // isFile() is false for a missing path, so it also covers the exists() check.
        if (!new File(filePath).isFile()) {
            return null;
        }
        String extension = FilenameUtils.getExtension(filePath);
        if (StringUtils.isEmpty(extension)) {
            return null;
        }
        String content = null;
        // equalsIgnoreCase keeps the comparison independent of the default locale.
        if (WORD_2003.equalsIgnoreCase(extension)) {
            content = read2003(filePath);
        } else if (WORD_2007.equalsIgnoreCase(extension)) {
            content = read2007(filePath);
        }
        return Crossover.handle(content);
    }

    /**
     * Reads the text of a Word 2003 (.doc) file.
     *
     * <p>Text-box paragraphs are appended first, followed by the paragraphs of
     * the document range; each appended paragraph is followed by {@link #NEW_LINE}.
     *
     * @param filePath path of the file to read
     * @return the extracted content, or {@code null} when an {@link IOException}
     *         (including a missing file) occurs; the error is logged
     */
    private static String read2003(String filePath) {
        InputStream inputStream = null;
        StringBuilder content = new StringBuilder();
        try {
            inputStream = new FileInputStream(new File(filePath));
            HWPFDocument document = new HWPFDocument(inputStream);

            // Text boxes. The range is fetched once instead of on every iteration.
            Range textBoxRange = document.getMainTextboxRange();
            int textBoxParagraphs = textBoxRange.numParagraphs();
            for (int i = 0; i < textBoxParagraphs; i++) {
                String text = textBoxRange.getParagraph(i).text();
                if (StringUtils.isNotEmpty(text)) {
                    content.append(text).append(NEW_LINE);
                }
            }

            // Everything that is not a text box.
            Range bodyRange = document.getRange();
            int bodyParagraphs = bodyRange.numParagraphs();
            for (int i = 0; i < bodyParagraphs; i++) {
                // Per the original author: without trim() the output contains garbled characters.
                String trimmed = StringUtils.trim(bodyRange.getParagraph(i).text());
                if (StringUtils.isNotEmpty(trimmed)) {
                    content.append(trimmed).append(NEW_LINE);
                }
            }
            return content.toString();
        } catch (IOException e) {
            // FileNotFoundException is an IOException, so a missing file is reported here as well.
            logger.error("解析word错误,文件地址:" + filePath, e);
        } finally {
            IOUtils.closeQuietly(inputStream);
        }
        return null;
    }

    /**
     * Reads the text of a Word 2007+ (.docx) file.
     *
     * <p>Content is appended in this order: text-box content found in the runs
     * of the body paragraphs, text-box content found in the runs of table-cell
     * paragraphs, and finally the text of every table-cell paragraph.
     *
     * @param filePath path of the file to read
     * @return the extracted content, or {@code null} when an {@link IOException}
     *         occurs (the error is logged)
     */
    private static String read2007(String filePath) {
        InputStream inputStream = null;
        StringBuilder content = new StringBuilder();
        try {
            inputStream = new FileInputStream(new File(filePath));
            XWPFDocument document = new XWPFDocument(inputStream);

            // Text boxes outside of tables.
            // NOTE(review): only text-box content is collected from body paragraphs;
            // their own text (XWPFParagraph#getText) is not appended, unlike the
            // table-cell paragraphs below -- confirm this is intended.
            appendTextBoxContent(content, document.getParagraphs());

            // Text boxes inside table cells.
            for (XWPFTable table : document.getTables()) {
                for (XWPFTableRow row : table.getRows()) {
                    for (XWPFTableCell cell : row.getTableCells()) {
                        appendTextBoxContent(content, cell.getParagraphs());
                    }
                }
            }

            // Table content itself, one line per cell paragraph.
            for (XWPFTable table : document.getTables()) {
                for (XWPFTableRow row : table.getRows()) {
                    for (XWPFTableCell cell : row.getTableCells()) {
                        for (XWPFParagraph paragraph : cell.getParagraphs()) {
                            content.append(paragraph.getText()).append(NEW_LINE);
                        }
                    }
                }
            }
            return content.toString();
        } catch (IOException e) {
            logger.error("解析word错误,文件地址:" + filePath, e);
        } finally {
            IOUtils.closeQuietly(inputStream);
        }
        return null;
    }

    /**
     * Appends, for every run of every given paragraph, the text-box content
     * extracted by {@link #getXMLContent(String)} followed by {@link #NEW_LINE}.
     *
     * @param content    buffer receiving the extracted text
     * @param paragraphs paragraphs whose runs are inspected
     */
    private static void appendTextBoxContent(StringBuilder content, List<XWPFParagraph> paragraphs) {
        for (XWPFParagraph paragraph : paragraphs) {
            for (XWPFRun run : paragraph.getRuns()) {
                String runXml;
                // The cursor is released explicitly once the XML has been read,
                // instead of being left for the garbage collector.
                XmlCursor cursor = run.getCTR().newCursor();
                try {
                    runXml = cursor.xmlText();
                } finally {
                    cursor.dispose();
                }
                content.append(getXMLContent(runXml)).append(NEW_LINE);
            }
        }
    }

    /**
     * Extracts text-box text from the XML of a run. (An alternative would be
     * to walk {@code cursor.getDomNode()} recursively.)
     *
     * <p>When the root element declares a namespace whose prefix equals
     * {@code NAMESPANCE_OF_TEXTBOX_IN_TABLE}, the {@code w:t} texts of every
     * {@code w:p} below {@code mc:Fallback} are concatenated, each paragraph
     * followed by {@link #NEW_LINE}.
     *
     * @param xml XML text to inspect
     * @return the collected text; an empty string when the prefix is not
     *         declared or the XML cannot be parsed (the parse error is logged)
     */
    private static String getXMLContent(String xml) {
        StringBuilder content = new StringBuilder();
        try {
            Document document = DocumentHelper.parseText(xml);

            // Look for the text-box namespace prefix among the root's declarations.
            boolean prefixDeclared = false;
            for (Object declared : document.getRootElement().declaredNamespaces()) {
                Namespace namespace = (Namespace) declared;
                if (NAMESPANCE_OF_TEXTBOX_IN_TABLE.equals(namespace.getPrefix())) {
                    prefixDeclared = true;
                    break;
                }
            }
            if (!prefixDeclared) {
                return content.toString();
            }

            for (Object paragraph : document.selectNodes("//mc:Fallback//w:p")) {
                for (Object textNode : ((Node) paragraph).selectNodes(".//w:t")) {
                    String piece = ((Node) textNode).getText();
                    if (StringUtils.isNotEmpty(piece)) {
                        content.append(piece);
                    }
                }
                // One output line per text-box paragraph.
                content.append(NEW_LINE);
            }
        } catch (DocumentException e) {
            logger.error("XML转化错误,内容:" + xml, e);
        }
        return content.toString();
    }

    /**
     * Manual entry point; the sample calls below are left commented out.
     *
     * @param args unused
     * @throws Exception declared for convenience when enabling the samples
     */
    public static void main(String[] args) throws Exception {
        // System.out.println(read("e://company/test.doc"));
        // System.out.println(read("e://company/test.docx"));
    }
}
|
参考文档:http://www.acgist.com/article/206.html