如下代码使用 Java 的 Apache POI 解析 Word 文档,并生成 XML 格式的文档。此代码可以编译通过,但运行时存在问题,读者可以亲自试一试,看看能否修复其中的 bug:
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.nio.charset.Charset;
import java.util.ArrayDeque;
import java.util.Deque;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.model.StyleDescription;
import org.apache.poi.hwpf.model.StyleSheet;
import org.apache.poi.hwpf.usermodel.CharacterRun;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
/**
 * Converts a binary Word document, read through POI's HWPF API, into an XML document that
 * declares the "Documentation V1.1" DTD (see {@link #init()}).
 *
 * <p>The whole conversion runs inside the constructor. Paragraphs are classified as follows:
 * <ul>
 *   <li>a paragraph whose style name is "Heading" followed by a positive number opens a new
 *       {@code <section>} and is written as its {@code <title>};</li>
 *   <li>a paragraph whose first character run uses a font whose name starts with "Courier" is
 *       appended to a {@code <source>} CDATA block;</li>
 *   <li>every other non-blank paragraph is written as a {@code <p>} element.</li>
 * </ul>
 * Blank paragraphs are skipped.
 */
public final class Word2Forrest {
    /** Prefix of the paragraph style names that mark section headings, e.g. "Heading 1". */
    private static final String HEADING_PREFIX = "Heading";

    /** Largest number of digits accepted after the prefix; keeps {@code parseInt} from overflowing. */
    private static final int MAX_LEVEL_DIGITS = 9;

    Writer _out;
    HWPFDocument _doc;

    /**
     * Heading levels of the {@code <section>} elements that are currently open, innermost on
     * top. Tracking the real nesting (instead of assuming "depth == last heading level") keeps
     * the output balanced even when a document skips heading levels.
     */
    private final Deque<Integer> _openSections = new ArrayDeque<Integer>();

    /** True while a {@code <source><![CDATA[} block has been opened and not yet closed. */
    private boolean _inCode;

    /**
     * Converts {@code doc} and writes the resulting XML to {@code stream} as UTF-8.
     *
     * <p>The stream is flushed but not closed; closing it remains the caller's job.
     *
     * @param doc    the Word document to convert
     * @param stream destination of the generated XML
     * @throws IOException if writing to {@code stream} fails
     */
    public Word2Forrest(HWPFDocument doc, OutputStream stream) throws IOException {
        _out = new OutputStreamWriter(stream, Charset.forName("UTF-8"));
        _doc = doc;

        init();
        openDocument();
        openBody();

        Range range = doc.getRange();
        StyleSheet styleSheet = doc.getStyleSheet();
        int paragraphCount = range.numParagraphs();
        for (int i = 0; i < paragraphCount; i++) {
            Paragraph paragraph = range.getParagraph(i);
            String text = paragraph.text();
            if (text == null || text.trim().length() == 0) {
                continue;
            }

            int level = headingLevel(styleNameOf(styleSheet, paragraph));
            if (level > 0) {
                writeHeading(level, text);
            } else if (isCode(paragraph)) {
                writeCode(text);
            } else {
                writeBodyParagraph(text);
            }
        }

        // A document may end in the middle of a code listing; close it before the sections.
        closeCodeIfOpen();
        while (!_openSections.isEmpty()) {
            _openSections.pop();
            closeSection();
        }
        closeBody();
        closeDocument();
        _out.flush();
    }

    /** Returns the paragraph's style name, or {@code null} when no style information is available. */
    private static String styleNameOf(StyleSheet styleSheet, Paragraph paragraph) {
        if (styleSheet == null) {
            return null;
        }
        StyleDescription style = styleSheet.getStyleDescription(paragraph.getStyleIndex());
        return style == null ? null : style.getName();
    }

    /**
     * Extracts the heading level from a style name such as "Heading 2".
     *
     * <p>The prefix is matched case-insensitively and the space before the number is optional.
     * Names without a positive number after the prefix (for example "Heading" or
     * "Heading 1 Char") are not treated as headings.
     *
     * @return the level (1 or greater), or -1 if {@code styleName} does not denote a heading
     */
    private static int headingLevel(String styleName) {
        if (styleName == null
                || !styleName.regionMatches(true, 0, HEADING_PREFIX, 0, HEADING_PREFIX.length())) {
            return -1;
        }
        String suffix = styleName.substring(HEADING_PREFIX.length()).trim();
        if (suffix.length() == 0 || suffix.length() > MAX_LEVEL_DIGITS) {
            return -1;
        }
        for (int i = 0; i < suffix.length(); i++) {
            char c = suffix.charAt(i);
            if (c < '0' || c > '9') {
                return -1;
            }
        }
        int level = Integer.parseInt(suffix);
        return level > 0 ? level : -1;
    }

    /** Tells whether the paragraph's first character run uses a font whose name starts with "Courier". */
    private static boolean isCode(Paragraph paragraph) {
        if (paragraph.numCharacterRuns() == 0) {
            return false;
        }
        CharacterRun firstRun = paragraph.getCharacterRun(0);
        String fontName = firstRun.getFontName();
        return fontName != null && fontName.startsWith("Courier");
    }

    /** Closes every open section at the same or a deeper level, then opens a new titled section. */
    private void writeHeading(int level, String text) throws IOException {
        closeCodeIfOpen();
        while (!_openSections.isEmpty() && _openSections.peek().intValue() >= level) {
            _openSections.pop();
            closeSection();
        }
        _openSections.push(Integer.valueOf(level));
        openSection();

        openTitle();
        System.out.println("++++++" + text);
        writePlainText(escapeXml(text));
        closeTitle();
    }

    /** Appends the text to the current source block, opening the block first if necessary. */
    private void writeCode(String text) throws IOException {
        if (!_inCode) {
            openSource();
            _inCode = true;
        }
        System.out.println("------" + text);
        writePlainText(cdataSafe(text));
    }

    /** Writes the text as one {@code <p>} element, ending any open source block first. */
    private void writeBodyParagraph(String text) throws IOException {
        closeCodeIfOpen();
        openParagraph();
        System.out.println("******" + text);
        writePlainText(escapeXml(text));
        closeParagraph();
    }

    /** Ends the current source block, if there is one. */
    private void closeCodeIfOpen() throws IOException {
        if (_inCode) {
            closeSource();
            _inCode = false;
        }
    }

    /** Tells whether the code point is allowed by the XML 1.0 {@code Char} production. */
    private static boolean isXmlChar(int codePoint) {
        return codePoint == 0x9 || codePoint == 0xA || codePoint == 0xD
                || (codePoint >= 0x20 && codePoint <= 0xD7FF)
                || (codePoint >= 0xE000 && codePoint <= 0xFFFD)
                || (codePoint >= 0x10000 && codePoint <= 0x10FFFF);
    }

    /**
     * Prepares text for element content: drops characters XML 1.0 forbids and escapes
     * {@code &}, {@code <} and {@code >}.
     */
    private static String escapeXml(String text) {
        StringBuilder escaped = new StringBuilder(text.length() + 16);
        int i = 0;
        while (i < text.length()) {
            int codePoint = text.codePointAt(i);
            i += Character.charCount(codePoint);
            if (!isXmlChar(codePoint)) {
                continue;
            }
            switch (codePoint) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                default:
                    escaped.appendCodePoint(codePoint);
                    break;
            }
        }
        return escaped.toString();
    }

    /**
     * Prepares text for a CDATA section: drops characters XML 1.0 forbids and splits any
     * "]]>" so that it cannot terminate the section early. Markup characters are left as they
     * are because CDATA content is not parsed.
     */
    private static String cdataSafe(String text) {
        StringBuilder kept = new StringBuilder(text.length());
        int i = 0;
        while (i < text.length()) {
            int codePoint = text.codePointAt(i);
            i += Character.charCount(codePoint);
            if (isXmlChar(codePoint)) {
                kept.appendCodePoint(codePoint);
            }
        }
        return kept.toString().replace("]]>", "]]]]><![CDATA[>");
    }

    /**
     * Writes the XML declaration and the DOCTYPE line.
     *
     * @throws IOException if the underlying writer fails
     */
    public void init() throws IOException {
        _out.write("<?xml version=\"1.0\" encoding=\"UTF-8\"?>\r\n");
        _out.write(
                "<!DOCTYPE document PUBLIC \"-//APACHE//DTD Documentation V1.1//EN\" \"./dtd/document-v11.dtd\">\r\n");
    }

    /** Writes the opening {@code <document>} tag. */
    public void openDocument() throws IOException {
        _out.write("<document>\r\n");
    }

    /** Writes the closing {@code </document>} tag. */
    public void closeDocument() throws IOException {
        _out.write("</document>\r\n");
    }

    /** Writes the opening {@code <body>} tag. */
    public void openBody() throws IOException {
        _out.write("<body>\r\n");
    }

    /** Writes the closing {@code </body>} tag. */
    public void closeBody() throws IOException {
        _out.write("</body>\r\n");
    }

    /** Writes the opening {@code <section>} tag. */
    public void openSection() throws IOException {
        _out.write("<section>");
    }

    /** Writes the closing {@code </section>} tag. */
    public void closeSection() throws IOException {
        _out.write("</section>");
    }

    /** Writes the opening {@code <title>} tag. */
    public void openTitle() throws IOException {
        _out.write("<title>");
    }

    /** Writes the closing {@code </title>} tag. */
    public void closeTitle() throws IOException {
        _out.write("</title>");
    }

    /**
     * Writes {@code text} to the output exactly as given, without any XML escaping.
     *
     * @param text the characters to write
     * @throws IOException if the underlying writer fails
     */
    public void writePlainText(String text) throws IOException {
        _out.write(text);
    }

    /** Writes the opening {@code <p>} tag. */
    public void openParagraph() throws IOException {
        _out.write("<p>");
    }

    /** Writes the closing {@code </p>} tag. */
    public void closeParagraph() throws IOException {
        _out.write("</p>");
    }

    /** Writes the opening {@code <source>} tag together with the start of a CDATA section. */
    public void openSource() throws IOException {
        _out.write("<source><![CDATA[");
    }

    /** Writes the end of the CDATA section together with the closing {@code </source>} tag. */
    public void closeSource() throws IOException {
        _out.write("]]></source>");
    }

    /**
     * Converts the hard-coded input file to the hard-coded output file.
     *
     * @param args ignored
     * @throws IOException if either file cannot be opened, read or written
     */
    public static void main(String[] args) throws IOException {
        InputStream is = new FileInputStream("D:/QMDownload/hwpftest.doc");
        try {
            // Opened inside the outer try so the input is released even if this open fails.
            OutputStream out = new FileOutputStream("D:/QMDownload/test.xml");
            try {
                new Word2Forrest(new HWPFDocument(is), out);
            } finally {
                out.close();
            }
        } finally {
            is.close();
        }
    }
}