## 把 docx 里面的文字、公式和图片转成 HTML,大体需要如下步骤
- 把 docx 文件读入为 POI 的 XWPFDocument
InputStream is=new FileInputStream("d:\\1.docx");
XWPFDocument docx = new XWPFDocument(is);
- 得到内容的列表,包括XWPFParagraph和XWPFTable 通过BodyElementType区分
// Iterate over the body elements; each one is dispatched by its element type
// to the paragraph or the table handler.
List<IBodyElement> eles = docx.getBodyElements();
for (IBodyElement e : eles) {
    if (e.getElementType() == BodyElementType.PARAGRAPH) {
        // Typed view of the element; the paragraph-level snippets below use p.
        XWPFParagraph p = (XWPFParagraph) e;
        handleParagraph(e);
    } else if (e.getElementType() == BodyElementType.TABLE) {
        handleTable(e);
    }
}
- 得到XWPFParagraph 后,通过如下两个方法得到XWPFParagraph里面的具体内容
List<XWPFRun> runs = p.getRuns();//文本和图片
List<XWPFPicture> pics = run.getEmbeddedPictures();//得到所有图片
再把图片保存起来就可以了。
List<CTOMath> oMathList = p.getCTP().getOMathList();// formulas contained in the paragraph
// Formulas are more involved: a CTOMath is an XmlObject holding OMML markup. It has to be
// converted to MathML first; the MathML is then rendered to a PNG and saved to disk.
//把XmlObject转成MathML
/**
 * Converts an OMML object into a MathML string suitable for embedding in HTML.
 *
 * <p>The {@code OMML2MML.XSL} stylesheet is loaded from the classpath and applied to the DOM
 * node behind {@code xmlObject}. The serialized result is then stripped of the OMML namespace
 * declaration and of the {@code mml:} prefix.
 *
 * @param xmlObject the OMML object to convert (for example a {@code CTOMath})
 * @return the MathML markup, without an XML declaration
 * @throws Exception if the stylesheet cannot be found on the classpath or the transformation
 *     fails
 */
private static String getMathML(XmlObject xmlObject) throws Exception {
    final String xslFile = "/cn/com/eduedu/jee/util/OMML2MML.XSL";
    StringWriter stringwriter = new StringWriter();
    // The stylesheet stream is closed once the transformation is done.
    try (InputStream xsl = MSDocxUtils.class.getResourceAsStream(xslFile)) {
        if (xsl == null) {
            // Fail with a clear message instead of an obscure transformer error.
            throw new IOException("Stylesheet not found on classpath: " + xslFile);
        }
        Transformer transformer = TransformerFactory.newInstance().newTransformer(new StreamSource(xsl));
        transformer.setOutputProperty("omit-xml-declaration", "yes");
        transformer.transform(new DOMSource(xmlObject.getDomNode()), new StreamResult(stringwriter));
    }
    String mathML = stringwriter.toString();
    // The native OMML2MML.XSL produces MathML as XML with dedicated namespaces.
    // They are not wanted when the MathML is used in HTML rather than in XML.
    // Ideally the stylesheet itself would be changed; to keep this example simple
    // the XML specifics are removed with plain (literal, non-regex) replacements.
    mathML = mathML.replace("xmlns:m=\"http://schemas.openxmlformats.org/officeDocument/2006/math\"", "");
    mathML = mathML.replace("xmlns:mml", "xmlns");
    mathML = mathML.replace("mml:", "");
    return mathML;
}
//MathML转成Document
/**
 * Parses an XML string (here: MathML) into a DOM document.
 *
 * <p>Input containing a DOCTYPE declaration is rejected, so entity declarations in the text
 * cannot be used to read local files or reach remote hosts (XXE).
 *
 * @param xmlStr the XML text to parse
 * @return the parsed document, or {@code null} if the text cannot be parsed; the cause is
 *     printed to standard error
 */
private static Document convertStringToDocument(String xmlStr) {
    try {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        // Refuse DOCTYPE declarations altogether; MathML fragments do not need one.
        factory.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
        DocumentBuilder builder = factory.newDocumentBuilder();
        return builder.parse(new InputSource(new StringReader(xmlStr)));
    } catch (Exception e) {
        // Best effort by contract: report the problem and signal failure with null.
        e.printStackTrace();
        return null;
    }
}
//最后的代码是这样的
/**
 * Renders one OMML formula as a PNG image and returns the HTML {@code <img>} tag for it.
 *
 * <p>The formula is converted to MathML, parsed into a DOM document, rendered to PNG bytes
 * with JEuclid and handed to {@code imageParser} under the name {@code png_<n>.png}, where
 * {@code n} is the value of the {@code pngNumber} counter (declared outside this method)
 * after it has been incremented.
 *
 * @param xmlObject the OMML object to render
 * @param imageParser receives the PNG bytes and the file name; its return value is used as
 *     the {@code src} of the image (presumably the stored path or URL; confirm against the
 *     implementation)
 * @return an {@code <img>} tag whose {@code src} is the value returned by {@code imageParser}
 * @throws Exception if the MathML cannot be produced or parsed, or if rendering fails
 */
private static String convertOmathToPng(XmlObject xmlObject,MSDocxToHtmlImageParser imageParser) throws Exception {
    pngNumber++;
    Document document = convertStringToDocument(getMathML(xmlObject));
    if (document == null) {
        // convertStringToDocument signals a parse failure with null; stop here with a clear
        // message instead of passing null on to the renderer.
        throw new IOException("MathML of formula " + pngNumber + " could not be parsed");
    }
    LayoutContextImpl layoutContext = new LayoutContextImpl(LayoutContextImpl.getDefaultLayoutContext());
    layoutContext.setParameter(Parameter.MATHSIZE, 18);
    try (ByteArrayOutputStream os = new ByteArrayOutputStream()) {
        // JEuclid's Converter returns null when the conversion did not succeed; without this
        // check an empty image would be stored and referenced.
        if (Converter.getInstance().convert(document, os, "image/png", layoutContext) == null) {
            throw new IOException("Formula " + pngNumber + " could not be rendered to PNG");
        }
        String pngName = imageParser.parse(os.toByteArray(), "png_" + pngNumber + ".png");
        return "<img src='" + pngName + "'/>";
    }
}
### 用到的所有包
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.io.StringWriter;
import java.math.BigInteger;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import org.apache.poi.POIXMLProperties;
import org.apache.poi.xwpf.usermodel.BodyElementType;
import org.apache.poi.xwpf.usermodel.IBodyElement;
import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
import org.apache.poi.xwpf.usermodel.VerticalAlign;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
import org.apache.poi.xwpf.usermodel.XWPFPicture;
import org.apache.poi.xwpf.usermodel.XWPFRun;
import org.apache.poi.xwpf.usermodel.XWPFStyles;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;
import org.apache.xmlbeans.XmlObject;
import org.openxmlformats.schemas.officeDocument.x2006.math.CTOMath;
import org.openxmlformats.schemas.officeDocument.x2006.math.CTOMathPara;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import cn.com.eduedu.jee.util.wordnumber.IWordNumber;
import cn.com.eduedu.jee.util.wordnumber.WordNumberFactory;
import net.sourceforge.jeuclid.context.LayoutContextImpl;
import net.sourceforge.jeuclid.context.Parameter;
import net.sourceforge.jeuclid.converter.Converter;
这里有更详细的文章
这里有更详细的文章,能解决更多的问题