本文介绍 Word 2007 及以上版本(即 .docx 文件)的解析过程,主要是对 XWPFParagraph、XWPFTable 对象进行解析;文中代码也附带了 Word 2003(.doc)文件的处理分支。
一、文件上传
首先从前端传入 Word 文件,可以将文件上传到服务器、本地项目目录或临时路径。这样做主要是为了得到文件的绝对路径,以便把 MultipartFile 类型转为 File 类型。
二、文件解析
1、上传文件解析。这里的业务需求是:解析出来的字段中带有特殊标识,识别到这些标识后对其进行替换。
// Converts the Word file at `file` to HTML. Template markers are processed first, the
// converter output is written to the temporary file `filePath + htmlName`, read back in
// full, HTML-unescaped, stored in `newHtml`, and the temporary file is removed.
File f = new File(file);
if (!f.exists()) {
    throw new ServiceException("此文件不存在!");
}
// Compare the extension case-insensitively so ".Docx" and similar spellings also take the
// OOXML path instead of falling through to the legacy .doc parser.
if (f.getName().toLowerCase(java.util.Locale.ROOT).endsWith(".docx")) {
    // 1) Load the Word 2007+ document. The input stream is only needed while the
    //    document is constructed, so it is closed immediately afterwards.
    XWPFDocument document;
    try (InputStream in = new FileInputStream(f)) {
        document = new XWPFDocument(in);
    }
    // Process template markers in the body paragraphs.
    List<XWPFParagraph> xwpfParagraphs = document.getParagraphs();
    if (CollectionUtils.isNotEmpty(xwpfParagraphs)) {
        for (XWPFParagraph xwpfParagraph : xwpfParagraphs) {
            this.replaceRun(xwpfParagraph);
        }
    }
    // Process template markers in every paragraph of every table cell.
    List<XWPFTable> xwpfTables = document.getTables();
    if (CollectionUtils.isNotEmpty(xwpfTables)) {
        for (XWPFTable xwpfTable : xwpfTables) {
            if (null == xwpfTable) {
                continue;
            }
            for (XWPFTableRow xwpfTableRow : xwpfTable.getRows()) {
                if (null == xwpfTableRow) {
                    continue;
                }
                for (XWPFTableCell xwpfTableCell : xwpfTableRow.getTableCells()) {
                    if (null == xwpfTableCell) {
                        continue;
                    }
                    for (XWPFParagraph xwpfParagraph : xwpfTableCell.getParagraphs()) {
                        this.replaceRun(xwpfParagraph);
                    }
                }
            }
        }
    }
    // 2) XHTML options: the URI resolver and the image extractor both point at filePath,
    //    which is where extracted images are stored.
    File imageFolderFile = new File(filePath);
    XHTMLOptions options = XHTMLOptions.create().URIResolver(new FileURIResolver(imageFolderFile));
    options.setExtractor(new FileImageExtractor(imageFolderFile));
    options.setIgnoreStylesIfUnused(false);
    options.setFragment(true);
    // 3) Convert the XWPFDocument to XHTML through a temporary file.
    File newOutFile = new File(filePath + htmlName);
    try {
        // Close (and thereby flush) the output stream before the file is read back.
        try (OutputStream newout = new FileOutputStream(newOutFile)) {
            XHTMLConverter.getInstance().convert(document, newout, options);
        }
        // Read the whole file, not just its first line.
        // NOTE(review): decoded with the platform default charset, as before; confirm
        // which encoding XHTMLConverter actually writes.
        StringBuilder htmlBuffer = new StringBuilder();
        try (BufferedReader newin1 = new BufferedReader(new FileReader(newOutFile))) {
            char[] chunk = new char[4096];
            int read;
            while ((read = newin1.read(chunk)) != -1) {
                htmlBuffer.append(chunk, 0, read);
            }
        }
        // Undo HTML entity escaping that the converter applied to special characters.
        newHtml = StringEscapeUtils.unescapeHtml4(htmlBuffer.toString());
    } finally {
        // Remove the temporary HTML file whether or not the conversion succeeded.
        newOutFile.delete();
    }
} else {
    // Every other file is treated as a legacy .doc (Word 2003) document.
    HWPFDocument wordDocument;
    try (InputStream input = new FileInputStream(f)) {
        wordDocument = new HWPFDocument(input);
    }
    Range range = wordDocument.getRange();
    StringBuilder text = wordDocument.getText();
    String tableStr = text.toString();
    // Hand each string returned by match(...) to replacePragrahText together with the range.
    List<String> strs = match(tableStr);
    if (CollectionUtils.isNotEmpty(strs)) {
        for (String str : strs) {
            this.replacePragrahText(str, range);
        }
    }
    WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
    // Build the HTML DOM from the Word document.
    wordToHtmlConverter.processDocument(wordDocument);
    Document htmlDocument = wordToHtmlConverter.getDocument();
    // A ByteArrayOutputStream could be used instead of a temporary file to capture the output.
    File htmlFile = new File(filePath + htmlName);
    try {
        // Serialize the DOM as UTF-8 HTML; the stream is closed before the file is read back.
        try (OutputStream outStream = new FileOutputStream(htmlFile)) {
            DOMSource domSource = new DOMSource(htmlDocument);
            StreamResult streamResult = new StreamResult(outStream);
            TransformerFactory factory = TransformerFactory.newInstance();
            Transformer serializer = factory.newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(domSource, streamResult);
        }
        // INDENT=yes produces multi-line output, so the whole file is read, and it is
        // decoded as UTF-8 to match the encoding the serializer was told to write.
        StringBuilder htmlBuffer = new StringBuilder();
        try (BufferedReader in1 = java.nio.file.Files.newBufferedReader(htmlFile.toPath(), java.nio.charset.StandardCharsets.UTF_8)) {
            char[] chunk = new char[4096];
            int read;
            while ((read = in1.read(chunk)) != -1) {
                htmlBuffer.append(chunk, 0, read);
            }
        }
        // Store the result in newHtml, as the .docx branch does; it was previously discarded.
        newHtml = StringEscapeUtils.unescapeHtml4(htmlBuffer.toString());
    } finally {
        // Delete only after all streams on the file are closed.
        htmlFile.delete();
    }
}
/**
 * Walks the runs of a paragraph and, for every run whose text contains the marker
 * "标识", strips the first and last character of that text and splits the remainder
 * on '&amp;'. What happens with the split parts is left to the business logic.
 *
 * @param xwpfParagraph the paragraph to inspect; {@code null} or a paragraph without
 *                      runs is ignored
 */
public void replaceRun(XWPFParagraph xwpfParagraph) {
    if (xwpfParagraph == null) {
        return;
    }
    List<XWPFRun> runs = xwpfParagraph.getRuns();
    if (!CollectionUtils.isNotEmpty(runs)) {
        return;
    }
    for (XWPFRun run : runs) {
        String runText = run.toString();
        if (runText == null || !runText.contains("标识")) {
            continue;
        }
        // Drop one character at each end (presumably the marker's delimiters - confirm
        // against the template format), then split the payload into its '&'-separated parts.
        String payload = runText.substring(1, runText.length() - 1);
        String[] parts = payload.split("&");
        // Apply the business-specific checks/replacements on the parts here.
    }
}
2、字符串解析。这里使用正则表达式获取<<和>>符号之间的字符串。
// Alias of the input string; not used by the lines shown here.
String newStr = str;
// Collects, for every <<...>> occurrence, the full match followed by its inner text.
List<String> results = new ArrayList<String>();
// DOTALL lets '.' cross line breaks; the reluctant quantifier stops at the first ">>".
Matcher m = Pattern.compile("<<(.*?)>>", Pattern.DOTALL).matcher(str);
while (m.find()) {
    String wholeMatch = m.group();
    if (wholeMatch.equals("")) {
        continue;
    }
    // Entry including the << and >> delimiters.
    results.add(wholeMatch);
    // Entry without the << and >> delimiters.
    results.add(m.group(1));
}