目录
背景
- 在参与到软件设计的文档修改时,需要将docx文档中的数据表录入到xml文档中,在第一次录入时,一个字段一个字段的录入实在是太过麻烦,遇到有几十个字段的表,眼睛都看花了还是录不完,于是想着使用代码去读取相应的表格,将其中的数据表生成xml文件。
- 当然,在写脚本的时候,大家可能首先想到的是使用Python进行编写。由于我的电脑中没有安装Python环境,就使用已有的环境编写了Java代码进行转换。在读取表的时候,还有很多问题可以改进,但是由于文档的规范不同,处理还不是很全面,现在将自己的实现进行记录。
DOCX的数据表
表格式一
表格式二
针对遇到的两种格式的表,进行编写代码,生成相应的xml文件
代码实现
Docx2XMLUtil.java
package docx2xml;

import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFTable;
import org.apache.poi.xwpf.usermodel.XWPFTableCell;
import org.apache.poi.xwpf.usermodel.XWPFTableRow;

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;

/**
 * Converts the tables of a .docx document into XML {@code <table>} fragments.
 *
 * <p>Every table in the document produces one {@code <table>} element that
 * contains a block of {@code <rem>} comment lines followed by one
 * {@code <column>} element per data row. The fragments are appended to the
 * file configured through {@link #setXmlFileSavePath(String)} and echoed to
 * standard output.
 *
 * <p>Two table layouts are handled. The layout is chosen from the header row:
 * when a header cell contains the text "长度" the size is read from its own
 * column (cell 2); otherwise the size is parsed out of the type cell, e.g.
 * {@code VARCHAR(20)}. In both layouts cell 0 is the column id, cell 1 the
 * type, cell 3 the key/nullability remark and cell 4 the description.
 *
 * <p>Original author: StriveFarrell, 2019/12/4 15:22.
 */
public class Docx2XMLUtil {

    /** Path of the .docx file to read. */
    private String docxFilePath;

    /** Path of the XML file the generated fragments are appended to. */
    private String xmlFileSavePath;

    /** Name written into the generated {@code <rem>} header as the person entering the table. */
    private String author;

    public String getDocxFilePath() {
        return docxFilePath;
    }

    public void setDocxFilePath(String docxFilePath) {
        this.docxFilePath = docxFilePath;
    }

    public String getXmlFileSavePath() {
        return xmlFileSavePath;
    }

    public void setXmlFileSavePath(String xmlFileSavePath) {
        this.xmlFileSavePath = xmlFileSavePath;
    }

    public String getAuthor() {
        return author;
    }

    public void setAuthor(String author) {
        this.author = author;
    }

    /**
     * Entry point: reads the configured document and writes the XML fragments.
     *
     * <p>Dispatches on the file extension (case-insensitive). Any exception is
     * printed to standard error and not propagated, so callers never see a
     * failure as an exception.
     */
    public void getTableData() {
        try {
            // Locale.ROOT keeps the extension check independent of the default locale.
            String lowerPath = getDocxFilePath().toLowerCase(Locale.ROOT);
            if (lowerPath.endsWith(".docx")) {
                docx2xml();
            } else if (lowerPath.endsWith(".doc")) {
                doc2xml();
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Converts every table of the .docx file, in document order, and appends
     * each resulting fragment to the output file.
     *
     * @throws IOException if the document cannot be opened or read
     */
    private void docx2xml() throws IOException {
        String tableHeaderInfo = getTableHeader();
        String remInfo = getRemInfo();
        // Both the stream and the document are closed when the block exits,
        // including when a table fails to convert.
        try (InputStream in = new FileInputStream(getDocxFilePath());
             XWPFDocument document = new XWPFDocument(in)) {
            Iterator<XWPFTable> tabItr = document.getTablesIterator();
            int tableIndex = 1;
            while (tabItr.hasNext()) {
                XWPFTable table = tabItr.next();
                String xmlString = new StringBuilder(tableHeaderInfo)
                        .append(remInfo)
                        .append(getTableColumn(table))
                        .append(getEndTableTag())
                        .append("\n\n\n\n")
                        .toString();
                testPrint(String.valueOf(tableIndex), xmlString);
                saveXml(xmlString);
                tableIndex++;
            }
        }
    }

    /** Placeholder for the legacy binary .doc format; currently does nothing. */
    private void doc2xml() {
    }

    /**
     * Prints a labelled block of text to standard output.
     *
     * @param message label printed before the text
     * @param out     text to print
     */
    private void testPrint(String message, String out) {
        System.out.println(message + ":\n" + out);
    }

    /**
     * Returns today's date formatted as {@code yyyy.MM.dd}.
     *
     * @return the formatted current date
     */
    private String getDate() {
        SimpleDateFormat df = new SimpleDateFormat("yyyy.MM.dd");
        return df.format(new Date());
    }

    /**
     * Appends {@code data} to the configured output file.
     *
     * <p>The file is opened in append mode, so fragments from earlier tables
     * and earlier runs are kept. Text is written as UTF-8 rather than the
     * platform default charset, because the output contains non-ASCII text
     * and carries no XML encoding declaration. I/O failures are printed and
     * otherwise ignored so the remaining tables are still processed.
     *
     * @param data text to append
     */
    private void saveXml(String data) {
        try (Writer writer = new BufferedWriter(new OutputStreamWriter(
                new FileOutputStream(getXmlFileSavePath(), true), StandardCharsets.UTF_8))) {
            writer.write(data);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Tells whether a data type carries a size.
     *
     * @param cell data type name
     * @return {@code false} for DATETIME, TEXT, TIMESTAMP and LONGTEXT
     *         (case-insensitive), {@code true} for everything else
     */
    private boolean hasSize(String cell) {
        return !("DATETIME".equalsIgnoreCase(cell)
                || "TEXT".equalsIgnoreCase(cell)
                || "TIMESTAMP".equalsIgnoreCase(cell)
                || "LONGTEXT".equalsIgnoreCase(cell));
    }

    /**
     * Detects the table layout from its header row.
     *
     * @param header cells of the header row
     * @return {@code true} when some header cell contains "长度", i.e. the size
     *         lives in its own column; {@code false} when the size is embedded
     *         in the type cell
     */
    private boolean getTableStyle(List<XWPFTableCell> header) {
        for (XWPFTableCell headerCell : header) {
            if (headerCell.getText().contains("长度")) {
                return true;
            }
        }
        return false;
    }

    /**
     * Builds the opening {@code <table>} tag with empty id, javaId and name attributes.
     *
     * @return the opening tag followed by a newline
     */
    private String getTableHeader() {
        return "<table id=\"\" javaId=\"\" name=\"\" >\n";
    }

    /**
     * Returns the closing tag of a table element.
     *
     * @return {@code </table>}
     */
    private String getEndTableTag() {
        // XML end tags use a forward slash; the previous "<\table>" was not well-formed.
        return "</table>";
    }

    /**
     * Builds the {@code <rem>} header lines carrying the author and the current date.
     *
     * @return four tab-indented {@code <rem>} lines, each ending with a newline
     */
    private String getRemInfo() {
        String ruler = "\t<rem>====================================================================</rem>\n";
        // String.valueOf keeps the historical "null" text when no author was set.
        String safeAuthor = escapeXml(String.valueOf(author));
        return ruler
                + "\t<rem> 输入人:" + safeAuthor + "\t输入时间:" + getDate() + "</rem>\n"
                + "\t<rem>table description</rem>\n"
                + ruler;
    }

    /**
     * Converts the data rows of a table into {@code <column>} elements.
     *
     * <p>The first row is treated as the header and only used to detect the
     * layout. Cell text is trimmed and upper-cased before use, and escaped
     * before being written as an attribute value. Cells beyond index 4 are
     * ignored. Rows without cells are skipped; rows with fewer than five
     * cells still produce a properly terminated element with the attributes
     * that are available.
     *
     * @param table table to convert
     * @return the concatenated {@code <column>} lines, or an empty string when
     *         the table has no rows
     */
    private String getTableColumn(XWPFTable table) {
        List<XWPFTableRow> rowList = table.getRows();
        if (rowList == null || rowList.isEmpty()) {
            return "";
        }

        // true: size has its own column; false: size is embedded in the type cell.
        boolean separateSizeColumn = getTableStyle(rowList.get(0).getTableCells());

        StringBuilder columns = new StringBuilder();
        for (int i = 1; i < rowList.size(); i++) {
            List<XWPFTableCell> cellList = rowList.get(i).getTableCells();
            if (cellList == null || cellList.isEmpty()) {
                continue;
            }

            StringBuilder rowBuffer = new StringBuilder("\t<column ");
            // Set by the type cell and consumed by the size cell (separate-size layout only).
            boolean isHasSize = false;
            for (int j = 0; j < cellList.size(); j++) {
                String cell = cellList.get(j).getText().trim().toUpperCase(Locale.ROOT);
                switch (j) {
                    case 0:
                        rowBuffer.append(attribute("id", cell)).append(' ');
                        break;
                    case 1:
                        if (separateSizeColumn) {
                            isHasSize = hasSize(cell);
                            rowBuffer.append(attribute("type", cell)).append(' ');
                        } else {
                            appendTypeWithEmbeddedSize(rowBuffer, cell);
                        }
                        break;
                    case 2:
                        if (isHasSize) {
                            rowBuffer.append(attribute("size", cell)).append(' ');
                            isHasSize = false;
                        }
                        break;
                    case 3:
                        rowBuffer.append(attribute("primaryKey", String.valueOf(cell.contains("主键"))))
                                .append(' ');
                        rowBuffer.append(attribute("required", String.valueOf(cell.contains("非空"))))
                                .append(' ');
                        break;
                    case 4:
                        rowBuffer.append(attribute("name", cell));
                        break;
                    default:
                        break;
                }
            }
            // Always terminate the element, even when the row had fewer than five cells.
            rowBuffer.append(" />\n");
            columns.append(rowBuffer);
        }
        return columns.toString();
    }

    /**
     * Appends the type attribute, plus a size attribute when the cell has the
     * form {@code TYPE(SIZE)}.
     *
     * <p>A missing closing parenthesis is tolerated: everything after the
     * opening parenthesis is then taken as the size.
     *
     * @param rowBuffer buffer of the column element being built
     * @param cell      normalized text of the type cell
     */
    private void appendTypeWithEmbeddedSize(StringBuilder rowBuffer, String cell) {
        int open = cell.indexOf('(');
        if (open < 0) {
            rowBuffer.append(attribute("type", cell)).append(' ');
            return;
        }
        int close = cell.indexOf(')', open);
        if (close < 0) {
            close = cell.length();
        }
        String cellType = cell.substring(0, open).trim();
        String cellSize = cell.substring(open + 1, close).trim();
        rowBuffer.append(attribute("type", cellType)).append(' ');
        rowBuffer.append(attribute("size", cellSize)).append(' ');
    }

    /**
     * Formats one XML attribute as {@code name="value"} with the value escaped.
     *
     * @param name  attribute name
     * @param value raw attribute value
     * @return the formatted attribute without surrounding whitespace
     */
    private static String attribute(String name, String value) {
        return name + "=\"" + escapeXml(value) + "\"";
    }

    /**
     * Escapes the five XML special characters so the text can be placed in
     * element content or a double-quoted attribute value.
     *
     * @param value raw text, may be {@code null}
     * @return the escaped text, or an empty string for {@code null}
     */
    private static String escapeXml(String value) {
        if (value == null) {
            return "";
        }
        StringBuilder escaped = new StringBuilder(value.length());
        for (int i = 0; i < value.length(); i++) {
            char c = value.charAt(i);
            switch (c) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '"':
                    escaped.append("&quot;");
                    break;
                case '\'':
                    escaped.append("&apos;");
                    break;
                default:
                    escaped.append(c);
                    break;
            }
        }
        return escaped.toString();
    }
}
Docx2XMLUtilTest.java
package docx2xml; /** * @ClassName Docx2XMLUtilTest * @Author StriveFarrell * @Date 2019/12/4 16:12 * @Description * docx文档转换为xml文档的测试类 */ public class Docx2XMLUtilTest { //docx文件所在文件路径 private static final String docxFilePath = "D:\\MyFile\\workLearning\\spark\\src\\files\\docx2xml3.docx"; //生成的xml文件保存路径 private static final String xmlFileSavePath = "D:\\MyFile\\workLearning\\spark\\src\\files\\docx2xml3.xml"; //表格录入人 private static final String author = "Hello Table"; public static void main(String[] args){ Docx2XMLUtil util = new Docx2XMLUtil(); util.setDocxFilePath(docxFilePath); util.setAuthor(author); util.setXmlFileSavePath(xmlFileSavePath); util.getTableData(); } }
生成XML格式
表格式一XML
<table id="" javaId="" name="" > <rem>====================================================================</rem> <rem> 输入人:Hello Table 输入时间:2019.12.05</rem> <rem>table description</rem> <rem>====================================================================</rem> <column id="COLUMN_ID" type="VARCHAR" size="20" primaryKey="true" required="true" name="信息项定义主键" /> <column id="IS_PRIMARY_KEY" type="INT" size="2" primaryKey="false" required="true" name="表明是否为主键(0:否;1:是)" /> <column id="NOT_NULL" type="INT" size="2" primaryKey="false" required="true" name="0:可以为空;1:不可为空。" /> <column id="LENGTH" type="VARCHAR" size="11" primaryKey="false" required="true" name="长度" /> <column id="TYPE" type="VARCHAR" size="22" primaryKey="false" required="true" name="类型" /> <column id="NAME_EN" type="VARCHAR" size="2048" primaryKey="false" required="false" name="英文名称" /> <column id="NAME_CN" type="VARCHAR" size="2048" primaryKey="false" required="false" name="中文名称" /> <column id="CATA_ID" type="VARCHAR" size="20" primaryKey="false" required="true" name="目录ID" /> <\table>
表格式二XML
<table id="" javaId="" name="" > <rem>====================================================================</rem> <rem> 输入人:zhangqx02 输入时间:2019.12.05</rem> <rem>table description</rem> <rem>====================================================================</rem> <column id="COLUMN_ID" type="VARCHAR" size="20" primaryKey="true" required="true" name="信息项定义主键" /> <column id="IS_PRIMARY_KEY" type="INT" size="2" primaryKey="false" required="true" name="表明是否为主键(0:否;1:是)" /> <column id="NOT_NULL" type="INT" size="2" primaryKey="false" required="true" name="0:可以为空;1:不可为空。" /> <column id="LENGTH" type="VARCHAR" size="11" primaryKey="false" required="true" name="长度" /> <column id="TYPE" type="VARCHAR" size="22" primaryKey="false" required="true" name="类型" /> <column id="NAME_EN" type="VARCHAR" size="2048" primaryKey="false" required="false" name="英文名称" /> <column id="NAME_CN" type="VARCHAR" size="2048" primaryKey="false" required="false" name="中文名称" /> <column id="CATA_ID" type="VARCHAR" size="20" primaryKey="false" required="true" name="目录ID" /> <\table>
这个工具还有很多不完善的地方,比如没有生成table标签的id、javaId和name等字段的值,以后有时间再去处理。
附件
表格式一
- 下载地址:表格式一
表格式二
- 下载地址:表格式二
总结
- 将docx数据表录入到xml中,如果纯手动录入是一件枯燥头大的事情,一不小心就搞得自己眼花缭乱了。
- 实现的方式还不是很完整,还有很多可以改进的地方。