今天写了一个400行左右的模块,功能小而精,主要将html片段里面的table标签的内容解析出来,转换成xls文件,其实,程序花点时间总能写出来,不论是谁。只是
有的人花的时间少,有的人花的时间多。不过,我感觉有的人确实写不出来,比如常荣虎,比如盛丽兰,这样的技术领导者,也能称为领导?技术都难以服众,更别提其他的了。
所以,为何许多人离职,估计就是因为这些人缺乏技术领导力吧。
这个功能精髓的地方就是一个游标的定义Cursor,具有向前和向下的功能,而且为了解决路径的记忆功能,提供了fork能力,就是复制当前Cursor的功能,fork出来的Cursor继续前进,而且,这里将所有走过的路径都记录下来,这是为了能够在rowspan和colspan大于1的时候能够找到下一个没有使用的块。
这里,为了支持块的遍历,定义了Position类,使用了不可变类的思路。
虽然只有400行的程序,但感觉还是比较精彩,对面向对象的把握还是有相当的水准的。自己的面向对象理解程度感觉已经能够完成一般代码的编写了,就差集中编写类似中间件类的项目了,那才是真正锻炼面向对象能力的时候。
主要依赖jsoup,commons,jxl
<dependency>
<groupId>jxl</groupId>
<artifactId>jxl</artifactId>
<version>2.6.12</version>
</dependency>
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.9.2</version>
</dependency>
<dependency>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
<version>2.6</version>
</dependency>
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.6</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.7.9</version>
</dependency>
<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>
主要代码:
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.UUID;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import jxl.Workbook;
import jxl.format.UnderlineStyle;
import jxl.write.Label;
import jxl.write.WritableCellFormat;
import jxl.write.WritableFont;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import jxl.write.WriteException;
import jxl.write.biff.RowsExceededException;

/**
 * Parses an HTML string and converts the first {@code <table>} it contains into
 * an {@code .xls} file created in the system temp directory.
 *
 * <p>The {@code th} cells of the first {@code thead} are written first, followed by
 * the {@code td} cells of every {@code tbody > tr}. {@code rowspan} and
 * {@code colspan} attributes are turned into merged cell regions.
 *
 * <p>An instance is meant to be used once: {@link #parse()} closes the workbook.
 * Failures are logged rather than thrown, so the returned file may be empty or
 * incomplete when something went wrong.
 *
 * @author 魏广跃(1571)
 * @date 2018-07-03 17:38:49
 * @version V1.0
 */
public class HtmlTableToXlsParser {

    private static final Logger log = LoggerFactory.getLogger(HtmlTableToXlsParser.class);

    /** Width passed to {@code setColumnView} for every column that receives a cell. */
    private static final int COLUMN_WIDTH = 20;

    private Document document;
    private Elements tableElements;
    private Iterator<Element> tableIterator;
    private WritableWorkbook workbook;
    private WritableSheet sheet;
    private File newXlsFile;

    /** Cell font: Arial 11, not bold, not italic, no underline, black. */
    private WritableFont wf_title = new WritableFont(WritableFont.ARIAL, 11, WritableFont.NO_BOLD, false,
            UnderlineStyle.NO_UNDERLINE, jxl.format.Colour.BLACK);

    /** Cell format shared by every cell that is written. */
    private WritableCellFormat wcf_title = new WritableCellFormat(wf_title);

    /**
     * Parses {@code html} and prepares an empty workbook backed by a new,
     * randomly named {@code .xls} file in the temp directory.
     *
     * <p>If the workbook cannot be created the error is logged and the instance
     * is left without a workbook; {@link #parse()} then writes nothing.
     *
     * @param html the HTML fragment or document containing the table
     */
    public HtmlTableToXlsParser(String html) {
        this.document = Jsoup.parse(html);
        this.tableElements = document.select("table");
        this.tableIterator = this.tableElements.iterator();
        File tempDir = FileUtils.getTempDirectory();
        String newXlsFileName = UUID.randomUUID().toString() + ".xls";
        newXlsFile = new File(tempDir, newXlsFileName);
        try {
            newXlsFile.createNewFile();
            this.workbook = Workbook.createWorkbook(newXlsFile);
            this.sheet = workbook.createSheet("default", 0);
        } catch (IOException e) {
            log.error("创建Workbook异常", e);
        }
        try {
            this.initStyle();
        } catch (WriteException e) {
            log.error("初始化样式异常", e);
        }
    }

    /**
     * Configures the shared cell format: centred text and a thin black border
     * on all four sides.
     *
     * @throws WriteException if the format rejects one of the settings
     */
    private void initStyle() throws WriteException {
        this.wcf_title.setAlignment(jxl.format.Alignment.CENTRE);
        this.wcf_title.setBorder(jxl.format.Border.ALL, jxl.format.BorderLineStyle.THIN,
                jxl.format.Colour.BLACK);
    }

    /**
     * Converts the next table of the document and writes the workbook to disk.
     *
     * <p>Header and body are converted independently: a failure in one is logged
     * and does not prevent the other from being attempted. A document without a
     * {@code <table>}, or a table without a {@code <thead>}, is tolerated.
     *
     * @return the {@code .xls} file in the temp directory; it is returned even
     *         when conversion or writing failed
     */
    public File parse() {
        if (this.workbook == null || this.sheet == null) {
            // The constructor already logged the cause; avoid a cascade of NPEs.
            log.error("Workbook was not created, nothing can be written to {}", this.newXlsFile);
            return this.newXlsFile;
        }
        if (this.tableIterator.hasNext()) {
            convertTable(this.tableIterator.next());
        } else {
            log.warn("No <table> element found, the workbook will be empty");
        }
        writeAndClose();
        return this.newXlsFile;
    }

    /** Writes the header rows and then the body rows of {@code table} into the sheet. */
    private void convertTable(Element table) {
        Cursor cursor = new Cursor();
        Element threadElement = table.select("thead").first();
        if (threadElement != null) {
            try {
                this.parseThread(threadElement, cursor);
            } catch (Exception e) {
                log.error("解析head标签异常", e);
            }
        }
        Elements trs = table.select("tbody>tr");
        try {
            this.parseTr(trs, cursor);
        } catch (Exception e1) {
            log.error("解析tr标签异常", e1);
        }
    }

    /** Flushes the workbook to its file and always closes it, even if the write failed. */
    private void writeAndClose() {
        try {
            this.workbook.write();
        } catch (Exception e) {
            log.error("写xls文件异常", e);
        } finally {
            try {
                this.workbook.close();
            } catch (Exception e) {
                log.error("写xls文件异常", e);
            }
        }
    }

    /**
     * Writes the {@code th} cells of every {@code tr} inside the given
     * {@code thead} element.
     *
     * @param threadElement the {@code thead} element
     * @param cursor        cursor positioned on the first header cell
     * @throws Exception if a cell cannot be added or merged
     */
    private void parseThread(Element threadElement, Cursor cursor) throws Exception {
        writeRows(threadElement.select("tr"), "th", cursor);
    }

    /**
     * Writes the {@code td} cells of the given body rows.
     *
     * @param trs    the {@code tr} elements of the table body
     * @param cursor cursor positioned on the first body cell
     * @throws Exception if a cell cannot be added or merged
     */
    private void parseTr(Elements trs, Cursor cursor) throws Exception {
        writeRows(trs, "td", cursor);
    }

    /**
     * Writes one sheet row per {@code tr} that contains at least one cell
     * matching {@code cellSelector}; rows without such cells do not consume a
     * sheet row.
     *
     * <p>{@code rowspan} is clipped to the rows remaining in {@code trs}, so a
     * span can never reach into the rows written by a later call.
     */
    private void writeRows(Elements trs, String cellSelector, Cursor cursor) throws WriteException {
        List<Elements> rows = new ArrayList<Elements>();
        for (Element tr : trs) {
            Elements cells = tr.select(cellSelector);
            if (!cells.isEmpty()) {
                rows.add(cells);
            }
        }
        for (int i = 0; i < rows.size(); i++) {
            int remainingRows = rows.size() - i;
            for (Element cell : rows.get(i)) {
                int colspan = parseSpan(cell.attr("colspan"));
                int rowspan = Math.min(parseSpan(cell.attr("rowspan")), remainingRows);
                writeCell(cursor, cell.text(), colspan, rowspan);
            }
            // Continue on the first unused cell of the following row.
            cursor.next();
        }
    }

    /**
     * Reads a {@code rowspan}/{@code colspan} attribute value.
     *
     * @return the parsed span, or 1 when the value is blank, not a number, or
     *         smaller than 1
     */
    private static int parseSpan(String raw) {
        if (StringUtils.isBlank(raw)) {
            return 1;
        }
        try {
            int span = Integer.parseInt(raw.trim());
            return span < 1 ? 1 : span;
        } catch (NumberFormatException e) {
            return 1;
        }
    }

    /**
     * Writes one HTML cell as a {@code colspan} x {@code rowspan} rectangle whose
     * top-left corner is the cursor position, then moves the cursor to the next
     * unused cell to the right on the same row.
     *
     * <p>Only the top-left cell carries {@code text}; the remaining cells of the
     * rectangle receive an empty label with the same format, and the rectangle is
     * merged when it covers more than one cell.
     */
    private void writeCell(Cursor cursor, String text, int colspan, int rowspan) throws WriteException {
        int startCol = cursor.current.col;
        int startRow = cursor.current.row;
        int endCol = startCol + colspan - 1;
        int endRow = startRow + rowspan - 1;

        // Forked cursors walk the rectangle and record every covered cell in the
        // shared history; they never step outside the rectangle.
        Cursor rowCursor = cursor.fork();
        for (int r = 0; r < rowspan; r++) {
            Cursor cellCursor = rowCursor.fork();
            for (int c = 0; c < colspan; c++) {
                String content = (r == 0 && c == 0) ? text : "";
                Label label = new Label(cellCursor.current.col, cellCursor.current.row, content, this.wcf_title);
                sheet.addCell(label);
                if (r == 0) {
                    sheet.setColumnView(cellCursor.current.col, COLUMN_WIDTH);
                }
                if (c < colspan - 1) {
                    cellCursor.right();
                }
            }
            if (r < rowspan - 1) {
                rowCursor.down();
            }
        }
        if (colspan > 1 || rowspan > 1) {
            sheet.mergeCells(startCol, startRow, endCol, endRow);
        }
        cursor.advanceTo(endCol + 1);
    }

    /**
     * Cursor over sheet cells. It can step right and down, and it remembers every
     * position that it, or any cursor forked from it, has been on, so that cells
     * covered by a span are not handed out a second time.
     *
     * @author 魏广跃(1571)
     * @date 2018-07-04 12:38:42
     * @version V1.0
     */
    private static final class Cursor {

        private Position current;

        /** Positions already taken; shared between a cursor and all of its forks. */
        private final Set<Position> history;

        /** Creates a cursor on column 0, row 0 with a fresh history. */
        Cursor() {
            this(new Position(0, 0), new HashSet<Position>());
        }

        private Cursor(Position start, Set<Position> history) {
            this.current = start;
            this.history = history;
            this.history.add(start);
        }

        /**
         * Creates a second cursor on the same position that shares this cursor's
         * history; moving the fork does not move this cursor.
         */
        Cursor fork() {
            return new Cursor(this.current, this.history);
        }

        /** Moves to the first unused cell of the next row, searching from column 0. */
        void next() {
            moveToFirstFree(0, this.current.row + 1);
        }

        /** Moves to the first unused cell of the current row at or after {@code col}. */
        void advanceTo(int col) {
            moveToFirstFree(col, this.current.row);
        }

        /** Steps one row down in the same column, whether or not that cell is used. */
        void down() {
            step(this.current.col, this.current.row + 1);
        }

        /** Steps one column right in the same row, whether or not that cell is used. */
        void right() {
            step(this.current.col + 1, this.current.row);
        }

        private void step(int col, int row) {
            this.current = new Position(col, row);
            this.history.add(this.current);
        }

        private void moveToFirstFree(int col, int row) {
            Position candidate = new Position(col, row);
            while (this.history.contains(candidate)) {
                candidate = new Position(candidate.col + 1, row);
            }
            this.current = candidate;
            this.history.add(candidate);
        }

        @Override
        public String toString() {
            return "Cursor [current=" + current + "]";
        }
    }

    /** Immutable (column, row) coordinate, usable as a hash-set element. */
    private static final class Position {

        final int col;
        final int row;

        Position(int col, int row) {
            this.col = col;
            this.row = row;
        }

        @Override
        public String toString() {
            return "Position [col=" + col + ", row=" + row + "]";
        }

        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            result = prime * result + col;
            result = prime * result + row;
            return result;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            Position other = (Position) obj;
            return col == other.col && row == other.row;
        }
    }
}
测试代码:
import java.io.File;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.junit.Assert;
import org.junit.Test;

import com.zfsoft.tjcx.html.HtmlTableToXlsParser;

public class HtmlTableParserTest {

    /**
     * Converts the bundled demo.html and checks that a non-empty xls file is
     * produced. Exceptions propagate so that a failure actually fails the test.
     */
    @Test
    public void test() throws Exception {
        try (InputStream inputStream =
                HtmlTableToXlsParser.class.getResourceAsStream("/com/zfsoft/tjcx/demo.html")) {
            Assert.assertNotNull("demo.html not found on the classpath", inputStream);
            // demo.html declares charset=UTF-8; do not rely on the platform default.
            String html = new String(IOUtils.toByteArray(inputStream), StandardCharsets.UTF_8);
            HtmlTableToXlsParser parser = new HtmlTableToXlsParser(html);
            File f = parser.parse();
            Assert.assertTrue("no xls content was written", f.isFile() && f.length() > 0);
            // Keep a copy under a fixed name for manual inspection, in a location
            // that exists and is writable on every platform.
            FileUtils.copyFile(f, new File(FileUtils.getTempDirectory(), "aa.xls"));
        }
    }
}
一份测试内容:demo.html
<!DOCTYPE html>
<html>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<head>
<title>export</title>
</head>
<body>
<table class="table table-bordered table-striped table-condensed"
align="center" border="1">
<thead id="thead">
<tr id="thead$hxl">
<th rowspan="3" name="thead$hxl$xh">学号(共:5条)</th>
<th rowspan="3" name="thead$hxl$xm">姓名(共:5条)</th>
<th rowspan="3" name="thead$hxl$xydm">学院(共:5条)</th>
<th rowspan="3" name="thead$hxl$zydm">专业(共:5条)</th>
<th rowspan="3" name="thead$hxl$bjdm">班级(共:5条)</th>
<th rowspan="3" name="thead$hxl$xlccdm">学历层次(共:5条)</th>
<th colspan="6">毕业年份</th>
</tr>
<tr id="thead1">
<th name="thead1$2017" colspan="2">2017</th>
<th name="thead1$2018" colspan="2">2018</th>
<th name="thead1$2020" colspan="2">2020</th>
</tr>
<tr id="thead0">
<th name="thead0$zs$2017" colspan="1">总数</th>
<th name="thead0$bfb$zs$2017" colspan="1">比例</th>
<th name="thead0$zs$2018" colspan="1">总数</th>
<th name="thead0$bfb$zs$2018" colspan="1">比例</th>
<th name="thead0$zs$2020" colspan="1">总数</th>
<th name="thead0$bfb$zs$2020" colspan="1">比例</th>
</tr>
</thead>
<tbody id="tbody">
<tr id="20180002$孙杰$test01$test0101$test010101$03">
<td name="xh$20180002" class="tjlbleft" base-title="$xh">20180002</td>
<td name="xm$孙杰" class="tjlbleft" base-title="$xm">孙杰</td>
<td name="xydm$test01" class="tjlbleft" base-title="$xydm">人文学院</td>
<td name="zydm$test0101" class="tjlbleft" base-title="$zydm">人文学院专业</td>
<td name="bjdm$test010101" class="tjlbleft" base-title="$bjdm">人文学院1701班</td>
<td name="xlccdm$03" class="tjlbleft" base-title="$xlccdm">博士生结业
</td>
<td name="zs$2017">1</td>
<td name="bfb$zs$2017">100%</td>
<td name="zs$2018">0</td>
<td name="bfb$zs$2018">0</td>
<td name="zs$2020">0</td>
<td name="bfb$zs$2020">0</td>
</tr>
<tr id="20180004$徐晓$test05$test0301$bj002$01">
<td name="xh$20180004" class="tjlbleft" base-title="$xh">20180004</td>
<td name="xm$徐晓" class="tjlbleft" base-title="$xm">徐晓</td>
<td name="xydm$test05" class="tjlbleft" base-title="$xydm">理学院</td>
<td name="zydm$test0301" class="tjlbleft" base-title="$zydm">医学院专业</td>
<td name="bjdm$bj002" class="tjlbleft" base-title="$bjdm">测试2班</td>
<td name="xlccdm$01" class="tjlbleft" base-title="$xlccdm">博士生毕业
</td>
<td name="zs$2017">0</td>
<td name="bfb$zs$2017">0</td>
<td name="zs$2018">1</td>
<td name="bfb$zs$2018">100%</td>
<td name="zs$2020">0</td>
<td name="bfb$zs$2020">0</td>
</tr>
<tr id="20180005$何工$test04$test0501$bj002$11">
<td name="xh$20180005" class="tjlbleft" base-title="$xh">20180005</td>
<td name="xm$何工" class="tjlbleft" base-title="$xm">何工</td>
<td name="xydm$test04" class="tjlbleft" base-title="$xydm">计算机学院</td>
<td name="zydm$test0501" class="tjlbleft" base-title="$zydm">计算机学院专业</td>
<td name="bjdm$bj002" class="tjlbleft" base-title="$bjdm">测试2班</td>
<td name="xlccdm$11" class="tjlbleft" base-title="$xlccdm">硕士生毕业
</td>
<td name="zs$2017">0</td>
<td name="bfb$zs$2017">0</td>
<td name="zs$2018">0</td>
<td name="bfb$zs$2018">0</td>
<td name="zs$2020">1</td>
<td name="bfb$zs$2020">100%</td>
</tr>
<tr id="20180006$李飞$test05$test0301$test010101$26">
<td name="xh$20180006" class="tjlbleft" base-title="$xh">20180006</td>
<td name="xm$李飞" class="tjlbleft" base-title="$xm">李飞</td>
<td name="xydm$test05" class="tjlbleft" base-title="$xydm">理学院</td>
<td name="zydm$test0301" class="tjlbleft" base-title="$zydm">医学院专业</td>
<td name="bjdm$test010101" class="tjlbleft" base-title="$bjdm">人文学院1701班</td>
<td name="xlccdm$26" class="tjlbleft" base-title="$xlccdm">二学位结业
</td>
<td name="zs$2017">0</td>
<td name="bfb$zs$2017">0</td>
<td name="zs$2018">1</td>
<td name="bfb$zs$2018">100%</td>
<td name="zs$2020">0</td>
<td name="bfb$zs$2020">0</td>
</tr>
<tr id="21080003$张烧炕$test03$test0301$test010101$11">
<td name="xh$21080003" class="tjlbleft" base-title="$xh">21080003</td>
<td name="xm$张烧炕" class="tjlbleft" base-title="$xm">张烧炕</td>
<td name="xydm$test03" class="tjlbleft" base-title="$xydm">医学院</td>
<td name="zydm$test0301" class="tjlbleft" base-title="$zydm">医学院专业</td>
<td name="bjdm$test010101" class="tjlbleft" base-title="$bjdm">人文学院1701班</td>
<td name="xlccdm$11" class="tjlbleft" base-title="$xlccdm">硕士生毕业
</td>
<td name="zs$2017">0</td>
<td name="bfb$zs$2017">0</td>
<td name="zs$2018">1</td>
<td name="bfb$zs$2018">100%</td>
<td name="zs$2020">0</td>
<td name="bfb$zs$2020">0</td>
</tr>
</tbody>
</table>
</body>
</html>
另一份测试内容:demo.txt
<table class="table table-bordered table-striped table-condensed" align="center" border="1">
<thead id="thead">
<tr id="thead$hxl">
<th rowspan="0" name="thead$hxl$xh">学号(共:5条)</th><th rowspan="0" name="thead$hxl$xm">姓名(共:5条)</th>
<th rowspan="0" name="thead$hxl$bynf">毕业年份(共:5条)</th><th rowspan="0" name="thead$hxl$xydm">学院(共:5条)</th>
<th name="thead0$zs" colspan="1">总数</th></tr>
</thead>
<tbody id="tbody">
<tr id="20180002$孙杰$2017$test01">
<td name="xh$20180002" class="tjlbleft" base-title="$xh">20180002</td>
<td name="xm$孙杰" class="tjlbleft" base-title="$xm">孙杰</td>
<td name="bynf$2017" class="tjlbleft" base-title="$bynf">2017</td>
<td name="xydm$test01" class="tjlbleft" base-title="$xydm">人文学院</td>
<td name="zs">1</td></tr><tr id="20180004$徐晓$2018$test05">
<td name="xh$20180004" class="tjlbleft" base-title="$xh">20180004</td>
<td name="xm$徐晓" class="tjlbleft" base-title="$xm">徐晓</td>
<td name="bynf$2018" class="tjlbleft" base-title="$bynf">2018</td>
<td name="xydm$test05" class="tjlbleft" base-title="$xydm">理学院</td><td name="zs">1</td>
</tr>
<tr id="20180005$何工$2020$test04">
<td name="xh$20180005" class="tjlbleft" base-title="$xh">20180005</td>
<td name="xm$何工" class="tjlbleft" base-title="$xm">何工</td>
<td name="bynf$2020" class="tjlbleft" base-title="$bynf">2020</td>
<td name="xydm$test04" class="tjlbleft" base-title="$xydm">计算机学院</td>
<td name="zs">1</td></tr><tr id="20180006$李飞$2018$test05">
<td name="xh$20180006" class="tjlbleft" base-title="$xh">20180006</td>
<td name="xm$李飞" class="tjlbleft" base-title="$xm">李飞</td>
<td name="bynf$2018" class="tjlbleft" base-title="$bynf">2018</td>
<td name="xydm$test05" class="tjlbleft" base-title="$xydm">理学院</td>
<td name="zs">1</td></tr><tr id="21080003$张烧炕$2018$test03">
<td name="xh$21080003" class="tjlbleft" base-title="$xh">21080003</td>
<td name="xm$张烧炕" class="tjlbleft" base-title="$xm">张烧炕</td>
<td name="bynf$2018" class="tjlbleft" base-title="$bynf">2018</td>
<td name="xydm$test03" class="tjlbleft" base-title="$xydm">医学院</td>
<td name="zs">1</td>
</tr>
</tbody>
</table>