解析html片段构建xls文件

今天写了一个400行左右的模块,功能小而精,主要将html片段里面的table标签的内容解析出来,转换成xls文件,其实,程序花点时间总能写出来,不论是谁。只是
有的人花的时间少,有的人花的时间多。不过,我感觉有的人确实写不出来,比如常荣虎,比如盛丽兰,这样的技术领导者,也能称为领导?技术都难以服众,更别提其他的了。

所以,为何许多人离职,估计就是因为这些人缺乏技术领导力吧。

这个功能精髓的地方就是一个游标的定义Cursor,具有向前和向下的功能,而且为了解决路径的记忆功能,提供了fork能力,就是复制当前Cursor的功能,fork出来的Cursor继续前进,而且,这里将所有走过的路径都记录下来,这是为了能够在rowspan和colspan大于1的时候能够找到下一个没有使用的块。

这里,为了支持块的遍历,定义了Position类,使用了不可变类的思路。

虽然只有400行的程序,但感觉还是比较精彩,对面向对象的把握还是有相当的水准的。自己的面向对象理解程度感觉已经能够完成一般代码的编写了,就差集中编写类似中间件类的项目了,那才是真正锻炼面向对象能力的时候。

主要依赖jsoup,commons,jxl

<dependency>
<groupId>jxl</groupId>
<artifactId>jxl</artifactId>
<version>2.6.12</version>
</dependency>

<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.9.2</version>
</dependency>

<dependency>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
<version>2.6</version>
</dependency>

<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
<version>2.6</version>
</dependency>

<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>1.7.9</version>
</dependency>

<dependency>
<groupId>junit</groupId>
<artifactId>junit</artifactId>
<version>4.12</version>
</dependency>

 

 

主要代码:

 

import java.io.File;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.UUID;

import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.StringUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import jxl.Workbook;
import jxl.format.UnderlineStyle;
import jxl.write.Label;
import jxl.write.WritableCellFormat;
import jxl.write.WritableFont;
import jxl.write.WritableSheet;
import jxl.write.WritableWorkbook;
import jxl.write.WriteException;
import jxl.write.biff.RowsExceededException;

/**
 * Converts the first {@code <table>} found in an HTML string into a single-sheet XLS file.
 *
 * <p>Header cells ({@code th} elements under {@code thead}) are written first, followed by body
 * cells ({@code td} elements under {@code tbody > tr}). A cell's {@code colspan}/{@code rowspan}
 * becomes one merged rectangular region, and cells in later rows skip over positions that an
 * earlier row-spanning cell already covers.
 *
 * <p>The conversion is best-effort: failures are logged and {@link #parse()} still returns the
 * target file, which may then be empty or only partially filled.
 *
 * <p>NOTE(review): the workbook is closed at the end of {@link #parse()}, so an instance is
 * presumably meant to be used for a single call - confirm before reusing one.
 *
 * @author 魏广跃(1571)
 * @version V1.0
 */
public class HtmlTableToXlsParser {

    private static final Logger log = LoggerFactory.getLogger(HtmlTableToXlsParser.class);

    /** Width value handed to {@code WritableSheet.setColumnView} for every column that gets a cell. */
    private static final int COLUMN_WIDTH = 20;

    private final Document document;
    private final Elements tableElements;
    private final Iterator<Element> tableIterator;
    /** {@code null} when the workbook could not be created in the constructor. */
    private final WritableWorkbook workbook;
    /** {@code null} when the workbook could not be created in the constructor. */
    private final WritableSheet sheet;
    private final File newXlsFile;

    /** Font shared by every written cell: Arial 11, not bold, not italic, no underline, black. */
    private final WritableFont titleFont = new WritableFont(WritableFont.ARIAL, 11,
            WritableFont.NO_BOLD, false, UnderlineStyle.NO_UNDERLINE,
            jxl.format.Colour.BLACK);

    /** Cell format shared by every written cell; alignment and border are set in initStyle(). */
    private final WritableCellFormat titleFormat = new WritableCellFormat(titleFont);

    /**
     * Parses the HTML and prepares an empty workbook in a uniquely named file inside the temp
     * directory.
     *
     * <p>An I/O failure while creating the workbook is logged, not thrown; {@link #parse()} then
     * returns the target file without writing any table data.
     *
     * @param html HTML text that is expected to contain a {@code <table>}
     */
    public HtmlTableToXlsParser(String html) {
        this.document = Jsoup.parse(html);
        this.tableElements = this.document.select("table");
        this.tableIterator = this.tableElements.iterator();
        this.newXlsFile = new File(FileUtils.getTempDirectory(), UUID.randomUUID().toString() + ".xls");

        WritableWorkbook createdWorkbook = null;
        WritableSheet createdSheet = null;
        try {
            // The result is irrelevant: the name is a fresh UUID and createWorkbook opens the file anyway.
            this.newXlsFile.createNewFile();
            createdWorkbook = Workbook.createWorkbook(this.newXlsFile);
            createdSheet = createdWorkbook.createSheet("default", 0);
        } catch (IOException e) {
            log.error("创建Workbook异常", e);
        }
        this.workbook = createdWorkbook;
        this.sheet = createdSheet;

        try {
            this.initStyle();
        } catch (WriteException e) {
            log.error("初始化样式异常", e);
        }
    }

    /**
     * Centres the shared cell format and gives it a thin black border on all sides.
     *
     * @throws WriteException if jxl rejects one of the format settings
     */
    private void initStyle() throws WriteException {
        this.titleFormat.setAlignment(jxl.format.Alignment.CENTRE);
        this.titleFormat.setBorder(jxl.format.Border.ALL, jxl.format.BorderLineStyle.THIN,
                jxl.format.Colour.BLACK);
    }

    /**
     * Writes the next table of the document into the sheet, then writes and closes the workbook.
     *
     * <p>A missing {@code <table>} or {@code <thead>} is logged and skipped instead of raising
     * {@link java.util.NoSuchElementException}. Errors while filling the sheet are logged and
     * whatever was written so far is still saved.
     *
     * @return the XLS file in the temp directory
     */
    public File parse() {
        if (this.workbook == null || this.sheet == null) {
            log.error("Workbook未创建成功,跳过解析");
            return this.newXlsFile;
        }

        if (this.tableIterator.hasNext()) {
            this.fillSheet(this.tableIterator.next());
        } else {
            log.warn("html中没有table标签,生成空的xls文件");
        }

        try {
            this.workbook.write();
        } catch (Exception e) {
            log.error("写xls文件异常", e);
        } finally {
            // Close even when write() failed so the file handle is not leaked.
            try {
                this.workbook.close();
            } catch (Exception e) {
                log.error("关闭Workbook异常", e);
            }
        }

        return this.newXlsFile;
    }

    /** Writes the header rows and then the body rows of one table, logging any failure. */
    private void fillSheet(Element table) {
        Cursor cursor = new Cursor();

        Elements theads = table.select("thead");
        if (theads.isEmpty()) {
            log.warn("table中没有thead标签,跳过表头");
        } else {
            try {
                this.parseThead(theads.get(0), cursor);
            } catch (Exception e) {
                // Best-effort boundary: keep going with the body rows.
                log.error("解析head标签异常", e);
            }
        }

        try {
            this.parseTr(table.select("tbody>tr"), cursor);
        } catch (Exception e) {
            log.error("解析tr标签异常", e);
        }
    }

    /**
     * Writes every {@code th} of every {@code tr} inside the given {@code thead}.
     *
     * @param theadElement the {@code thead} element
     * @param cursor write position, advanced past the header rows
     * @throws WriteException if jxl refuses a cell or a merge
     */
    private void parseThead(Element theadElement, Cursor cursor) throws WriteException {
        this.writeRows(theadElement.select("tr"), "th", cursor);
    }

    /**
     * Writes every {@code td} of the given body rows, starting at the first column of the
     * cursor's current row.
     *
     * @param trs the body {@code tr} elements
     * @param cursor write position, advanced past the body rows
     * @throws WriteException if jxl refuses a cell or a merge
     */
    private void parseTr(Elements trs, Cursor cursor) throws WriteException {
        // The header may have stopped mid-row after an error; always start the body at column 0.
        cursor.resetColumn();
        this.writeRows(trs, "td", cursor);
    }

    /**
     * Writes the cells selected by {@code cellTag} from each row. Rows without such cells are
     * skipped and do not consume a sheet row.
     *
     * <p>{@code rowspan} is clipped to the number of written rows left in {@code rows}, so a span
     * cannot reach beyond the section it belongs to.
     */
    private void writeRows(Elements rows, String cellTag, Cursor cursor) throws WriteException {
        int rowsLeft = 0;
        for (Element tr : rows) {
            if (!tr.select(cellTag).isEmpty()) {
                rowsLeft++;
            }
        }

        for (Element tr : rows) {
            Elements cells = tr.select(cellTag);
            if (cells.isEmpty()) {
                continue;
            }
            for (Element cell : cells) {
                int colspan = parseSpan(cell.attr("colspan"));
                int rowspan = Math.min(parseSpan(cell.attr("rowspan")), rowsLeft);
                this.writeCell(cursor, cell.text(), colspan, rowspan);
            }
            rowsLeft--;
            cursor.nextRow();
        }
    }

    /**
     * Writes one logical cell at the next free position of the cursor's row.
     *
     * <p>The text goes into the top-left cell; every other cell of the spanned rectangle gets an
     * empty label with the same format, and the rectangle is merged when it covers more than one
     * cell. The cursor ends up just right of the rectangle, on the same row.
     */
    private void writeCell(Cursor cursor, String text, int colspan, int rowspan) throws WriteException {
        cursor.skipOccupied();
        int left = cursor.col();
        int top = cursor.row();
        int right = left + colspan - 1;
        int bottom = top + rowspan - 1;

        for (int row = top; row <= bottom; row++) {
            for (int col = left; col <= right; col++) {
                String content = (row == top && col == left) ? text : "";
                this.sheet.addCell(new Label(col, row, content, this.titleFormat));
                cursor.occupy(col, row);
            }
        }
        for (int col = left; col <= right; col++) {
            this.sheet.setColumnView(col, COLUMN_WIDTH);
        }
        if (colspan > 1 || rowspan > 1) {
            this.sheet.mergeCells(left, top, right, bottom);
        }
        cursor.moveToColumn(right + 1);
    }

    /**
     * Reads a {@code colspan}/{@code rowspan} attribute value.
     *
     * @param raw the attribute text, possibly empty
     * @return the span, or 1 when the value is blank, not a number, zero or negative
     */
    private static int parseSpan(String raw) {
        if (StringUtils.isBlank(raw)) {
            return 1;
        }
        try {
            return Math.max(1, Integer.parseInt(raw.trim()));
        } catch (NumberFormatException ignored) {
            // A malformed attribute must not abort the whole table; treat it as "no span".
            return 1;
        }
    }

    /**
     * Write position on the sheet plus the set of positions that already hold a cell.
     */
    private static final class Cursor {

        private Position current = new Position(0, 0);
        private final Set<Position> occupied = new HashSet<Position>();

        int col() {
            return this.current.col;
        }

        int row() {
            return this.current.row;
        }

        /** Records that the given position holds a cell. */
        void occupy(int col, int row) {
            this.occupied.add(new Position(col, row));
        }

        /** Moves right along the current row until the position is not occupied. */
        void skipOccupied() {
            while (this.occupied.contains(this.current)) {
                this.current = new Position(this.current.col + 1, this.current.row);
            }
        }

        /** Moves to the given column of the current row. */
        void moveToColumn(int col) {
            this.current = new Position(col, this.current.row);
        }

        /** Moves to the first column of the current row. */
        void resetColumn() {
            this.moveToColumn(0);
        }

        /** Moves to the first column of the next row. */
        void nextRow() {
            this.current = new Position(0, this.current.row + 1);
        }

        @Override
        public String toString() {
            return "Cursor [current=" + this.current + "]";
        }
    }

    /** Immutable (column, row) pair with value equality, usable as a hash-set element. */
    private static final class Position {
        final int col;
        final int row;

        Position(int col, int row) {
            this.col = col;
            this.row = row;
        }

        @Override
        public String toString() {
            return "Position [col=" + col + ", row=" + row + "]";
        }

        @Override
        public int hashCode() {
            return 31 * (31 + col) + row;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            Position other = (Position) obj;
            return col == other.col && row == other.row;
        }
    }
}

 

测试代码:


import java.io.File;
import java.io.InputStream;

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.junit.Test;

import com.zfsoft.tjcx.html.HtmlTableToXlsParser;

public class HtmlTableParserTest {

@Test
public void test() {

InputStream inputStream = HtmlTableToXlsParser.class.getResourceAsStream("/com/zfsoft/tjcx/demo.html");
try {
byte[] arr = IOUtils.toByteArray(inputStream);
String html = new String(arr);
HtmlTableToXlsParser parser = new HtmlTableToXlsParser(html);
File f = parser.parse();
FileUtils.copyFile(f, new File("C:\\aa.xls"));
} catch (Exception e) {
e.printStackTrace();
}
}
}

 

一份测试内容:demo.html


<!DOCTYPE html>
<html>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<head>
<title>export</title>
</head>
<body>
<table class="table table-bordered table-striped table-condensed"
align="center" border="1">
<thead id="thead">
<tr id="thead$hxl">
<th rowspan="3" name="thead$hxl$xh">学号(共:5条)</th>
<th rowspan="3" name="thead$hxl$xm">姓名(共:5条)</th>
<th rowspan="3" name="thead$hxl$xydm">学院(共:5条)</th>
<th rowspan="3" name="thead$hxl$zydm">专业(共:5条)</th>
<th rowspan="3" name="thead$hxl$bjdm">班级(共:5条)</th>
<th rowspan="3" name="thead$hxl$xlccdm">学历层次(共:5条)</th>
<th colspan="6">毕业年份</th>
</tr>
<tr id="thead1">
<th name="thead1$2017" colspan="2">2017</th>
<th name="thead1$2018" colspan="2">2018</th>
<th name="thead1$2020" colspan="2">2020</th>
</tr>
<tr id="thead0">
<th name="thead0$zs$2017" colspan="1">总数</th>
<th name="thead0$bfb$zs$2017" colspan="1">比例</th>
<th name="thead0$zs$2018" colspan="1">总数</th>
<th name="thead0$bfb$zs$2018" colspan="1">比例</th>
<th name="thead0$zs$2020" colspan="1">总数</th>
<th name="thead0$bfb$zs$2020" colspan="1">比例</th>
</tr>
</thead>
<tbody id="tbody">
<tr id="20180002$孙杰$test01$test0101$test010101$03">
<td name="xh$20180002" class="tjlbleft" base-title="$xh">20180002</td>
<td name="xm$孙杰" class="tjlbleft" base-title="$xm">孙杰</td>
<td name="xydm$test01" class="tjlbleft" base-title="$xydm">人文学院</td>
<td name="zydm$test0101" class="tjlbleft" base-title="$zydm">人文学院专业</td>
<td name="bjdm$test010101" class="tjlbleft" base-title="$bjdm">人文学院1701班</td>
<td name="xlccdm$03" class="tjlbleft" base-title="$xlccdm">博士生结业
</td>
<td name="zs$2017">1</td>
<td name="bfb$zs$2017">100%</td>
<td name="zs$2018">0</td>
<td name="bfb$zs$2018">0</td>
<td name="zs$2020">0</td>
<td name="bfb$zs$2020">0</td>
</tr>
<tr id="20180004$徐晓$test05$test0301$bj002$01">
<td name="xh$20180004" class="tjlbleft" base-title="$xh">20180004</td>
<td name="xm$徐晓" class="tjlbleft" base-title="$xm">徐晓</td>
<td name="xydm$test05" class="tjlbleft" base-title="$xydm">理学院</td>
<td name="zydm$test0301" class="tjlbleft" base-title="$zydm">医学院专业</td>
<td name="bjdm$bj002" class="tjlbleft" base-title="$bjdm">测试2班</td>
<td name="xlccdm$01" class="tjlbleft" base-title="$xlccdm">博士生毕业
</td>
<td name="zs$2017">0</td>
<td name="bfb$zs$2017">0</td>
<td name="zs$2018">1</td>
<td name="bfb$zs$2018">100%</td>
<td name="zs$2020">0</td>
<td name="bfb$zs$2020">0</td>
</tr>
<tr id="20180005$何工$test04$test0501$bj002$11">
<td name="xh$20180005" class="tjlbleft" base-title="$xh">20180005</td>
<td name="xm$何工" class="tjlbleft" base-title="$xm">何工</td>
<td name="xydm$test04" class="tjlbleft" base-title="$xydm">计算机学院</td>
<td name="zydm$test0501" class="tjlbleft" base-title="$zydm">计算机学院专业</td>
<td name="bjdm$bj002" class="tjlbleft" base-title="$bjdm">测试2班</td>
<td name="xlccdm$11" class="tjlbleft" base-title="$xlccdm">硕士生毕业
</td>
<td name="zs$2017">0</td>
<td name="bfb$zs$2017">0</td>
<td name="zs$2018">0</td>
<td name="bfb$zs$2018">0</td>
<td name="zs$2020">1</td>
<td name="bfb$zs$2020">100%</td>
</tr>
<tr id="20180006$李飞$test05$test0301$test010101$26">
<td name="xh$20180006" class="tjlbleft" base-title="$xh">20180006</td>
<td name="xm$李飞" class="tjlbleft" base-title="$xm">李飞</td>
<td name="xydm$test05" class="tjlbleft" base-title="$xydm">理学院</td>
<td name="zydm$test0301" class="tjlbleft" base-title="$zydm">医学院专业</td>
<td name="bjdm$test010101" class="tjlbleft" base-title="$bjdm">人文学院1701班</td>
<td name="xlccdm$26" class="tjlbleft" base-title="$xlccdm">二学位结业
</td>
<td name="zs$2017">0</td>
<td name="bfb$zs$2017">0</td>
<td name="zs$2018">1</td>
<td name="bfb$zs$2018">100%</td>
<td name="zs$2020">0</td>
<td name="bfb$zs$2020">0</td>
</tr>
<tr id="21080003$张烧炕$test03$test0301$test010101$11">
<td name="xh$21080003" class="tjlbleft" base-title="$xh">21080003</td>
<td name="xm$张烧炕" class="tjlbleft" base-title="$xm">张烧炕</td>
<td name="xydm$test03" class="tjlbleft" base-title="$xydm">医学院</td>
<td name="zydm$test0301" class="tjlbleft" base-title="$zydm">医学院专业</td>
<td name="bjdm$test010101" class="tjlbleft" base-title="$bjdm">人文学院1701班</td>
<td name="xlccdm$11" class="tjlbleft" base-title="$xlccdm">硕士生毕业
</td>
<td name="zs$2017">0</td>
<td name="bfb$zs$2017">0</td>
<td name="zs$2018">1</td>
<td name="bfb$zs$2018">100%</td>
<td name="zs$2020">0</td>
<td name="bfb$zs$2020">0</td>
</tr>
</tbody>
</table>
</body>
</html>

另一份测试内容:demo.txt

<table class="table table-bordered table-striped table-condensed" align="center" border="1">

<thead id="thead">
<tr id="thead$hxl">
<th rowspan="0" name="thead$hxl$xh">学号(共:5条)</th><th rowspan="0" name="thead$hxl$xm">姓名(共:5条)</th>
<th rowspan="0" name="thead$hxl$bynf">毕业年份(共:5条)</th><th rowspan="0" name="thead$hxl$xydm">学院(共:5条)</th>
<th name="thead0$zs" colspan="1">总数</th></tr>
</thead>
<tbody id="tbody">
<tr id="20180002$孙杰$2017$test01">
<td name="xh$20180002" class="tjlbleft" base-title="$xh">20180002</td>
<td name="xm$孙杰" class="tjlbleft" base-title="$xm">孙杰</td>
<td name="bynf$2017" class="tjlbleft" base-title="$bynf">2017</td>
<td name="xydm$test01" class="tjlbleft" base-title="$xydm">人文学院</td>
<td name="zs">1</td></tr><tr id="20180004$徐晓$2018$test05">
<td name="xh$20180004" class="tjlbleft" base-title="$xh">20180004</td>
<td name="xm$徐晓" class="tjlbleft" base-title="$xm">徐晓</td>
<td name="bynf$2018" class="tjlbleft" base-title="$bynf">2018</td>
<td name="xydm$test05" class="tjlbleft" base-title="$xydm">理学院</td><td name="zs">1</td>
</tr>
<tr id="20180005$何工$2020$test04">
<td name="xh$20180005" class="tjlbleft" base-title="$xh">20180005</td>
<td name="xm$何工" class="tjlbleft" base-title="$xm">何工</td>
<td name="bynf$2020" class="tjlbleft" base-title="$bynf">2020</td>
<td name="xydm$test04" class="tjlbleft" base-title="$xydm">计算机学院</td>
<td name="zs">1</td></tr><tr id="20180006$李飞$2018$test05">
<td name="xh$20180006" class="tjlbleft" base-title="$xh">20180006</td>
<td name="xm$李飞" class="tjlbleft" base-title="$xm">李飞</td>
<td name="bynf$2018" class="tjlbleft" base-title="$bynf">2018</td>
<td name="xydm$test05" class="tjlbleft" base-title="$xydm">理学院</td>
<td name="zs">1</td></tr><tr id="21080003$张烧炕$2018$test03">
<td name="xh$21080003" class="tjlbleft" base-title="$xh">21080003</td>
<td name="xm$张烧炕" class="tjlbleft" base-title="$xm">张烧炕</td>
<td name="bynf$2018" class="tjlbleft" base-title="$bynf">2018</td>
<td name="xydm$test03" class="tjlbleft" base-title="$xydm">医学院</td>
<td name="zs">1</td>
</tr>
</tbody>
</table>

转载于:https://www.cnblogs.com/weiguangyue/p/9265880.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值