import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Iterator;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.Bullet;
import org.htmlparser.tags.BulletList;
import org.htmlparser.tags.HeadingTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
/**
 * HTML file processing helper built on the org.htmlparser library.
 *
 */
public class HtmlParser {
private String path = "file:///C:\\temp\\oAAAAam\\_table.html";
private String content = "";
private String zhCode = "GBK"; // 中文编码方式默认GBK,可手动修改为utf-8,utf-16
// private String[] SimpleStruct = new
// String[]{"BYTE","WORD16","WORD32","DWORD"};
private ArrayList<String> SimpleStruct = new ArrayList<String>();
/**
 * Creates a parser for the built-in default document path and loads it with
 * the default encoding.
 *
 * <p>Loading is best-effort: construction never throws because of a load
 * failure. The failure is reported on standard error instead of being
 * silently discarded.
 */
public HtmlParser() {
    try {
        build();
    } catch (Exception e) {
        // Keep the constructor non-throwing for existing callers, but make the failure visible.
        System.err.println("HtmlParser: failed to load " + path + ": " + e);
    }
}
/**
 * Creates a parser for the given document and loads it with the default
 * encoding.
 *
 * <p>Loading is best-effort: construction never throws because of a load
 * failure. The failure is reported on standard error instead of being
 * silently discarded.
 *
 * @param path location of the document; {@code "file:///"} is prepended
 *             unless the value already starts with {@code "file:"}
 */
public HtmlParser(String path) {
    if (!path.startsWith("file:")) {
        this.path = "file:///" + path;
    } else {
        this.path = path;
    }
    try {
        build();
    } catch (Exception e) {
        // Keep the constructor non-throwing for existing callers, but make the failure visible.
        System.err.println("HtmlParser: failed to load " + this.path + ": " + e);
    }
}
/**
 * Creates a parser for the given document and loads it with the given
 * character encoding.
 *
 * <p>Loading is best-effort: construction never throws because of a load
 * failure. The failure is reported on standard error instead of being
 * silently discarded.
 *
 * @param path location of the document; {@code "file:///"} is prepended
 *             unless the value already starts with {@code "file:"}
 * @param code name of the character encoding used to decode the document
 */
public HtmlParser(String path, String code) {
    if (!path.startsWith("file:")) {
        this.path = "file:///" + path;
    } else {
        this.path = path;
    }
    this.zhCode = code;
    try {
        build();
    } catch (Exception e) {
        // Keep the constructor non-throwing for existing callers, but make the failure visible.
        System.err.println("HtmlParser: failed to load " + this.path
                + " (" + code + "): " + e);
    }
}
/** Appends the four simple type names to {@code SimpleStruct}. */
private void initialArray() {
    final String[] simpleTypeNames = { "BYTE", "WORD16", "WORD32", "DWORD" };
    for (String typeName : simpleTypeNames) {
        SimpleStruct.add(typeName);
    }
}
/**
 * Initial load step shared by all constructors: registers the simple type
 * names, then reads the document at {@code path} line by line (decoded with
 * {@code zhCode}) into {@code content}. Every line is terminated with
 * {@code "\n"}.
 *
 * <p>{@code content} is only assigned after the whole document was read, so a
 * failure leaves the previous value untouched. The underlying stream is always
 * closed.
 *
 * @throws Exception if the URL is malformed, the document cannot be opened or
 *                   read, or the encoding name is not supported
 */
private void build() throws Exception {
    initialArray();
    // The connection is only read from, so output mode is not enabled on it.
    URLConnection conn = new URL(path).openConnection();
    InputStream inputStream = conn.getInputStream();
    try {
        BufferedReader in = new BufferedReader(new InputStreamReader(inputStream, zhCode));
        StringBuilder sb = new StringBuilder();
        String inputLine;
        while ((inputLine = in.readLine()) != null) {
            sb.append(inputLine).append("\n");
        }
        this.content = sb.toString();
    } finally {
        // Closing the raw stream releases the file handle even if the reader could not be created.
        inputStream.close();
    }
}
/**
 * Manual smoke test: looks up a hard-coded macro name in the default index
 * document, prints the linked file name and type, then prints every row of
 * the linked document's "数据成员" table with the values joined by {@code ':'}.
 *
 * <p>Returns quietly when the macro name, its link or the member table cannot
 * be found.
 *
 * @param args unused
 * @throws Exception if parsing one of the documents fails
 */
public static void main(String[] args) throws Exception {
    HtmlParser hp = new HtmlParser();
    String tblname = "DSAAA_SDH_A";
    // The lookup returns null when the name is not in the table; do not index into it blindly.
    String[] linkInfo = hp.getLinkByTblName(tblname);
    if (linkInfo == null || linkInfo[1] == null) {
        return;
    }
    String name = linkInfo[1];
    if (name.equals("1")) {
        // "1" is the marker for "no additional information".
        System.out.println("name: " + "无");
        System.out.println("type: " + "无");
        return;
    }
    System.out.println("name: " + name);
    String type = hp.getTypeByTblName(tblname);
    System.out.println("type: " + type);

    // Load the linked field-type document and dump its member table.
    String path2 = "file:///C:\\temp\\oaaaaaam\\" + name;
    HtmlParser hp2 = new HtmlParser(path2);
    TableTag tblnode = hp2.getArrayTableNode("数据成员");
    if (tblnode == null) {
        return;
    }
    for (String[] row : hp2.getAttrsList(tblnode)) {
        StringBuilder strs = new StringBuilder();
        for (String str : row) {
            strs.append(str).append(":");
        }
        System.out.println("str: " + strs);
    }
}
/**
 * Resolves the attribute rows for a macro name listed in this document.
 *
 * <p>When the macro's type is one of the registered simple types a single row
 * is built from the table cell itself. Otherwise the linked document is loaded
 * and the rows of its "数据成员" table are returned.
 *
 * @param macroname macro name to look up in column 0 of the table
 * @return the attribute rows, or {@code null} when the macro is not found, has
 *         no link (marker {@code "1"}), or the linked document has no member
 *         table
 * @throws Exception if parsing fails
 */
public ArrayList<String[]> getAttrArrayList(String macroname)
        throws Exception {
    String[] fullTblStr = getLinkByTblName(macroname);
    if (fullTblStr == null) {
        return null;
    }
    String name = fullTblStr[1];
    if (name.equals("1")) {
        // "1" is the marker for "no additional information".
        System.out.println("name: " + "无");
        System.out.println("type: " + "无");
        return null;
    }
    System.out.println("name: " + name);
    String type = getTypeByTblName(macroname);
    System.out.println("type: " + type);

    if (SimpleStruct.contains(type)) {
        // Simple types need nothing from the linked document, so it is not loaded.
        return getSimpleList(macroname, fullTblStr[0], type);
    }

    String path2 = "file:///" + AConst.usrpath
            + "files\\ChmFiles\\oaaam\\" + name;
    HtmlParser hp2 = new HtmlParser(path2);
    TableTag tblnode = hp2.getArrayTableNode("数据成员");
    if (tblnode == null) {
        // The linked document has no member table (or could not be loaded).
        return null;
    }
    return hp2.getAttrsList(tblnode);
}
/**
 * Builds the one-row attribute list for a simple type from the raw cell
 * string: {@code [type, text after the first "::" of the part before
 * "@Link:", text between "@Link:" and the next "@"]}.
 *
 * @param macroname not used by this method
 * @param fulllnk   raw cell string containing an {@code "@Link:"} section
 * @param type      simple type name placed in slot 0
 * @return a list holding exactly one three-element array
 * @throws Exception declared for callers; the body only performs string work
 */
private ArrayList<String[]> getSimpleList(String macroname, String fulllnk,
        String type) throws Exception {
    String[] sections = fulllnk.split("@Link:");
    String afterLink = sections[1];
    String linkTarget = afterLink.substring(0, afterLink.indexOf("@"));
    String beforeLink = sections[0];
    String memberName = beforeLink.substring(beforeLink.indexOf("::") + 2);

    String[] row = new String[3];
    row[0] = type;
    row[1] = memberName;
    row[2] = linkTarget;

    ArrayList<String[]> result = new ArrayList<String[]>();
    result.add(row);
    return result;
}
/**
 * Returns the linked file name recorded for a macro name.
 *
 * @param macroname macro name to look up in column 0 of the table
 * @return the linked file name, {@code "1"} when the cell carries no link, or
 *         {@code null} when the macro name is not found
 * @throws Exception if parsing fails
 */
public String getHrefName(String macroname) throws Exception {
    String[] linkInfo = getLinkByTblName(macroname);
    // The lookup yields null for unknown names; report that as null instead of failing.
    return (linkInfo == null) ? null : linkInfo[1];
}
/**
 * Loads a field document and returns the name/value text pairs of the bullet
 * list that belongs to the given anchor.
 *
 * @param path     file name of the field document, relative to the ChmFiles
 *                 directory used by this method
 * @param hrefname href of the table-cell link that identifies the div
 * @return the text pairs; an empty list when the div or its bullet list is
 *         not found
 * @throws ParserException if the document cannot be parsed
 */
public ArrayList<String[]> getEnumTypeList(String path, String hrefname)
        throws ParserException {
    String path2 = "file:///" + AConst.usrpath
            + "files\\ChmFiles\\oaaaam\\" + path;
    HtmlParser hp2 = new HtmlParser(path2);
    Node divNode = hp2.getDivByTdName(hrefname);
    if (divNode == null) {
        // No matching anchor: nothing to report.
        return new ArrayList<String[]>();
    }
    ArrayList<Node[]> listNodeArray = hp2.getBulList(divNode);
    if (listNodeArray == null) {
        // The div has no bullet list.
        return new ArrayList<String[]>();
    }
    return hp2.getEnumTypeList(listNodeArray);
}
/**
 * Loads a field document and returns the ENUM table rows found in the bullet
 * list that belongs to the given anchor.
 *
 * @param path     file name of the field document (obtained from the macro
 *                 name), relative to the ChmFiles directory used by this method
 * @param hrefname href of the link inside the field's div
 * @return the ENUM rows, or {@code null} when the div, its bullet list or an
 *         ENUM table is not found
 * @throws ParserException if the document cannot be parsed
 */
public ArrayList<String[]> getEnumArrayList(String path, String hrefname)
        throws ParserException {
    String path2 = "file:///" + AConst.usrpath
            + "files\\ChmFiles\\oaaam\\" + path;
    HtmlParser hp2 = new HtmlParser(path2);
    Node divNode = hp2.getDivByTdName(hrefname);
    if (divNode == null) {
        return null;
    }
    ArrayList<Node[]> listNodeArray = hp2.getBulList(divNode);
    if (listNodeArray == null) {
        return null;
    }
    return hp2.getEnumList(listNodeArray);
}
/**
 * Finds the header cell whose first child text contains "中文".
 *
 * <p>The header row is taken from child index 1 of the table.
 *
 * @param tblNode table to inspect
 * @return the child index of the matching cell inside the header row, or
 *         {@code -1} when there is no such cell
 */
private int getZhHead(TableTag tblNode) {
    Node rowNode = tblNode.childAt(1);
    if (!(rowNode instanceof TableRow)) {
        return -1;
    }
    TableRow headerRow = (TableRow) rowNode;
    for (int j = 0; j < headerRow.getChildCount(); j++) {
        Node colNode = headerRow.childAt(j);
        if (!(colNode instanceof TableColumn)) {
            continue;
        }
        Node childNode = ((TableColumn) colNode).getFirstChild();
        if (childNode == null) {
            // Empty header cell: nothing to compare.
            continue;
        }
        String head = childNode.getText();
        if (head != null && head.contains("中文")) {
            return j;
        }
    }
    return -1;
}
/**
 * Finds the header cell whose first child text contains "英文".
 *
 * <p>The header row is taken from child index 1 of the table.
 *
 * @param tblNode table to inspect
 * @return the child index of the matching cell inside the header row, or
 *         {@code -1} when there is no such cell
 */
private int getEnHead(TableTag tblNode) {
    Node rowNode = tblNode.childAt(1);
    if (!(rowNode instanceof TableRow)) {
        return -1;
    }
    TableRow headerRow = (TableRow) rowNode;
    for (int j = 0; j < headerRow.getChildCount(); j++) {
        Node colNode = headerRow.childAt(j);
        if (!(colNode instanceof TableColumn)) {
            continue;
        }
        Node childNode = ((TableColumn) colNode).getFirstChild();
        if (childNode == null) {
            // Empty header cell: nothing to compare.
            continue;
        }
        String head = childNode.getText();
        if (head != null && head.contains("英文")) {
            return j;
        }
    }
    return -1;
}
/**
 * Extracts the ENUM rows from the first bullet entry that carries a table
 * (slot 2 of the entry array).
 *
 * <p>Table children are scanned from index 2 onwards. For every row the
 * trimmed text of the first child of up to three cells is collected; an empty
 * cell contributes the literal {@code "Null"}. Unfilled slots stay
 * {@code null}.
 *
 * @param listNodeArray bullet entries as produced by {@code getBulList}
 * @return the rows of the first table found, or {@code null} when the input
 *         is {@code null} or no entry carries a table
 */
private ArrayList<String[]> getEnumList(ArrayList<Node[]> listNodeArray) {
    if (listNodeArray == null) {
        return null;
    }
    for (Node[] nodes : listNodeArray) {
        if (nodes[2] == null) {
            continue;
        }
        TableTag tblNode = (TableTag) nodes[2];
        ArrayList<String[]> enumlist = new ArrayList<String[]>();
        for (int i = 2; i < tblNode.getChildCount(); i++) {
            Node rowNode = tblNode.childAt(i);
            if (!(rowNode instanceof TableRow)) {
                continue;
            }
            TableRow row = (TableRow) rowNode;
            String[] values = new String[3];
            int index = 0;
            // Stop scanning once the three slots are filled.
            for (int j = 0; j < row.getChildCount() && index < 3; j++) {
                Node colNode = row.childAt(j);
                if (!(colNode instanceof TableColumn)) {
                    continue;
                }
                Node childNode = ((TableColumn) colNode).getFirstChild();
                // getText() is declared on Node, so no cast is needed; this also
                // accepts first children that are neither tags nor text nodes.
                values[index++] = (childNode != null)
                        ? childNode.getText().trim()
                        : "Null";
            }
            enumlist.add(values);
        }
        // Only the first entry with a table is reported.
        return enumlist;
    }
    return null;
}
/**
 * Converts bullet entries into text pairs.
 *
 * <p>For every entry a two-element array is produced: slot {@code k} holds the
 * trimmed text of entry node {@code k} when that node is a text node, and
 * stays {@code null} otherwise.
 *
 * @param listNodeArray bullet entries; only the first two nodes of each entry
 *                      are read
 * @return one pair per entry, in input order
 */
private ArrayList<String[]> getEnumTypeList(ArrayList<Node[]> listNodeArray) {
    ArrayList<String[]> pairs = new ArrayList<String[]>();
    for (Node[] entry : listNodeArray) {
        String[] pair = new String[2];
        for (int slot = 0; slot < pair.length; slot++) {
            Node part = entry[slot];
            if (part instanceof TextNode) {
                pair[slot] = part.getText().trim();
            }
        }
        pairs.add(pair);
    }
    return pairs;
}
/**
 * Looks up a name in column 0 of the table and returns the raw text of
 * column 2 together with the linked file name extracted from it.
 *
 * <p>The file name is taken from the first {@code "Link:"} section of the cell
 * string. That section ends at the next {@code '@'} separator. Inside it the
 * name is cut before {@code '#'} when an anchor is present, otherwise right
 * after the first {@code "html"}. All markers are searched inside the link
 * section only, so text elsewhere in the cell cannot shift the cut positions.
 *
 * @param name name to search for in column 0
 * @return {@code [raw cell string, file name]}; the second element is
 *         {@code "1"} when the cell has no {@code "Link:"} section;
 *         {@code null} when the name or the cell is not found
 * @throws Exception if parsing fails
 */
public String[] getLinkByTblName(String name) throws Exception {
    int rowno = getTblNameNo(name);
    if (rowno < 0) {
        return null;
    }
    String lnkvalue = getTableValue(rowno, 2);
    if (lnkvalue == null) {
        return null;
    }
    String[] tmp = new String[2];
    tmp[0] = lnkvalue;

    final String marker = "Link:";
    int start = lnkvalue.indexOf(marker);
    if (start < 0) {
        // No link section at all: report the "no additional information" marker.
        tmp[1] = "1";
        return tmp;
    }
    start += marker.length();

    // The link section runs up to the next '@' separator (or the end of the string).
    int terminator = lnkvalue.indexOf("@", start);
    String target = (terminator < 0)
            ? lnkvalue.substring(start)
            : lnkvalue.substring(start, terminator);

    int cut = target.indexOf("#");
    if (cut < 0) {
        int html = target.indexOf("html");
        cut = (html < 0) ? target.length() : html + "html".length();
    }
    tmp[1] = target.substring(0, cut);
    return tmp;
}
/**
 * Returns the type text recorded for a macro name: the trimmed content of the
 * first {@code "Other:"} section in column 2 of the matching row.
 *
 * @param name name to search for in column 0
 * @return the type text, or {@code null} when the name is not found, the cell
 *         contains no {@code "Link"}, or it has no {@code "Other:"} section
 * @throws Exception if parsing fails
 */
public String getTypeByTblName(String name) throws Exception {
    int rowno = getTblNameNo(name);
    if (rowno < 0) {
        // Unknown name: there is no row to read.
        return null;
    }
    String lnkvalue = getTableValue(rowno, 2);
    if (lnkvalue == null || !lnkvalue.contains("Link")) {
        return null;
    }
    final String marker = "Other:";
    int start = lnkvalue.indexOf(marker);
    if (start < 0) {
        return null;
    }
    start += marker.length();
    // The section ends at the next '@' separator; tolerate a missing one.
    int end = lnkvalue.indexOf("@", start);
    if (end < 0) {
        end = lnkvalue.length();
    }
    return lnkvalue.substring(start, end).trim();
}
/**
 * Finds the div that follows the div whose table contains a cell link with
 * the given href.
 *
 * <p>All div elements of the document are collected. For each one the tables
 * among its direct children are searched for a link (a direct child of a
 * table cell) whose href equals {@code name}, ignoring case. On the first
 * match the next div in the collected order is returned.
 *
 * @param name href to look for
 * @return the div following the matching one, or {@code null} when nothing
 *         matches or the match is in the last div
 * @throws ParserException if the document cannot be parsed
 */
private Node getDivByTdName(String name) throws ParserException {
    Parser parser = Parser.createParser(content, zhCode);
    NodeFilter tableFilter = new NodeClassFilter(TableTag.class);
    Node[] nodes = parser.extractAllNodesThatMatch(new TagNameFilter("div"))
            .toNodeArray();
    for (int i = 0; i < nodes.length; i++) {
        NodeList inDivNodelist = nodes[i].getChildren();
        if (inDivNodelist == null) {
            // Empty div: nothing to search.
            continue;
        }
        NodeList tblNodeList = inDivNodelist.extractAllNodesThatMatch(tableFilter);
        for (int m = 0; m < tblNodeList.size(); m++) {
            if (hasCellLinkTo((TableTag) tblNodeList.elementAt(m), name)) {
                // Guard the look-ahead: the matching div may be the last one.
                return (i + 1 < nodes.length) ? nodes[i + 1] : null;
            }
        }
    }
    return null;
}

/** Tells whether any cell of {@code table} directly contains a link whose href equals {@code href} (ignoring case). */
private boolean hasCellLinkTo(TableTag table, String href) {
    if (href == null) {
        return false;
    }
    for (int n = 0; n < table.getChildCount(); n++) {
        Node rowNode = table.childAt(n);
        if (!(rowNode instanceof TableRow)) {
            continue;
        }
        TableRow row = (TableRow) rowNode;
        for (int a = 0; a < row.getChildCount(); a++) {
            Node colNode = row.childAt(a);
            if (!(colNode instanceof TableColumn)) {
                continue;
            }
            TableColumn cell = (TableColumn) colNode;
            for (int b = 0; b < cell.getChildCount(); b++) {
                Node attrNode = cell.childAt(b);
                if (attrNode instanceof LinkTag
                        && href.equalsIgnoreCase(((LinkTag) attrNode).getLink())) {
                    return true;
                }
            }
        }
    }
    return false;
}
/**
 * Collects the entries of the first bullet list found among the direct
 * children of the given node.
 *
 * <p>Each bullet becomes a three-element array: slots 0 and 1 hold the first
 * two text nodes of the bullet, slot 2 holds a table found in the bullet (the
 * last one, if there are several). Missing parts stay {@code null}.
 *
 * @param node container node, normally a div; may be {@code null}
 * @return the entries of the first bullet list, or {@code null} when the node
 *         is {@code null}, has no children, or contains no bullet list
 */
private ArrayList<Node[]> getBulList(Node node) {
    if (node == null) {
        return null;
    }
    NodeList inDivNodelist = node.getChildren();
    if (inDivNodelist == null) {
        return null;
    }
    NodeList blNodeList = inDivNodelist
            .extractAllNodesThatMatch(new NodeClassFilter(BulletList.class));
    for (int i = 0; i < blNodeList.size(); i++) {
        Node liNode = blNodeList.elementAt(i);
        if (!(liNode instanceof BulletList)) {
            continue;
        }
        BulletList blist = (BulletList) liNode;
        ArrayList<Node[]> blArrayList = new ArrayList<Node[]>();
        for (int j = 0; j < blist.getChildCount(); j++) {
            Node inLiNode = blist.getChild(j);
            if (!(inLiNode instanceof Bullet)) {
                continue;
            }
            Bullet bulnode = (Bullet) inLiNode;
            Node[] attrs = new Node[3];
            int index = 0;
            for (int n = 0; n < bulnode.getChildCount(); n++) {
                Node part = bulnode.childAt(n);
                if (part instanceof TextNode) {
                    // Only the first two text nodes are kept.
                    if (index < 2) {
                        attrs[index++] = part;
                    }
                } else if (part instanceof TableTag) {
                    attrs[2] = part;
                }
            }
            blArrayList.add(attrs);
        }
        // Only the first bullet list is reported.
        return blArrayList;
    }
    return null;
}
/**
 * Finds the row whose first cell contains a link with the given link text.
 *
 * <p>All tables of the document are searched in order. The comparison ignores
 * case. The returned index is relative to the table in which the match was
 * found.
 *
 * @param name link text to search for
 * @return the row index of the first match, or {@code -1} when there is none
 * @throws Exception if the document cannot be parsed
 */
public int getTblNameNo(String name) throws Exception {
    Parser parser = Parser.createParser(content, zhCode);
    Node[] nodes = parser.parse(new NodeClassFilter(TableTag.class)).toNodeArray();
    for (Node node : nodes) {
        if (!(node instanceof TableTag)) {
            continue;
        }
        TableTag tablenode = (TableTag) node;
        int count = tablenode.getRowCount();
        for (int m = 0; m < count; m++) {
            TableRow trow = tablenode.getRow(m);
            if (trow == null) {
                continue;
            }
            TableColumn[] tcolumns = trow.getColumns();
            if (tcolumns.length == 0) {
                // Rows without any data cell (presumably header-only rows) cannot match.
                continue;
            }
            for (Node colHref : tcolumns[0].getChildrenAsNodeArray()) {
                if (colHref instanceof LinkTag) {
                    String tmpname = ((LinkTag) colHref).getLinkText();
                    // Does the name in column 0 match the requested one?
                    if (tmpname != null && tmpname.equalsIgnoreCase(name)) {
                        return m;
                    }
                }
            }
        }
    }
    return -1;
}
/**
 * Returns the content of one table cell as a string in which the individual
 * values are separated by {@code '@'}.
 *
 * <p>A link child contributes {@code "Name:<link text>@Link:<href>@"}; any
 * other child contributes {@code "Other:<text>@"}. The href and the other
 * text are trimmed.
 *
 * <p>The cell is read from the first table that has the requested row and
 * column. Tables that are too small are skipped.
 *
 * @param row zero-based row index
 * @param col zero-based column index
 * @return the cell string, or {@code null} when an index is negative or no
 *         table has such a cell
 * @throws Exception if the document cannot be parsed
 */
public String getTableValue(int row, int col) throws Exception {
    if (row < 0 || col < 0) {
        return null;
    }
    Parser parser = Parser.createParser(content, zhCode);
    Node[] nodes = parser.parse(new NodeClassFilter(TableTag.class)).toNodeArray();
    for (Node node : nodes) {
        if (!(node instanceof TableTag)) {
            continue;
        }
        TableRow trow = ((TableTag) node).getRow(row);
        if (trow == null) {
            // This table has fewer rows than requested.
            continue;
        }
        TableColumn[] tcolumns = trow.getColumns();
        if (col >= tcolumns.length) {
            // This row has fewer cells than requested.
            continue;
        }
        StringBuilder rowtext = new StringBuilder();
        for (Node colHref : tcolumns[col].getChildrenAsNodeArray()) {
            if (colHref instanceof LinkTag) {
                LinkTag link = (LinkTag) colHref;
                String cline = link.getLink().trim();
                // '@' terminates every value; consumers split on it.
                rowtext.append("Name:").append(link.getLinkText()).append("@");
                rowtext.append("Link:").append(cline).append("@");
            } else {
                rowtext.append("Other:").append(colHref.getText().trim()).append("@");
            }
        }
        return rowtext.toString();
    }
    return null;
}
/**
 * Collects the cell strings of the given row from every table that has such a
 * row, and prints each of them.
 *
 * <p>A link child contributes {@code "Name:<link text>@Link:<href>@"}; any
 * other child contributes {@code "Other:<text>@"}. Nothing is trimmed here.
 *
 * @param row zero-based row index
 * @return one string per cell, in table order then column order; empty when
 *         the index is negative or no table has that row
 * @throws Exception if the document cannot be parsed
 */
public ArrayList<String> getTableValueList(int row) throws Exception {
    ArrayList<String> valuelist = new ArrayList<String>();
    if (row < 0) {
        return valuelist;
    }
    Parser parser = Parser.createParser(content, zhCode);
    Node[] nodes = parser.parse(new NodeClassFilter(TableTag.class)).toNodeArray();
    for (Node node : nodes) {
        if (!(node instanceof TableTag)) {
            continue;
        }
        TableRow trow = ((TableTag) node).getRow(row);
        if (trow == null) {
            // This table has fewer rows than requested.
            continue;
        }
        for (TableColumn tcolumn : trow.getColumns()) {
            StringBuilder rowtext = new StringBuilder();
            for (Node colHref : tcolumn.getChildrenAsNodeArray()) {
                if (colHref instanceof LinkTag) {
                    LinkTag link = (LinkTag) colHref;
                    rowtext.append("Name:").append(link.getLinkText()).append("@");
                    rowtext.append("Link:").append(link.getLink()).append("@");
                } else {
                    rowtext.append("Other:").append(colHref.getText()).append("@");
                }
            }
            String cell = rowtext.toString();
            print(cell);
            valuelist.add(cell);
        }
    }
    return valuelist;
}
/**
 * Debug helper: prints every cell of the given row for each table that has
 * such a row, followed by that table's row count.
 *
 * @param row zero-based row index; nothing is printed for a negative value
 * @throws Exception if the document cannot be parsed
 */
public void readTable(int row) throws Exception {
    if (row < 0) {
        return;
    }
    Parser parser = Parser.createParser(content, zhCode);
    Node[] nodes = parser.parse(new NodeClassFilter(TableTag.class)).toNodeArray();
    for (Node node : nodes) {
        if (!(node instanceof TableTag)) {
            continue;
        }
        TableTag tablenode = (TableTag) node;
        TableRow trow = tablenode.getRow(row);
        if (trow == null) {
            // This table has fewer rows than requested.
            continue;
        }
        TableColumn[] tcolumns = trow.getColumns();
        for (int m = 0; m < tcolumns.length; m++) {
            print("--------Col: " + m + " ---------");
            StringBuilder rowtext = new StringBuilder();
            for (Node colHref : tcolumns[m].getChildrenAsNodeArray()) {
                if (colHref instanceof LinkTag) {
                    LinkTag link = (LinkTag) colHref;
                    rowtext.append("Name: ").append(link.getLinkText()).append(" ");
                    rowtext.append("Link: ").append(link.getLink()).append(" ");
                } else {
                    rowtext.append("Other Text: ").append(colHref.getText()).append(" ");
                }
            }
            print(rowtext.toString());
            print("-------------------");
        }
        print("Count is: " + String.valueOf(tablenode.getRowCount()));
    }
}
/**
 * Finds the table that is introduced by a heading with the given text.
 *
 * <p>Every table is scanned for rows with exactly one cell. If such a cell
 * contains a heading tag and one of the heading's children has text equal to
 * {@code name} (ignoring case), the enclosing table is returned. Headings that
 * do not match are printed to standard output.
 *
 * @param name heading text to look for
 * @return the matching table, or {@code null} when there is none
 * @throws ParserException if the document cannot be parsed
 */
private TableTag getArrayTableNode(String name) throws ParserException {
    Parser parser = Parser.createParser(content, zhCode);
    OrFilter onlyTables = new OrFilter();
    onlyTables.setPredicates(new NodeFilter[] { new NodeClassFilter(TableTag.class) });
    Node[] found = parser.parse(onlyTables).toNodeArray();

    for (Node candidate : found) {
        if (!(candidate instanceof TableTag)) {
            continue;
        }
        TableTag table = (TableTag) candidate;
        int rowTotal = table.getRowCount();
        for (int r = 0; r < rowTotal; r++) {
            TableColumn[] cells = table.getRow(r).getColumns();
            // Only single-cell rows can be heading rows; member rows have two cells.
            if (cells.length != 1) {
                continue;
            }
            for (Node cellChild : cells[0].getChildrenAsNodeArray()) {
                if (!(cellChild instanceof HeadingTag)) {
                    continue;
                }
                NodeList headingParts = cellChild.getChildren();
                if (headingParts != null) {
                    for (int k = 0; k < headingParts.size(); k++) {
                        String partText = headingParts.elementAt(k).getText();
                        if (partText.equalsIgnoreCase(name)) {
                            return table;
                        }
                    }
                }
                System.out.println("tblheadname is: " + cellChild.getText().trim());
            }
        }
    }
    return null;
}
/**
 * Extracts the member rows of a two-column table.
 *
 * <p>Rows with fewer than two cells are skipped. For every other row a
 * three-element array is produced: slot 0 and slot 1 hold the concatenated
 * text of the children of cell 0 and cell 1. A link found in cell
 * {@code col} stores its href in slot {@code col + 1}, so slot 2 ends up with
 * the href of a link in cell 1, and a cell-0 href is visible in slot 1 only
 * when cell 1 has no children.
 *
 * <p>The text of each cell is accumulated separately, so a href stored by
 * cell 0 can no longer be prepended to the text of cell 1.
 *
 * @param tablenode table to read; may be {@code null}
 * @return the member rows; empty when the table is {@code null} or has no
 *         two-column rows
 * @throws Exception declared for callers
 */
private ArrayList<String[]> getAttrsList(TableTag tablenode)
        throws Exception {
    ArrayList<String[]> attrlist = new ArrayList<String[]>();
    if (tablenode == null) {
        return attrlist;
    }
    final int attrColSize = 2;
    // NOTE(review): this literal is a single whitespace character as it appears in
    // this file; it may originally have been the "&nbsp;" entity -- confirm.
    final String spaceTag = " ";
    int rowcount = tablenode.getRowCount();
    for (int i = 0; i < rowcount; i++) {
        TableColumn[] tcolumns = tablenode.getRow(i).getColumns();
        if (tcolumns.length < attrColSize) {
            // Fewer than two cells: not a member row.
            continue;
        }
        String[] values = new String[attrColSize + 1];
        for (int col = 0; col < attrColSize; col++) {
            Node[] colnodes = tcolumns[col].getChildrenAsNodeArray();
            if (colnodes.length == 0) {
                // Empty cell: leave the slot as it is.
                continue;
            }
            // A cell may consist of several nodes; their texts are joined in order.
            StringBuilder cellText = new StringBuilder();
            for (Node colHref : colnodes) {
                if (colHref instanceof LinkTag) {
                    LinkTag link = (LinkTag) colHref;
                    cellText.append(link.getLinkText().trim());
                    values[col + 1] = link.getLink();
                } else {
                    String name = colHref.getText();
                    cellText.append(name.contains(spaceTag)
                            ? name.replace(spaceTag, "")
                            : name.trim());
                }
            }
            values[col] = cellText.toString();
        }
        attrlist.add(values);
    }
    return attrlist;
}
/**
 * Collects the two-column rows of every table in the document.
 *
 * <p>Each list element is a two-element array holding the concatenated text
 * of the children of cell 0 and cell 1 (link text for links, node text
 * otherwise). Rows with fewer than two cells are skipped. For single-cell rows
 * the text of any heading tag in the cell is printed to standard output.
 *
 * @return the rows in document order; never {@code null}
 * @throws Exception if the document cannot be parsed
 */
public ArrayList<String[]> getAttrsList() throws Exception {
    final int attrColSize = 2; // member tables of this document type have two columns
    // NOTE(review): this literal is a single whitespace character as it appears in
    // this file; it may originally have been the "&nbsp;" entity -- confirm.
    final String spaceTag = " ";
    ArrayList<String[]> attrlist = new ArrayList<String[]>();
    Parser parser = Parser.createParser(content, zhCode);
    Node[] nodes = parser.parse(new NodeClassFilter(TableTag.class)).toNodeArray();
    for (Node node : nodes) {
        if (!(node instanceof TableTag)) {
            continue;
        }
        TableTag tablenode = (TableTag) node;
        int count = tablenode.getRowCount();
        for (int m = 0; m < count; m++) {
            TableColumn[] tcolumns = tablenode.getRow(m).getColumns();
            if (tcolumns.length < attrColSize) {
                // Fewer than two cells: not a member row. Single-cell rows may carry a table heading.
                if (tcolumns.length == 1) {
                    for (Node colHref : tcolumns[0].getChildrenAsNodeArray()) {
                        if (colHref instanceof HeadingTag) {
                            System.out.println("tblheadname is: "
                                    + colHref.getText().trim());
                        }
                    }
                }
                continue;
            }
            String[] values = new String[attrColSize];
            for (int col = 0; col < attrColSize; col++) {
                Node[] colnodes = tcolumns[col].getChildrenAsNodeArray();
                if (colnodes.length == 0) {
                    // Empty cell: the slot stays null.
                    continue;
                }
                // A cell may consist of several nodes; their texts are joined in order.
                StringBuilder cellText = new StringBuilder();
                for (Node colHref : colnodes) {
                    if (colHref instanceof LinkTag) {
                        cellText.append(((LinkTag) colHref).getLinkText().trim());
                    } else {
                        String name = colHref.getText();
                        cellText.append(name.contains(spaceTag)
                                ? name.replace(spaceTag, "")
                                : name.trim());
                    }
                }
                values[col] = cellText.toString();
            }
            attrlist.add(values);
        }
    }
    return attrlist;
}
/**
 * Tells whether a string is empty once leading and trailing whitespace is
 * removed.
 *
 * @param astr string to test; may be {@code null}
 * @return {@code true} for {@code null}, the empty string, or a string that
 *         trims to the empty string
 */
private boolean isTrimEmpty(String astr) {
    return null == astr || astr.trim().length() == 0;
}
/**
 * Treats the loaded content as a standard HTML page and prints its title
 * followed by the trimmed text of its body.
 *
 * @throws Exception if the content cannot be parsed
 */
public void readByHtml() throws Exception {
    Parser pageParser = Parser.createParser(content, zhCode);
    HtmlPage page = new HtmlPage(pageParser);
    pageParser.visitAllNodesWith(page);

    print(page.getTitle());

    NodeList bodyNodes = page.getBody();
    String bodyText = bodyNodes.asString();
    print(bodyText.trim());
}
/**
 * Prints the text nodes, link targets, titles and tables of the loaded
 * content, one line per node, skipping lines that are empty after trimming.
 * For a table, its row count is printed before the table's own text.
 *
 * @throws Exception if the content cannot be parsed
 */
public void readTextAndLinkAndTitle() throws Exception {
    Parser parser = Parser.createParser(content, zhCode);
    // One class filter per node kind of interest: text, hyperlink, title, table.
    NodeFilter[] wanted = new NodeFilter[] {
            new NodeClassFilter(TextNode.class),
            new NodeClassFilter(LinkTag.class),
            new NodeClassFilter(TitleTag.class),
            new NodeClassFilter(TableTag.class) };
    OrFilter anyWanted = new OrFilter();
    anyWanted.setPredicates(wanted);

    Node[] matches = parser.parse(anyWanted).toNodeArray();
    String line = "";
    for (Node current : matches) {
        if (current instanceof TextNode) {
            line = ((TextNode) current).getText();
        } else if (current instanceof LinkTag) {
            line = ((LinkTag) current).getLink();
        } else if (current instanceof TitleTag) {
            line = ((TitleTag) current).getTitle();
        } else if (current instanceof TableTag) {
            TableTag table = (TableTag) current;
            line = table.getText();
            print("Count is: " + String.valueOf(table.getRowCount()));
        }
        if (!isTrimEmpty(line)) {
            print(line);
        }
    }
}
/**
 * Tells whether a string is absent or has no characters.
 *
 * @param astr string to test; may be {@code null}
 * @return {@code true} when {@code astr} is {@code null} or has length 0
 */
private boolean isBlank(String astr) {
    return astr == null || astr.length() == 0;
}
/**
 * Writes the given text as one line to standard output.
 *
 * @param info text to print
 */
private void print(String info) {
System.out.println(info);
}
}
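// Html文件解析操作接口 (page title: "HTML file parsing interface")
// 最新推荐文章于 2024-02-20 17:55:32 发布 (web-page footer captured with the listing; not part of the program)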