Html文件解析操作接口

最新推荐文章于 2024-02-20 17:55:32 发布
purplum
最新推荐文章于 2024-02-20 17:55:32 发布
阅读量660
点赞数
本文链接：https://blog.csdn.net/yinxinla/article/details/9070455
版权
import java.io.BufferedReader;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Iterator;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.filters.TagNameFilter;
import org.htmlparser.nodes.TagNode;
import org.htmlparser.nodes.TextNode;
import org.htmlparser.tags.Bullet;
import org.htmlparser.tags.BulletList;
import org.htmlparser.tags.HeadingTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.TableColumn;
import org.htmlparser.tags.TableRow;
import org.htmlparser.tags.TableTag;
import org.htmlparser.tags.TitleTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;


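/**
 * Helper class for loading an HTML file and querying its tables, links and
 * lists with the org.htmlparser library.
 */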
public class HtmlParser {

	// Location of the HTML document as a URL string; build() opens it through java.net.URL.
	private String path = "file:///C:\\temp\\oAAAAam\\_table.html";
	// Full text of the loaded document; assigned by build() and re-parsed by every query method.
	private String content = "";
	private String zhCode = "GBK"; // Character encoding used to decode the document; defaults to GBK, may be set to e.g. utf-8 or utf-16
	// Earlier array form of the list below, kept for reference:
	// private String[] SimpleStruct = new
	// String[]{"BYTE","WORD16","WORD32","DWORD"};
	// Names of the simple field types; filled by initialArray().
	private ArrayList<String> SimpleStruct = new ArrayList<String>();

	/**
	 * Creates a parser for the default document location held in
	 * {@code path} and loads the document immediately.
	 * <p>
	 * Loading is best-effort: a failure is reported on standard error and the
	 * constructor still returns normally, so existing callers are unaffected.
	 */
	public HtmlParser() {
		try {
			build();
		} catch (Exception e) {
			// Do not hide the failure: without this message every later query
			// would simply find nothing and give no hint why.
			System.err.println("HtmlParser: failed to load " + this.path + ": "
					+ e);
		}
	}

	/**
	 * Creates a parser for the given document and loads it immediately, using
	 * the default encoding.
	 * <p>
	 * Loading is best-effort: a failure is reported on standard error and the
	 * constructor still returns normally, so existing callers are unaffected.
	 * 
	 * @param path
	 *            location of the HTML file; {@code file:///} is prepended
	 *            unless the value already starts with {@code file:}
	 */
	public HtmlParser(String path) {
		if (path.startsWith("file:")) {
			this.path = path;
		} else {
			this.path = "file:///" + path;
		}

		try {
			build();
		} catch (Exception e) {
			// Do not hide the failure: without this message every later query
			// would simply find nothing and give no hint why.
			System.err.println("HtmlParser: failed to load " + this.path + ": "
					+ e);
		}
	}

	/**
	 * Creates a parser for the given document and loads it immediately,
	 * decoding it with the given character encoding.
	 * <p>
	 * Loading is best-effort: a failure is reported on standard error and the
	 * constructor still returns normally, so existing callers are unaffected.
	 * 
	 * @param path
	 *            location of the HTML file; {@code file:///} is prepended
	 *            unless the value already starts with {@code file:}
	 * @param code
	 *            name of the character encoding used to read the file
	 */
	public HtmlParser(String path, String code) {
		if (path.startsWith("file:")) {
			this.path = path;
		} else {
			this.path = "file:///" + path;
		}
		this.zhCode = code;

		try {
			build();
		} catch (Exception e) {
			// Do not hide the failure: without this message every later query
			// would simply find nothing and give no hint why.
			System.err.println("HtmlParser: failed to load " + this.path
					+ " (" + this.zhCode + "): " + e);
		}
	}

	/**
	 * Registers the names of the simple field types in {@code SimpleStruct}.
	 */
	private void initialArray() {
		String[] simpleTypeNames = { "BYTE", "WORD16", "WORD32", "DWORD" };
		for (String typeName : simpleTypeNames) {
			SimpleStruct.add(typeName);
		}
	}

	/**
	 * Initial load step shared by all constructors: registers the simple type
	 * names, opens {@code path} as a URL and reads the whole document into
	 * {@code content}, decoding it with {@code zhCode}.
	 * <p>
	 * Lines are joined with a single {@code "\n"} each. {@code content} is
	 * only assigned once the document has been read completely.
	 * 
	 * @throws Exception
	 *             if the URL is malformed, the encoding is not supported or
	 *             the document cannot be read
	 */
	private void build() throws Exception {
		initialArray();
		URL url = new URL(path);
		URLConnection conn = url.openConnection();
		// Kept from the original code although nothing is ever written to the
		// connection. NOTE(review): probably unnecessary for a read-only load.
		conn.setDoOutput(true);

		StringBuilder sb = new StringBuilder();
		InputStream inputStream = conn.getInputStream();
		try {
			BufferedReader in = new BufferedReader(new InputStreamReader(
					inputStream, zhCode));
			String inputLine;
			while ((inputLine = in.readLine()) != null) {
				sb.append(inputLine);
				sb.append("\n");
			}
		} finally {
			// Closing the underlying stream releases the file handle even when
			// the reader could not be created or reading failed half-way.
			inputStream.close();
		}

		this.content = sb.toString();
	}

	/**
	 * Manual smoke test: looks up one hard-coded macro name in the default
	 * document, prints its link target and type, then opens the linked
	 * document and prints every row of its "数据成员" table.
	 * 
	 * @param args
	 *            unused
	 * @throws Exception
	 *             if parsing fails
	 */
	public static void main(String[] args) throws Exception {
		HtmlParser hp = new HtmlParser();

		String tblname = "DSAAA_SDH_A";
		// getLinkByTblName returns null when the name is not found, so the
		// array must be checked before it is indexed.
		String[] linkInfo = hp.getLinkByTblName(tblname);
		if (linkInfo == null || linkInfo[1] == null) {
			return;
		}
		String name = linkInfo[1];
		if (name.equals("1")) {
			// "1" is the marker for "cell has no link": report both as "无".
			System.out.println("name: " + "无");
			System.out.println("type: " + "无");
			return;
		}

		System.out.println("name: " + name);
		String type = hp.getTypeByTblName(tblname);
		System.out.println("type: " + type);

		// Open the linked field-type document.
		String path2 = "file:///C:\\temp\\oaaaaaam\\" + name;
		HtmlParser hp2 = new HtmlParser(path2);

		TableTag tblnode = hp2.getArrayTableNode("数据成员");
		if (tblnode == null) {
			// No such table in the linked document; nothing to list.
			return;
		}
		ArrayList<String[]> arraylist = hp2.getAttrsList(tblnode);
		for (String[] row : arraylist) {
			StringBuilder strs = new StringBuilder();
			for (String str : row) {
				strs.append(str).append(":");
			}
			System.out.println("str: " + strs);
		}
	}

	/**
	 * Resolves a macro name to the list of its fields.
	 * <p>
	 * The macro is looked up in this document. If its type is one of the
	 * simple types, a single entry is built from the table cell itself;
	 * otherwise the linked document is opened and the rows of its "数据成员"
	 * table are returned.
	 * 
	 * @param macroname
	 *            macro name to look up in the first column of the table
	 * @return the field rows, or {@code null} when the macro is not found,
	 *         has no link, or the linked document has no "数据成员" table
	 * @throws Exception
	 *             if parsing fails
	 */
	public ArrayList<String[]> getAttrArrayList(String macroname)
			throws Exception {
		String[] fullTblStr = getLinkByTblName(macroname);
		if (fullTblStr == null) {
			return null;
		}
		String name = fullTblStr[1];
		if (name.equals("1")) {
			// "1" is the marker for "cell has no link": report both as "无".
			System.out.println("name: " + "无");
			System.out.println("type: " + "无");
			return null;
		}

		System.out.println("name: " + name);
		String type = getTypeByTblName(macroname);
		System.out.println("type: " + type);

		if (SimpleStruct.contains(type)) {
			// Simple types are described by the cell alone, so the linked
			// document does not need to be opened at all.
			return getSimpleList(macroname, fullTblStr[0], type);
		}

		String path2 = "file:///" + AConst.usrpath
				+ "files\\ChmFiles\\oaaam\\" + name;
		HtmlParser hp2 = new HtmlParser(path2);
		TableTag tblnode = hp2.getArrayTableNode("数据成员");
		if (tblnode == null) {
			// Linked document missing or without a member table.
			return null;
		}
		return hp2.getAttrsList(tblnode);
	}

	/**
	 * Builds the one-entry field list for a simple type from the encoded
	 * cell text.
	 * <p>
	 * The entry is {@code {type, name, link}}: the link is the text between
	 * the first {@code "@Link:"} and the following {@code "@"}, and the name
	 * is whatever follows the first {@code "::"} in the part before
	 * {@code "@Link:"}.
	 * 
	 * @param macroname
	 *            not used by this method
	 * @param fulllnk
	 *            encoded cell text containing {@code "@Link:"}
	 * @param type
	 *            type name stored as the first element
	 * @return a list holding exactly one three-element array
	 * @throws Exception
	 *             declared for callers; the body itself only does string work
	 */
	private ArrayList<String[]> getSimpleList(String macroname, String fulllnk,
			String type) throws Exception {
		String[] aroundLink = fulllnk.split("@Link:");

		String afterLink = aroundLink[1];
		String link = afterLink.substring(0, afterLink.indexOf("@"));

		String beforeLink = aroundLink[0];
		String member = beforeLink.substring(beforeLink.indexOf("::") + 2);

		ArrayList<String[]> result = new ArrayList<String[]>();
		result.add(new String[] { type, member, link });
		return result;
	}

	/**
	 * Returns the link file name recorded for a macro name.
	 * 
	 * @param macroname
	 *            macro name to look up in the first column of the table
	 * @return the second element of {@link #getLinkByTblName(String)}, or
	 *         {@code null} when the macro is not found
	 * @throws Exception
	 *             if parsing fails
	 */
	public String getHrefName(String macroname) throws Exception {
		// getLinkByTblName returns null for unknown names; indexing it
		// directly would throw a NullPointerException.
		String[] linkInfo = getLinkByTblName(macroname);
		return (linkInfo == null) ? null : linkInfo[1];
	}

	/**
	 * Reads the field mapping information of a field document: the first two
	 * text nodes of every bullet in the div associated with a link.
	 * 
	 * @param path
	 *            file name of the field document, relative to the ChmFiles
	 *            directory used below
	 * @param hrefname
	 *            link target identifying the div inside that document
	 * @return one two-element array per bullet; empty when the div or its
	 *         bullet list cannot be found
	 * @throws ParserException
	 *             if the document cannot be parsed
	 */
	public ArrayList<String[]> getEnumTypeList(String path, String hrefname)
			throws ParserException {
		// NOTE(review): this directory is spelled "oaaaam" while the sibling
		// methods use "oaaam" - confirm which one is intended.
		String path2 = "file:///" + AConst.usrpath
				+ "files\\ChmFiles\\oaaaam\\" + path;
		HtmlParser hp2 = new HtmlParser(path2);

		Node divNode = hp2.getDivByTdName(hrefname);
		if (divNode == null) {
			// No div for this link: nothing to report.
			return new ArrayList<String[]>();
		}
		ArrayList<Node[]> listNodeArray = hp2.getBulList(divNode);
		if (listNodeArray == null) {
			// Div without a bullet list.
			return new ArrayList<String[]>();
		}

		// Turn the collected nodes into name/value text pairs.
		return hp2.getEnumTypeList(listNodeArray);
	}

	/**
	 * Reads the ENUM table of a field document: the rows of the first table
	 * found inside the bullets of the div associated with a link.
	 * 
	 * @param path
	 *            file name of the field document, obtained from the macro
	 *            name
	 * @param hrefname
	 *            link target identifying the div inside that document
	 * @return one three-element array per table row, or {@code null} when the
	 *         div, its bullet list or the table cannot be found
	 * @throws ParserException
	 *             if the document cannot be parsed
	 */
	public ArrayList<String[]> getEnumArrayList(String path, String hrefname)
			throws ParserException {
		String path2 = "file:///" + AConst.usrpath
				+ "files\\ChmFiles\\oaaam\\" + path;
		HtmlParser hp2 = new HtmlParser(path2);

		Node divNode = hp2.getDivByTdName(hrefname);
		if (divNode == null) {
			// No div for this link; same "not found" result as a missing table.
			return null;
		}
		ArrayList<Node[]> listNodeArray = hp2.getBulList(divNode);
		if (listNodeArray == null) {
			// Div without a bullet list.
			return null;
		}
		return hp2.getEnumList(listNodeArray);
	}

	/**
	 * Finds the header cell whose text contains "中文".
	 * <p>
	 * Only the table child at index 1 is inspected, and only if it is a row.
	 * 
	 * @param tblNode
	 *            table to inspect
	 * @return child index, within that row, of the first matching cell, or
	 *         -1 when there is none
	 */
	private int getZhHead(TableTag tblNode) {
		Node rowNode = tblNode.childAt(1);
		if (!(rowNode instanceof TableRow)) {
			return -1;
		}
		TableRow headerRow = (TableRow) rowNode;
		for (int j = 0; j < headerRow.getChildCount(); j++) {
			Node colNode = headerRow.childAt(j);
			if (!(colNode instanceof TableColumn)) {
				continue;
			}
			Node childNode = ((TableColumn) colNode).getFirstChild();
			if (childNode == null) {
				// Empty cell: there is no text to compare.
				continue;
			}
			String head = childNode.getText();
			if (head != null && head.contains("中文")) {
				return j;
			}
		}
		return -1;
	}

	/**
	 * Finds the header cell whose text contains "英文".
	 * <p>
	 * Only the table child at index 1 is inspected, and only if it is a row.
	 * 
	 * @param tblNode
	 *            table to inspect
	 * @return child index, within that row, of the first matching cell, or
	 *         -1 when there is none
	 */
	private int getEnHead(TableTag tblNode) {
		Node rowNode = tblNode.childAt(1);
		if (!(rowNode instanceof TableRow)) {
			return -1;
		}
		TableRow headerRow = (TableRow) rowNode;
		for (int j = 0; j < headerRow.getChildCount(); j++) {
			Node colNode = headerRow.childAt(j);
			if (!(colNode instanceof TableColumn)) {
				continue;
			}
			Node childNode = ((TableColumn) colNode).getFirstChild();
			if (childNode == null) {
				// Empty cell: there is no text to compare.
				continue;
			}
			String head = childNode.getText();
			if (head != null && head.contains("英文")) {
				return j;
			}
		}
		return -1;
	}

	/**
	 * Extracts the ENUM rows from the first bullet entry that carries a
	 * table.
	 * <p>
	 * Table children from index 2 onward are read. For every row, the first
	 * child node of each cell supplies one value (trimmed text, or the string
	 * {@code "Null"} for an empty cell); at most three values are kept per
	 * row, and unused slots stay {@code null}.
	 * 
	 * @param listNodeArray
	 *            bullet entries whose third element is the optional table
	 * @return one three-element array per row of the first table found, or
	 *         {@code null} when the list is {@code null} or no entry has a
	 *         table
	 */
	private ArrayList<String[]> getEnumList(ArrayList<Node[]> listNodeArray) {
		if (listNodeArray == null) {
			return null;
		}
		final int maxValues = 3;
		for (Node[] nodes : listNodeArray) {
			if (!(nodes[2] instanceof TableTag)) {
				continue;
			}
			TableTag tblNode = (TableTag) nodes[2];
			ArrayList<String[]> enumlist = new ArrayList<String[]>();
			for (int i = 2; i < tblNode.getChildCount(); i++) {
				Node rowNode = tblNode.childAt(i);
				if (!(rowNode instanceof TableRow)) {
					continue;
				}
				TableRow row = (TableRow) rowNode;
				String[] values = new String[maxValues];
				int index = 0;
				for (int j = 0; j < row.getChildCount() && index < maxValues; j++) {
					Node colNode = row.childAt(j);
					if (!(colNode instanceof TableColumn)) {
						continue;
					}
					Node childNode = ((TableColumn) colNode).getFirstChild();
					// getText() is declared on Node, so no cast is needed and
					// node kinds other than tag/text no longer fail the cast.
					values[index++] = (childNode != null) ? childNode.getText()
							.trim() : "Null";
				}
				enumlist.add(values);
			}
			// Only the first entry with a table is reported.
			return enumlist;
		}
		return null;
	}

	/**
	 * Converts bullet entries into text pairs.
	 * <p>
	 * For every entry, elements 0 and 1 are copied as trimmed text when they
	 * are text nodes; otherwise the corresponding slot stays {@code null}.
	 * 
	 * @param listNodeArray
	 *            bullet entries as produced by {@code getBulList}
	 * @return one two-element array per entry, in the same order
	 */
	private ArrayList<String[]> getEnumTypeList(ArrayList<Node[]> listNodeArray) {
		ArrayList<String[]> pairs = new ArrayList<String[]>();
		for (Node[] entry : listNodeArray) {
			String[] pair = new String[2];
			for (int slot = 0; slot < pair.length; slot++) {
				Node part = entry[slot];
				if (part instanceof TextNode) {
					pair[slot] = part.getText().trim();
				}
			}
			pairs.add(pair);
		}
		return pairs;
	}

	/**
	 * Looks up a name in column 0 of the table and extracts the link file
	 * name from column 2 of the same row.
	 * <p>
	 * The cell text comes from {@link #getTableValue(int, int)}, where a link
	 * is encoded as {@code "Link:<href>@"}. From the first such href the part
	 * before {@code '#'} is returned; without a {@code '#'} the href is cut
	 * right after its first {@code "html"}; without either it is returned
	 * whole.
	 * 
	 * @param name
	 *            name to look up
	 * @return a two-element array: [0] the full encoded cell text, [1] the
	 *         link file name, or {@code "1"} when the cell has no link;
	 *         {@code null} when the name or the cell is not found
	 * @throws Exception
	 *             if parsing fails
	 */
	public String[] getLinkByTblName(String name) throws Exception {
		int rowno = getTblNameNo(name);
		if (rowno < 0) {
			return null;
		}
		String lnkvalue = getTableValue(rowno, 2);
		if (lnkvalue == null) {
			return null;
		}

		String[] result = new String[2];
		result[0] = lnkvalue;

		final String marker = "Link:";
		int markerAt = lnkvalue.indexOf(marker);
		if (markerAt < 0) {
			// No encoded link in this cell.
			result[1] = "1";
			return result;
		}

		// Isolate the href first, so that a '#' or "html" occurring elsewhere
		// in the cell text cannot produce invalid substring bounds.
		int start = markerAt + marker.length();
		int end = lnkvalue.indexOf('@', start);
		if (end < 0) {
			end = lnkvalue.length();
		}
		String href = lnkvalue.substring(start, end);

		int hashAt = href.indexOf('#');
		if (hashAt >= 0) {
			href = href.substring(0, hashAt);
		} else {
			int htmlAt = href.indexOf("html");
			if (htmlAt >= 0) {
				href = href.substring(0, htmlAt + "html".length());
			}
		}

		result[1] = href;
		return result;
	}

	/**
	 * Looks up a name in column 0 of the table and extracts the type text
	 * from column 2 of the same row.
	 * <p>
	 * The type is the text between the first {@code "Other:"} and the next
	 * {@code "@"} of the encoded cell text, trimmed.
	 * 
	 * @param name
	 *            name to look up
	 * @return the type text, or {@code null} when the name is not found, the
	 *         cell has no link, or the cell has no {@code "Other:"} part
	 * @throws Exception
	 *             if parsing fails
	 */
	public String getTypeByTblName(String name) throws Exception {
		int rowno = getTblNameNo(name);
		if (rowno < 0) {
			// Same "not found" handling as getLinkByTblName.
			return null;
		}
		String lnkvalue = getTableValue(rowno, 2);
		if (lnkvalue == null || !lnkvalue.contains("Link")) {
			return null;
		}

		final String marker = "Other:";
		int markerAt = lnkvalue.indexOf(marker);
		if (markerAt < 0) {
			return null;
		}
		String rest = lnkvalue.substring(markerAt + marker.length());
		int endAt = rest.indexOf("@");
		if (endAt >= 0) {
			rest = rest.substring(0, endAt);
		}
		return rest.trim();
	}

	/**
	 * Finds the div that follows the div whose table contains a given link.
	 * <p>
	 * All {@code div} elements of the document are collected. The first one
	 * that has, among its direct children, a table with a cell link equal to
	 * {@code name} (ignoring case) is located, and the next div in the
	 * collected order is returned.
	 * 
	 * @param name
	 *            link target to search for
	 * @return the div after the matching one, or {@code null} when no div
	 *         matches or the matching div is the last one
	 * @throws ParserException
	 *             if the document cannot be parsed
	 */
	private Node getDivByTdName(String name) throws ParserException {
		Parser parser = Parser.createParser(content, zhCode);
		NodeFilter tableFilter = new NodeClassFilter(TableTag.class);
		Node[] divs = parser.extractAllNodesThatMatch(
				new TagNameFilter("div")).toNodeArray();

		for (int i = 0; i < divs.length; i++) {
			NodeList children = divs[i].getChildren();
			if (children == null) {
				// Empty div: nothing to search.
				continue;
			}
			NodeList tables = children.extractAllNodesThatMatch(tableFilter);
			for (int m = 0; m < tables.size(); m++) {
				if (tableContainsLink((TableTag) tables.elementAt(m), name)) {
					// The wanted block is the NEXT div; guard the array end.
					return (i + 1 < divs.length) ? divs[i + 1] : null;
				}
			}
		}
		return null;
	}

	/**
	 * Tells whether any cell of the table directly contains a link whose
	 * target equals {@code href}, ignoring case.
	 */
	private boolean tableContainsLink(TableTag table, String href) {
		for (int n = 0; n < table.getChildCount(); n++) {
			Node rowNode = table.childAt(n);
			if (!(rowNode instanceof TableRow)) {
				continue;
			}
			TableRow row = (TableRow) rowNode;
			for (int a = 0; a < row.getChildCount(); a++) {
				Node cellNode = row.childAt(a);
				if (!(cellNode instanceof TableColumn)) {
					continue;
				}
				TableColumn cell = (TableColumn) cellNode;
				for (int b = 0; b < cell.getChildCount(); b++) {
					Node inner = cell.childAt(b);
					if (inner instanceof LinkTag) {
						String link = ((LinkTag) inner).getLink();
						if (link != null && link.equalsIgnoreCase(href)) {
							return true;
						}
					}
				}
			}
		}
		return false;
	}

	/**
	 * Collects the bullet entries of the first bullet list that is a direct
	 * child of the given node.
	 * <p>
	 * Each bullet becomes a three-element array: slots 0 and 1 hold the first
	 * two text nodes of the bullet, slot 2 holds a table found in the bullet
	 * (the last one if there are several). Missing parts stay {@code null}.
	 * 
	 * @param node
	 *            div node to search
	 * @return the entries of the first bullet list, or {@code null} when the
	 *         node is {@code null}, has no children or has no bullet list
	 */
	private ArrayList<Node[]> getBulList(Node node) {
		if (node == null) {
			return null;
		}
		NodeList inDivNodelist = node.getChildren();
		if (inDivNodelist == null) {
			return null;
		}
		NodeList blNodeList = inDivNodelist
				.extractAllNodesThatMatch(new NodeClassFilter(BulletList.class));

		for (int i = 0; i < blNodeList.size(); i++) {
			Node liNode = blNodeList.elementAt(i);
			if (!(liNode instanceof BulletList)) {
				continue;
			}
			BulletList blist = (BulletList) liNode;
			ArrayList<Node[]> blArrayList = new ArrayList<Node[]>();
			for (int j = 0; j < blist.getChildCount(); j++) {
				Node inLiNode = blist.getChild(j);
				if (!(inLiNode instanceof Bullet)) {
					continue;
				}
				Bullet bulnode = (Bullet) inLiNode;
				Node[] attrs = new Node[3];
				int index = 0;
				for (int n = 0; n < bulnode.getChildCount(); n++) {
					Node part = bulnode.childAt(n);
					if ((part instanceof TextNode) && index < 2) {
						attrs[index++] = part;
					} else if (part instanceof TableTag) {
						attrs[2] = part;
					}
				}
				blArrayList.add(attrs);
			}
			// Only the first bullet list is reported.
			return blArrayList;
		}
		return null;
	}

	/**
	 * Finds the row whose first cell contains a link with the given text.
	 * <p>
	 * Every table of the document is searched; the link text is compared
	 * ignoring case.
	 * <p>
	 * NOTE(review): the returned index is relative to the table that holds
	 * the match, whereas {@link #getTableValue(int, int)} always reads from
	 * the first table - confirm that the documents contain a single table.
	 * 
	 * @param name
	 *            link text to search for
	 * @return zero-based row index inside the matching table, or -1 when no
	 *         row matches
	 * @throws Exception
	 *             if the document cannot be parsed
	 */
	public int getTblNameNo(String name) throws Exception {
		Parser parser = Parser.createParser(content, zhCode);
		OrFilter lastFilter = new OrFilter();
		lastFilter.setPredicates(new NodeFilter[] { new NodeClassFilter(
				TableTag.class) });
		Node[] nodes = parser.parse(lastFilter).toNodeArray();

		for (Node node : nodes) {
			if (!(node instanceof TableTag)) {
				continue;
			}
			TableTag tablenode = (TableTag) node;
			int count = tablenode.getRowCount();
			for (int m = 0; m < count; m++) {
				TableColumn[] tcolumns = tablenode.getRow(m).getColumns();
				if (tcolumns.length == 0) {
					// Row without any cell in getColumns(): indexing [0]
					// would throw, so skip it.
					continue;
				}
				for (Node colHref : tcolumns[0].getChildrenAsNodeArray()) {
					if (colHref instanceof LinkTag) {
						String tmpname = ((LinkTag) colHref).getLinkText();
						// Does the text of this first-column link match?
						if (tmpname != null && tmpname.equalsIgnoreCase(name)) {
							return m;
						}
					}
				}
			}
		}
		return -1;
	}

	/**
	 * Returns the encoded content of one cell of the first table in the
	 * document.
	 * <p>
	 * Every child node of the cell contributes one segment, each terminated
	 * by {@code '@'}: a link yields {@code "Name:<link text>@Link:<href>@"}
	 * (href trimmed), any other node yields {@code "Other:<text>@"} (text
	 * trimmed).
	 * 
	 * @param row
	 *            zero-based row index
	 * @param col
	 *            zero-based column index
	 * @return the encoded cell text (empty for a cell without children), or
	 *         {@code null} when the document has no table or the row/column
	 *         does not exist
	 * @throws Exception
	 *             if the document cannot be parsed
	 */
	public String getTableValue(int row, int col) throws Exception {
		if (row < 0 || col < 0) {
			return null;
		}
		Parser parser = Parser.createParser(content, zhCode);
		OrFilter lastFilter = new OrFilter();
		lastFilter.setPredicates(new NodeFilter[] { new NodeClassFilter(
				TableTag.class) });
		Node[] nodes = parser.parse(lastFilter).toNodeArray();

		for (Node node : nodes) {
			if (!(node instanceof TableTag)) {
				continue;
			}
			// Only the first table is consulted.
			TableTag tablenode = (TableTag) node;
			TableRow trow = tablenode.getRow(row);
			if (trow == null) {
				// readTable() in this class guards against the same null.
				return null;
			}
			TableColumn[] tcolumns = trow.getColumns();
			if (col >= tcolumns.length) {
				return null;
			}

			StringBuilder rowtext = new StringBuilder();
			for (Node colHref : tcolumns[col].getChildrenAsNodeArray()) {
				if (colHref instanceof LinkTag) {
					LinkTag link = (LinkTag) colHref;
					String cline = link.getLink().trim();
					// '@' separates the values; callers split on it later.
					rowtext.append("Name:").append(link.getLinkText())
							.append("@");
					rowtext.append("Link:").append(cline).append("@");
				} else {
					String tx = colHref.getText().trim();
					rowtext.append("Other:").append(tx).append("@");
				}
			}
			return rowtext.toString();
		}
		return null;
	}

	/**
	 * Collects the encoded content of every cell of one row, for each table
	 * of the document that has such a row.
	 * <p>
	 * Cells are encoded as in {@link #getTableValue(int, int)} but without
	 * trimming, and each encoded cell is also printed to standard output.
	 * 
	 * @param row
	 *            zero-based row index
	 * @return the encoded cell texts in table and column order; empty when no
	 *         table has the row
	 * @throws Exception
	 *             if the document cannot be parsed
	 */
	public ArrayList<String> getTableValueList(int row) throws Exception {
		ArrayList<String> valuelist = new ArrayList<String>();
		if (row < 0) {
			return valuelist;
		}
		Parser parser = Parser.createParser(content, zhCode);
		OrFilter lastFilter = new OrFilter();
		lastFilter.setPredicates(new NodeFilter[] { new NodeClassFilter(
				TableTag.class) });
		Node[] nodes = parser.parse(lastFilter).toNodeArray();

		for (Node node : nodes) {
			if (!(node instanceof TableTag)) {
				continue;
			}
			TableRow trow = ((TableTag) node).getRow(row);
			if (trow == null) {
				// This table is shorter than requested; readTable() in this
				// class skips such tables the same way.
				continue;
			}
			for (TableColumn tcolumn : trow.getColumns()) {
				StringBuilder rowtext = new StringBuilder();
				for (Node colHref : tcolumn.getChildrenAsNodeArray()) {
					if (colHref instanceof LinkTag) {
						LinkTag link = (LinkTag) colHref;
						rowtext.append("Name:").append(link.getLinkText())
								.append("@");
						rowtext.append("Link:").append(link.getLink())
								.append("@");
					} else {
						rowtext.append("Other:").append(colHref.getText())
								.append("@");
					}
				}
				String encoded = rowtext.toString();
				print(encoded);
				valuelist.add(encoded);
			}
		}
		return valuelist;
	}

	/**
	 * Prints one row of every table in the document to standard output.
	 * <p>
	 * For each table that has the row, every cell is printed between a
	 * column header line and a separator line, followed by the table's row
	 * count. Tables for which the row lookup yields {@code null} are skipped
	 * entirely.
	 * 
	 * @param row
	 *            zero-based row index
	 * @throws Exception
	 *             if the document cannot be parsed
	 */
	public void readTable(int row) throws Exception {
		Parser tableParser = Parser.createParser(content, zhCode);
		OrFilter onlyTables = new OrFilter();
		onlyTables.setPredicates(new NodeFilter[] { new NodeClassFilter(
				TableTag.class) });
		Node[] found = tableParser.parse(onlyTables).toNodeArray();

		for (Node candidate : found) {
			if (!(candidate instanceof TableTag)) {
				continue;
			}
			TableTag table = (TableTag) candidate;
			TableRow wanted = table.getRow(row);
			if (wanted == null) {
				continue;
			}
			TableColumn[] cells = wanted.getColumns();
			for (int c = 0; c < cells.length; c++) {
				print("--------Col: " + c + " ---------");
				StringBuilder cellText = new StringBuilder();
				for (Node part : cells[c].getChildrenAsNodeArray()) {
					if (part instanceof LinkTag) {
						LinkTag anchor = (LinkTag) part;
						String target = anchor.getLink();
						cellText.append("Name: ").append(anchor.getLinkText())
								.append(" ");
						cellText.append("Link: ").append(target).append(" ");
					} else {
						cellText.append("Other Text: ").append(part.getText())
								.append(" ");
					}
				}
				print(cellText.toString());
				print("-------------------");
			}
			print("Count is: " + table.getRowCount());
		}
	}

	/**
	 * Finds a table by the text of its heading.
	 * <p>
	 * Only rows with exactly one cell are examined. If such a cell contains a
	 * heading tag, each child of the heading is compared with {@code name}
	 * (ignoring case) and the enclosing table is returned on the first match.
	 * Headings that do not match are printed to standard output.
	 * 
	 * @param name
	 *            heading text to search for
	 * @return the first table with a matching heading, or {@code null}
	 * @throws ParserException
	 *             if the document cannot be parsed
	 */
	private TableTag getArrayTableNode(String name) throws ParserException {
		Parser tableParser = Parser.createParser(content, zhCode);
		OrFilter onlyTables = new OrFilter();
		onlyTables.setPredicates(new NodeFilter[] { new NodeClassFilter(
				TableTag.class) });
		Node[] found = tableParser.parse(onlyTables).toNodeArray();

		for (Node candidate : found) {
			if (!(candidate instanceof TableTag)) {
				continue;
			}
			TableTag table = (TableTag) candidate;
			int rowTotal = table.getRowCount();
			for (int r = 0; r < rowTotal; r++) {
				TableColumn[] cells = table.getRow(r).getColumns();
				// Data rows have two or more cells; only single-cell rows can
				// carry the heading.
				if (cells.length != 1) {
					continue;
				}
				for (Node part : cells[0].getChildrenAsNodeArray()) {
					if (!(part instanceof HeadingTag)) {
						continue;
					}
					NodeList headingParts = part.getChildren();
					if (headingParts != null) {
						for (int h = 0; h < headingParts.size(); h++) {
							String headingText = headingParts.elementAt(h)
									.getText();
							if (headingText.equalsIgnoreCase(name)) {
								return table;
							}
						}
					}
					System.out.println("tblheadname is: "
							+ part.getText().trim());
				}
			}
		}
		return null;
	}

	/**
	 * Reads the data rows of a member table.
	 * <p>
	 * Rows with fewer than two cells are skipped. For every other row a
	 * three-element array is produced: [0] the text of the first cell, [1]
	 * the text of the second cell, [2] the target of a link found in the
	 * second cell ({@code null} if there is none). The text of a cell is the
	 * concatenation of its child nodes: link text trimmed; other text with
	 * every {@code spaceTag} occurrence removed, or trimmed when it contains
	 * none. A cell without child nodes leaves its slot {@code null}.
	 * 
	 * @param tablenode
	 *            table to read; may be {@code null}
	 * @return one array per data row; empty when the table is {@code null}
	 * @throws Exception
	 *             declared for callers
	 */
	private ArrayList<String[]> getAttrsList(TableTag tablenode)
			throws Exception {
		ArrayList<String[]> attrlist = new ArrayList<String[]>();
		if (tablenode == null) {
			return attrlist;
		}
		final int attrColSize = 2; // number of data columns read per row
		// NOTE(review): this is a plain space here; it may originally have
		// been an HTML entity such as "&nbsp;" - confirm against the source.
		final String spaceTag = " ";
		final int linkSlot = attrColSize; // last slot holds the link target

		int rowcount = tablenode.getRowCount();
		for (int i = 0; i < rowcount; i++) {
			TableColumn[] tcolumns = tablenode.getRow(i).getColumns();
			if (tcolumns.length < attrColSize) {
				// Fewer than two cells: not a data row.
				continue;
			}
			String[] values = new String[attrColSize + 1];
			for (int col = 0; col < attrColSize; col++) {
				for (Node colHref : tcolumns[col].getChildrenAsNodeArray()) {
					String piece;
					if (colHref instanceof LinkTag) {
						LinkTag link = (LinkTag) colHref;
						piece = link.getLinkText().trim();
						// Only the last data column feeds the link slot. The
						// previous code also stored a first-column link in
						// values[1], which then got the second column's text
						// appended to it.
						if (col == attrColSize - 1) {
							values[linkSlot] = link.getLink();
						}
					} else {
						String raw = colHref.getText();
						piece = raw.contains(spaceTag) ? raw.replace(spaceTag,
								"") : raw.trim();
					}
					// A cell may consist of several nodes: concatenate them.
					values[col] = (values[col] == null) ? piece : values[col]
							+ piece;
				}
			}
			attrlist.add(values);
		}
		return attrlist;
	}

	/**
	 * Reads the data rows of every table in the document.
	 * <p>
	 * Rows with two or more cells yield a two-element array holding the text
	 * of the first two cells. The text of a cell is the concatenation of its
	 * child nodes: link text trimmed; other text with every {@code spaceTag}
	 * occurrence removed, or trimmed when it contains none. A cell without
	 * child nodes leaves its slot {@code null}. Rows with exactly one cell
	 * are treated as headings: the text of each heading tag in that cell is
	 * printed to standard output and the row is skipped.
	 * 
	 * @return one array per data row, in document order
	 * @throws Exception
	 *             if the document cannot be parsed
	 */
	public ArrayList<String[]> getAttrsList() throws Exception {
		final int attrColSize = 2; // member tables are limited to two columns
		// NOTE(review): this is a plain space here; it may originally have
		// been an HTML entity such as "&nbsp;" - confirm against the source.
		final String spaceTag = " ";
		ArrayList<String[]> attrlist = new ArrayList<String[]>();

		Parser parser = Parser.createParser(content, zhCode);
		OrFilter lastFilter = new OrFilter();
		lastFilter.setPredicates(new NodeFilter[] { new NodeClassFilter(
				TableTag.class) });
		Node[] nodes = parser.parse(lastFilter).toNodeArray();

		for (Node node : nodes) {
			if (!(node instanceof TableTag)) {
				continue;
			}
			TableTag tablenode = (TableTag) node;
			int count = tablenode.getRowCount();
			for (int m = 0; m < count; m++) {
				TableColumn[] tcolumns = tablenode.getRow(m).getColumns();
				if (tcolumns.length < attrColSize) {
					// Not a data row. A single-cell row may carry the table
					// heading, which is only reported. (The previous code
					// also read an undeclared variable here and therefore
					// did not compile.)
					if (tcolumns.length == 1) {
						for (Node colHref : tcolumns[0]
								.getChildrenAsNodeArray()) {
							if (colHref instanceof HeadingTag) {
								System.out.println("tblheadname is: "
										+ colHref.getText().trim());
							}
						}
					}
					continue;
				}

				String[] values = new String[attrColSize];
				for (int col = 0; col < attrColSize; col++) {
					for (Node colHref : tcolumns[col].getChildrenAsNodeArray()) {
						String piece;
						if (colHref instanceof LinkTag) {
							piece = ((LinkTag) colHref).getLinkText().trim();
						} else {
							String raw = colHref.getText();
							piece = raw.contains(spaceTag) ? raw.replace(
									spaceTag, "") : raw.trim();
						}
						// A cell may consist of several nodes: concatenate.
						values[col] = (values[col] == null) ? piece
								: values[col] + piece;
					}
				}
				attrlist.add(values);
			}
		}
		return attrlist;
	}

	/**
	 * Tells whether a string is empty once leading and trailing whitespace is
	 * removed.
	 * 
	 * @param astr
	 *            string to test; may be {@code null}
	 * @return {@code true} for {@code null}, the empty string, or a string
	 *         whose trimmed form is blank
	 */
	private boolean isTrimEmpty(String astr) {
		boolean missing = (astr == null) || (astr.length() == 0);
		return missing || isBlank(astr.trim());
	}

	/**
	 * Parses the loaded content as a standard HTML page and prints its title
	 * followed by the trimmed text of its body.
	 * 
	 * @throws Exception
	 *             if the document cannot be parsed
	 */
	public void readByHtml() throws Exception {
		Parser pageParser = Parser.createParser(content, zhCode);
		HtmlPage page = new HtmlPage(pageParser);
		pageParser.visitAllNodesWith(page);

		print(page.getTitle());

		NodeList body = page.getBody();
		String bodyText = body.asString().trim();
		print(bodyText);
	}

	/**
	 * Prints the text nodes, link targets, titles and tables of the loaded
	 * content, one node per line.
	 * <p>
	 * For a table its row count is printed first. A node whose line is empty
	 * after trimming is not printed.
	 * 
	 * @throws Exception
	 *             if the document cannot be parsed
	 */
	public void readTextAndLinkAndTitle() throws Exception {
		Parser nodeParser = Parser.createParser(content, zhCode);
		NodeFilter[] wanted = new NodeFilter[] {
				new NodeClassFilter(TextNode.class), // plain text
				new NodeClassFilter(LinkTag.class), // hyperlinks
				new NodeClassFilter(TitleTag.class), // page title
				new NodeClassFilter(TableTag.class) // tables
		};
		OrFilter anyWanted = new OrFilter();
		anyWanted.setPredicates(wanted);
		Node[] found = nodeParser.parse(anyWanted).toNodeArray();

		// Declared outside the loop: a node that matches none of the branches
		// below would reuse the previous line.
		String line = "";
		for (Node current : found) {
			if (current instanceof TextNode) {
				line = ((TextNode) current).getText();
			} else if (current instanceof LinkTag) {
				line = ((LinkTag) current).getLink();
			} else if (current instanceof TitleTag) {
				line = ((TitleTag) current).getTitle();
			} else if (current instanceof TableTag) {
				TableTag table = (TableTag) current;
				line = table.getText();
				print("Count is: " + table.getRowCount());
			}

			if (!isTrimEmpty(line)) {
				print(line);
			}
		}
	}

	/**
	 * Tells whether a string is {@code null} or has length zero.
	 * 
	 * @param astr
	 *            string to test; may be {@code null}
	 * @return {@code true} when there is no string or it has no characters
	 */
	private boolean isBlank(String astr) {
		return astr == null || astr.length() == 0;
	}

	/**
	 * Writes one line to standard output.
	 * 
	 * @param info
	 *            text to print
	 */
	private void print(String info) {
		java.io.PrintStream out = System.out;
		out.println(info);
	}

}
purplum
关注
0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Html文件解析操作接口

import java.io.BufferedReader;import java.io.InputStream;import java.io.InputStreamReader;import java.net.URL;import java.net.URLConnection;import java.util.ArrayList;import java.util.Iterator;
复制链接

扫一扫