html转化生成dom

45 篇文章 0 订阅
/* 
 * Copyright 2002-2009 Andy Clark, Marc Guillemot
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package sample;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.StringWriter;

import javax.xml.transform.OutputKeys;
import javax.xml.transform.Result;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.cyberneko.html.parsers.DOMParser;
import org.dom4j.io.DOMReader;
import org.dom4j.io.OutputFormat;
import org.dom4j.io.XMLWriter;
import org.w3c.dom.Document;
import org.w3c.dom.Node;

/**
 * Sample program that parses HTML documents with the NekoHTML DOM parser and
 * writes each one back out as a pretty-printed, UTF-8 encoded
 * <code>.xhtml</code> file using dom4j.
 * <p>
 * It also offers small helpers for converting a W3C DOM document to a dom4j
 * document, serializing a W3C DOM document to a string, and printing the
 * implementation class of every node in a DOM tree.
 * 
 * @author Andy Clark
 * 
 * @version $Id: TestHTMLDOM.java,v 1.3 2004/02/19 20:00:17 andyc Exp $
 */
public class TestHTMLDOM {

	/** Scheme prefix removed from an input location to obtain a local path. */
	private static final String FILE_SCHEME = "file:";

	/** Extension given to every generated output file. */
	private static final String OUTPUT_EXTENSION = ".xhtml";

	/**
	 * Converts an <code>org.w3c.dom.Document</code> into an
	 * <code>org.dom4j.Document</code>.
	 * 
	 * @param doc
	 *            the W3C DOM document to convert; may be <code>null</code>
	 * 
	 * @throws Exception
	 *             declared for backward compatibility with existing callers
	 * 
	 * @return the dom4j document built by {@link DOMReader}, or
	 *         <code>null</code> when <code>doc</code> is <code>null</code>
	 */
	public static org.dom4j.Document parse(org.w3c.dom.Document doc)
			throws Exception {
		if (doc == null) {
			return null;
		}
		return new DOMReader().read(doc);
	}

	/**
	 * Parses every location given on the command line and writes the result
	 * next to the input as an <code>.xhtml</code> file.
	 * 
	 * @param argv
	 *            the input locations (system ids) to convert
	 * 
	 * @throws Exception
	 *             if parsing, conversion or writing fails
	 */
	public static void main(String[] argv) throws Exception {

		// 0. Create the parser; the same instance handles every input.
		DOMParser parser = new DOMParser();

		for (int i = 0; i < argv.length; i++) {

			// 1. Parse the input with NekoHTML.
			parser.parse(argv[i]);

			// 2. Fetch the parsed W3C DOM document.
			Document doc = parser.getDocument();

			// 3. Convert the W3C document to dom4j and write it to a file
			// with the same base name and an .xhtml extension.
			File file = new File(toOutputPath(argv[i]));
			OutputFormat format = OutputFormat.createPrettyPrint();

			format.setEncoding("utf-8"); // avoids garbled non-ASCII text

			// Hand XMLWriter a byte stream rather than a FileWriter so the
			// bytes are encoded with the charset configured on the format; a
			// FileWriter uses the platform default charset, which may
			// disagree with the encoding named in the XML declaration.
			FileOutputStream out = new FileOutputStream(file);
			try {
				XMLWriter writer = new XMLWriter(out, format);
				writer.write(parse(doc));
				writer.close(); // flushes buffered characters to the stream
			} finally {
				// Releases the file handle even when writing fails; closing
				// an already closed stream is harmless.
				out.close();
			}
		}
	}

	/**
	 * Derives the output file path for an input location.
	 * <p>
	 * A leading <code>file:</code> prefix is removed when present, and the
	 * extension of the last path segment (if any) is replaced with
	 * <code>.xhtml</code>. Dots in directory names and a leading dot of a
	 * hidden file are not treated as extension separators.
	 * 
	 * @param source
	 *            the input location as given on the command line
	 * @return the path of the file to write
	 */
	private static String toOutputPath(String source) {
		String path = source;
		if (path.startsWith(FILE_SCHEME)) {
			path = path.substring(FILE_SCHEME.length());
		}

		int lastSeparator = Math.max(path.lastIndexOf('/'),
				path.lastIndexOf('\\'));
		int lastDot = path.lastIndexOf('.');
		if (lastDot > lastSeparator + 1) {
			path = path.substring(0, lastDot);
		}
		return path + OUTPUT_EXTENSION;
	}

	/**
	 * Prints the implementation class name of a node and, recursively, of all
	 * its descendants to standard output.
	 * 
	 * @param node
	 *            the node to start from
	 * @param indent
	 *            the prefix printed before the class name; one space is
	 *            appended for each level of depth
	 */
	public static void print(Node node, String indent) {
		System.out.println(indent + node.getClass().getName());

		for (Node child = node.getFirstChild(); child != null; child = child
				.getNextSibling()) {
			print(child, indent + " ");
		}
	}

	/**
	 * Serializes a W3C DOM document to an indented XML string.
	 * <p>
	 * No <code>CDATA_SECTION_ELEMENTS</code> property is set: it expects a
	 * list of element names rather than a boolean flag.
	 * 
	 * @param doc
	 *            the document to serialize
	 * @return the serialized XML text
	 * @throws TransformerFactoryConfigurationError
	 *             if no transformer factory implementation is available
	 * @throws TransformerException
	 *             if the transformer cannot be created or the transformation
	 *             fails
	 */
	public static String toString(Document doc)
			throws TransformerFactoryConfigurationError, TransformerException {
		Transformer transformer = TransformerFactory.newInstance()
				.newTransformer();
		transformer.setOutputProperty(OutputKeys.INDENT, "yes");
		// Implementation-specific key; NOTE(review): presumably honoured by
		// Xalan-derived transformers only - confirm for the runtime in use.
		transformer.setOutputProperty(
				"{http://xml.apache.org/xslt}indent-amount", "2");

		StringWriter buffer = new StringWriter();
		transformer.transform(new DOMSource(doc), new StreamResult(buffer));
		return buffer.toString();
	}

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值