html转化生成dom

最新推荐文章于 2023-06-02 13:41:32 发布

Baocai3000

最新推荐文章于 2023-06-02 13:41:32 发布

阅读量2.4k

点赞数

分类专栏： JAVA、科研　文章标签： html exception permissions printing parsing file

本文链接：https://blog.csdn.net/qibaoyuan/article/details/7435676

版权

科研同时被 2 个专栏收录

72 篇文章 1 订阅

订阅专栏

JAVA

45 篇文章 0 订阅

订阅专栏

/* 
 * Copyright 2002-2009 Andy Clark, Marc Guillemot
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package sample;

import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.OutputStream;
import java.io.StringWriter;

import javax.xml.transform.OutputKeys;
import javax.xml.transform.Result;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;

import org.cyberneko.html.parsers.DOMParser;
import org.dom4j.io.DOMReader;
import org.dom4j.io.OutputFormat;
import org.dom4j.io.XMLWriter;
import org.w3c.dom.Document;
import org.w3c.dom.Node;

/**
 * Parses HTML documents with the NekoHTML DOM parser and writes each parsed
 * tree as pretty-printed XML to a <code>.xhtml</code> file. Also offers small
 * helpers for converting, inspecting and serializing W3C DOM documents.
 * 
 * @author Andy Clark
 * 
 * @version $Id: TestHTMLDOM.java,v 1.3 2004/02/19 20:00:17 andyc Exp $
 */
public class TestHTMLDOM {

	/**
	 * Number of leading characters of each command-line argument that are
	 * dropped when deriving the output path.
	 * NOTE(review): presumably the length of a "file:" style prefix - confirm
	 * against how this program is invoked.
	 */
	private static final int INPUT_PREFIX_LENGTH = 5;

	/** Encoding declared in, and used to write, the generated files. */
	private static final String OUTPUT_ENCODING = "utf-8";

	/** Suffix appended to the derived output path. */
	private static final String OUTPUT_SUFFIX = ".xhtml";

	/**
	 * Converts an <code>org.w3c.dom.Document</code> into an
	 * <code>org.dom4j.Document</code>.
	 * 
	 * @param doc
	 *            the W3C DOM document to convert; may be <code>null</code>
	 * 
	 * @throws Exception
	 *             declared for compatibility with existing callers
	 * 
	 * @return the dom4j document, or <code>null</code> when <code>doc</code>
	 *         is <code>null</code>
	 */
	public static org.dom4j.Document parse(org.w3c.dom.Document doc)
			throws Exception {
		if (doc == null) {
			return null;
		}
		DOMReader domReader = new DOMReader();

		return domReader.read(doc);
	}

	/**
	 * Main. Parses every argument with NekoHTML and writes the result to a
	 * file whose path is derived from the argument (see
	 * {@link #outputPathFor(String)}).
	 * 
	 * @param argv
	 *            the documents to convert, one per argument
	 * @throws Exception
	 *             if parsing, conversion or writing fails
	 */
	public static void main(String[] argv) throws Exception {

		// 0. Create the parser; it is reused for every argument.
		DOMParser parser = new DOMParser();

		for (int i = 0; i < argv.length; i++) {

			// 1. Let NekoHTML parse the document.
			parser.parse(argv[i]);

			// 2. Fetch the parse result.
			Document doc = parser.getDocument();

			// 3. Convert the W3C document to dom4j *before* opening the
			// output, so a failed conversion does not leave an empty file.
			org.dom4j.Document converted = parse(doc);
			if (converted == null) {
				throw new IllegalStateException(
						"Parser produced no document for " + argv[i]);
			}

			// 4. Write it next to the original name, with an .xhtml suffix.
			File file = new File(outputPathFor(argv[i]));
			OutputFormat format = OutputFormat.createPrettyPrint();
			format.setEncoding(OUTPUT_ENCODING);

			// Hand XMLWriter a byte stream so the bytes are really encoded
			// with the format's encoding; a FileWriter would silently use the
			// platform default charset and contradict the XML declaration.
			OutputStream out = new FileOutputStream(file);
			try {
				XMLWriter writer = new XMLWriter(out, format);
				writer.write(converted);
				writer.close();
			} finally {
				// Releases the file handle even when writing fails; closing
				// an already closed FileOutputStream is harmless.
				out.close();
			}
		}
	}

	/**
	 * Derives the output path for one command-line argument: the first
	 * {@link #INPUT_PREFIX_LENGTH} characters and the file extension are
	 * removed and {@link #OUTPUT_SUFFIX} is appended.
	 * 
	 * Arguments that are shorter than the prefix, or whose last path segment
	 * has no extension, are handled without throwing; in that case nothing is
	 * cut off at the end.
	 * 
	 * @param systemId
	 *            the argument that was passed to the parser
	 * @return the path of the file to write
	 */
	private static String outputPathFor(String systemId) {
		int start = Math.min(INPUT_PREFIX_LENGTH, systemId.length());
		int lastSeparator = Math.max(systemId.lastIndexOf('/'),
				systemId.lastIndexOf('\\'));
		int dot = systemId.lastIndexOf('.');

		// Only a dot inside the last path segment (and after the dropped
		// prefix) marks an extension.
		boolean hasExtension = dot >= start && dot > lastSeparator;
		int end = hasExtension ? dot : systemId.length();

		return systemId.substring(start, end) + OUTPUT_SUFFIX;
	}

	/**
	 * Prints the class name of a node and, recursively, of all its
	 * descendants to standard output, one node per line.
	 * 
	 * @param node
	 *            the node to start from
	 * @param indent
	 *            prefix printed before the class name; one space is added per
	 *            nesting level
	 */
	public static void print(Node node, String indent) {
		System.out.println(indent + node.getClass().getName());

		Node child = node.getFirstChild();
		while (child != null) {
			print(child, indent + " ");
			child = child.getNextSibling();
		}
	}

	/**
	 * Serializes a W3C document to an indented XML string.
	 * 
	 * @param doc
	 *            the document to serialize
	 * @return the XML text
	 * @throws TransformerFactoryConfigurationError
	 *             if no transformer factory can be created
	 * @throws TransformerException
	 *             if the transformer cannot be created or the transformation
	 *             fails
	 */
	public static String toString(Document doc)
			throws TransformerFactoryConfigurationError, TransformerException {
		StringWriter writer = new StringWriter();
		Transformer transformer = TransformerFactory.newInstance()
				.newTransformer();
		transformer.setOutputProperty(OutputKeys.INDENT, "yes");
		// OutputKeys.CDATA_SECTION_ELEMENTS is intentionally not set: it takes
		// a whitespace-separated list of element names, not a yes/no flag.
		transformer.setOutputProperty(
				"{http://xml.apache.org/xslt}indent-amount", "2");
		transformer.transform(new DOMSource(doc), new StreamResult(writer));
		return writer.toString();
	}

}

Baocai3000

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
html转化生成dom

/* * Copyright 2002-2009 Andy Clark, Marc Guillemot * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You
复制链接

扫一扫