/*
 * JTidy: converting HTML to XML.
 *
 * The two approaches below are independent source files (TestDOM.java and
 * Test17.java), since each declares its own package and public class.
 *
 * Approach 1 (TestDOM): parse the HTML into a DOM with Tidy and print the tree
 * by hand. Originally reported as unable to get rid of garbled characters.
 * The writer below now encodes its output with the same charset that the XML
 * declaration announces; the original used the platform default charset.
 * NOTE(review): if characters are still garbled, check the input encoding
 * configured in tidy.properties - it is not visible from this file.
 */
package spide;

import java.io.PrintWriter;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;

import org.w3c.dom.Attr;
import org.w3c.dom.Document;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.w3c.tidy.Tidy;

import java.io.FileOutputStream;

/**
 * A sample DOM writer. This sample program illustrates how to
 * traverse a DOM tree in order to print a document that is parsed.
 */
public class TestDOM {

    /** File the generated XML is written to. */
    private static final String OUTPUT_PATH = "D:/test.xml";

    /** Charset used to encode the output and announced in the XML declaration. */
    private static final String ENCODING = "GBK";

    /** Writer for the generated XML; opened by the constructor. */
    protected PrintWriter out;

    /**
     * Opens {@code D:/test.xml} for writing, encoding characters as GBK.
     *
     * @throws IllegalStateException if the file cannot be opened or the
     *         encoding is not supported by this JVM
     */
    public TestDOM() {
        FileOutputStream outxml = null;
        try {
            outxml = new FileOutputStream(OUTPUT_PATH);
            // Encode explicitly: PrintWriter(OutputStream) would use the platform
            // default charset, which need not match the declared encoding.
            out = new PrintWriter(new OutputStreamWriter(outxml, ENCODING));
        } catch (IOException e) {
            if (outxml != null) {
                try {
                    outxml.close();
                } catch (IOException ignored) {
                    // Already failing; the original cause is reported below.
                }
            }
            throw new IllegalStateException(
                    "Cannot open " + OUTPUT_PATH + " for writing as " + ENCODING, e);
        }
    }

    /**
     * Prints the specified node, recursively.
     *
     * <p>Document, element and text nodes are written; every other node type
     * is skipped. A {@code null} node is ignored. The writer is flushed before
     * this method returns.
     *
     * <p>NOTE(review): text and attribute values are written exactly as the DOM
     * returns them, without XML escaping. Whether escaping is needed depends on
     * what the DOM implementation returns - confirm before adding it.
     *
     * @param node the node to print, may be {@code null}
     */
    public void print(Node node) {
        if (node == null) {
            return;
        }

        switch (node.getNodeType()) {
            case Node.DOCUMENT_NODE:
                out.println("<?xml version=\"1.0\" encoding=\"" + ENCODING + "\"?>");
                print(((Document) node).getDocumentElement());
                break;

            case Node.ELEMENT_NODE:
                printElement(node);
                break;

            case Node.TEXT_NODE:
                out.print(node.getNodeValue());
                break;

            default:
                // Other node types are not written.
                break;
        }

        out.flush();
    }

    /** Writes an element's start tag, its children and its end tag. */
    private void printElement(Node element) {
        String name = element.getNodeName();

        out.print('<');
        out.print(name);
        NamedNodeMap attrs = element.getAttributes();
        if (attrs != null) {
            for (int i = 0; i < attrs.getLength(); i++) {
                Node attr = attrs.item(i);
                out.print(' ');
                out.print(attr.getNodeName());
                out.print("=\"");
                out.print(attr.getNodeValue());
                out.print('"');
            }
        }
        out.print('>');
        out.println(); // HACK

        NodeList children = element.getChildNodes();
        if (children != null) {
            int len = children.getLength();
            for (int i = 0; i < len; i++) {
                print(children.item(i));
            }
        }

        out.print("</");
        out.print(name);
        out.print('>');
        out.println(); // HACK
    }

    /** Flushes and closes the output file. Safe to call more than once. */
    public void close() {
        if (out != null) {
            out.close();
        }
    }

    /**
     * Parses {@code D:/speed.html} with Tidy, configured from
     * {@code D:/tidy.properties}, and prints the resulting DOM.
     *
     * @param args unused
     */
    public static void main(String args[]) {
        String conf = "D:/tidy.properties";
        Tidy tidy = new Tidy();
        tidy.setConfigurationFromFile(conf);
        TestDOM t = new TestDOM();
        FileInputStream in = null;
        try {
            in = new FileInputStream("D:/speed.html");
            tidy.setMakeClean(true);
            // NOTE(review): confirm against the JTidy docs that xml-tags is the
            // intended option for an HTML input file.
            tidy.setXmlTags(true);
            t.print(tidy.parseDOM(in, null));
        } catch (IOException e) {
            System.err.println(e.toString());
        } finally {
            if (in != null) {
                try {
                    in.close();
                } catch (IOException e) {
                    System.err.println(e.toString());
                }
            }
            t.close();
        }
    }
}

/*
 * Approach 2 (Test17): let Tidy write the converted document itself.
 * This solves the garbled characters, but parsing the result fails with
 * "White spaces are required between publicId and systemId".
 * NOTE(review): that message usually means the output contains a DOCTYPE with
 * a PUBLIC identifier but no system identifier, which XML does not allow.
 * Presumably the doctype has to be omitted or completed through
 * tidy.properties - verify against the generated file.
 */
package spide;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedWriter;
import java.io.Closeable;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;

import org.w3c.tidy.Tidy;

/**
 * Runs Tidy over one source file and writes the result, preceded by an XML
 * declaration, to an output file.
 */
public class Test17 implements Runnable {

    /** Charset named in the XML declaration and used to encode it. */
    private static final String ENCODING = "GBK";

    private String srcFileName;
    private String outFileName;
    /** Never assigned in this class; only the disabled setErrout call in run() refers to it. */
    private String errOutFileName;
    private String configFileName;

    /**
     * @param srcFileName path of the HTML file to read
     * @param outFileName path of the file to write
     * @param confName    path of the Tidy configuration file
     */
    public Test17(String srcFileName, String outFileName, String confName) {
        this.srcFileName = srcFileName;
        this.outFileName = outFileName;
        this.configFileName = confName;
    }

    /**
     * Configures Tidy, writes the XML declaration and then the tidied document
     * to the output file. I/O errors are reported on standard output; both
     * streams are closed in every case.
     */
    public void run() {
        Tidy tidy = new Tidy();
        tidy.setConfigurationFromFile(configFileName);

        BufferedInputStream in = null;
        FileOutputStream out = null;
        try {
            // Disabled: errOutFileName must be assigned before enabling this.
            // tidy.setErrout(new PrintWriter(new FileWriter(errOutFileName), true));
            in = new BufferedInputStream(new FileInputStream(srcFileName));
            out = new FileOutputStream(outFileName);

            String head = "<?xml version=\"1.0\" encoding=\"" + ENCODING + "\"?>";
            // Encode the declaration explicitly instead of with the platform default.
            byte[] bytes = head.getBytes(ENCODING);
            out.write(bytes, 0, bytes.length);

            tidy.parse(in, out);
        } catch (IOException e) {
            System.out.println(this.toString() + e.toString());
        } finally {
            closeAndReport(in);
            closeAndReport(out);
        }
    }

    /** Closes a stream if it was opened, reporting (not throwing) any failure. */
    private static void closeAndReport(Closeable stream) {
        if (stream == null) {
            return;
        }
        try {
            stream.close();
        } catch (IOException e) {
            System.out.println("Failed to close stream: " + e.toString());
        }
    }

    /**
     * Converts {@code D:/speed.html} to {@code D:/result.xml} on a separate thread.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String src = "D:/speed.html";
        String out = "D:/result.xml";
        String err = "D:/err.txt"; // not passed to Test17; no constructor parameter takes it
        String conf = "D:/tidy.properties";
        Test17 t1 = new Test17(src, out, conf);
        Thread th1 = new Thread(t1);
        th1.start();
    }
}