java w3c解析xml乱码_解决JTidy HTML=>XML中文乱码 | 学步园

首先将源网页用 UTF-8 重新编码,并保存到一个新的文件中。

还要注意加上:

tidy.setInputEncoding("UTF-8");

这样转换后的中文才能正确显示。

源代码如下:

import java.net.URL;

import java.util.logging.Level;

import java.util.logging.Logger;

import java.io.*;

import org.w3c.tidy.Tidy;

public class xml {

private String url;

private String outFileName;

private String errOutFileName;

public xml(String url, String outFileName, String

errOutFileName) {

this.url = url;

this.outFileName = outFileName;

this.errOutFileName = errOutFileName;

}

public void convert() {

URL u;

BufferedInputStream in;

FileOutputStream out;

Logger log = Logger.getLogger("convert");

try {

u = new URL(url);

//Create input and output streams

in = new BufferedInputStream(u.openStream()); // 打开文件,转换为 UTF-8 编码

InputStreamReader isr = new InputStreamReader(in, "GB2312"); // 源文件编码为 gb2312

File tmpNewFile = File.createTempFile("GB2312",".html"); // 转换后的文件,设定编码为 utf-8

out = new FileOutputStream( tmpNewFile ); // 需要将文件转换为字符流

OutputStreamWriter osw = new OutputStreamWriter( out , "UTF-8"); // 指定目标编码为 utf-8

osw.write("<?xml version=/"1.0/" encoding=/"utf-8/"?>/n");

char[] buffer = new char[10240]; // 文件缓冲区

int len = 0; // 使用字符读取方式,循环读取源文件内容

while( (len = isr.read(buffer)) !=-1 ) // 转换后写入目标文件中

{

osw.write( buffer, 0, len);

}

osw.close(); // 转换完成

isr.close();

out.close();

in.close();

if( log.isLoggable( Level.INFO)){

log.info("HTML 文档转 UTF-8 编码完成!");

}

//设置tidy

Tidy tidy = new Tidy();

// Set file for error messages

tidy.setErrout(new PrintWriter(new FileWriter(errOutFileName), true));

// Tell Tidy to convert HTML to XML

tidy.setXmlOut(true);

tidy.setInputEncoding("UTF-8");

FileInputStream in0 = new FileInputStream( tmpNewFile );

FileOutputStream out0 = new FileOutputStream(outFileName);

//Convert files

tidy.parse(in0, out0);

//Clean up

in.close();

out.close();

tmpNewFile.delete(); // 删除临时文件

} catch (IOException e) {

System.out.println(this.toString() + e.toString());

}

}

public static void main(String[] args) {

/*

* Parameters are:

* URL of HTML file

* Filename of output file

* Filename of error file

*/

String u="http://www.baidu.com/";

String o="index.xml";

String e="error.xml";

xml t = new xml(u, o, e);

t.convert();

System.out.println("OK!");

}

}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值