<script language="JavaScript" type="text/javascript"> /* Sets the CSS font size of the element whose id is 'zoom' to the given number of pixels. */ function doZoom(size){ document.getElementById('zoom').style.fontSize=size+'px'; } </script>
先提出问题: 大家都知道,Windows XP 的默认编码方式是GBK, 验证如下:
Charset defaultCharset=java.nio.charset.Charset.defaultCharset();
String name=defaultCharset.name(); //输出GBK
也可以从另一个编码的角度来验证:
String testStr = "中国";
byte[] bytes = testStr.getBytes(); //[-42, -48, -71, -6]
byte[] gbkbytes = testStr.getBytes("GBK"); //[-42, -48, -71, -6]
两者是相同的! 这样,问题就来了,如果我想对字符进行UTF-8格式的编码,怎么做? 下面就讨论这个问题。 JDK本身没有提供对字符串进行编码的接口,所有编码相关的类都位于sun.io.*;和java.nio.charset;中.
其实,有一个私有类被隐藏在JDK中,那就是下面要介绍的对String进行编码、解码的工具类:
java.lang.StringCoding;
它是一个内部类,所有的方法都是默认(包级)访问的,意味着只有在java.lang包中的程序才可见它。
直接把这个类复制过来,然后把类及其方法的访问权限改为public(该类的源码附后)
比如要对一个字符串,进行UTF-8编码 String testStr = "中国";
//转换为char数组
char[] defaultChars = {'中','国'};
//用UTF-8进行编码(encode) byte[] utfbytes = StringCoding.encode("UTF-8", defaultChars, 0, defaultChars.length); //用UTF-8进行解码(decode) char[] utfChars=StringCoding.decode("UTF-8", utfbytes, 0,utfbytes.length);
//将转换编码后的字符串打印出来 String utfStr=Arrays.toString(utfChars);
附源文件 StringCoding:
/* * @(#)StringCoding.java 1.13 03/12/19 * * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms. */
package java.lang;
import java.io.CharConversionException; import java.io.UnsupportedEncodingException; import java.lang.ref.SoftReference; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.BufferOverflowException; import java.nio.BufferUnderflowException; import java.nio.charset.Charset; import java.nio.charset.CharsetDecoder; import java.nio.charset.CharsetEncoder; import java.nio.charset.CharacterCodingException; import java.nio.charset.CoderResult; import java.nio.charset.CodingErrorAction; import java.nio.charset.IllegalCharsetNameException; import java.nio.charset.MalformedInputException; import java.nio.charset.UnsupportedCharsetException; import sun.io.ByteToCharConverter; import sun.io.CharToByteConverter; import sun.io.Converters; import sun.misc.MessageUtils; import sun.nio.cs.HistoricallyNamedCharset;
/** * Utility class for string encoding and decoding. */
class StringCoding {
// Private no-arg constructor; nothing in this class calls it, and every
// member below is static.
private StringCoding() { }

/* The cached coders for each thread */
// Each ThreadLocal stores a SoftReference to the most recently used coder
// (see set() and deref() in this class): "decoder" caches a StringDecoder,
// "encoder" caches a StringEncoder.
private static ThreadLocal decoder = new ThreadLocal();
private static ThreadLocal encoder = new ThreadLocal();

// True until warnUnsupportedCharset(String) has printed its warning once;
// that method then clears it so the warning is not repeated.
private static boolean warnUnsupportedCharset = true;
/**
 * Returns the object cached in the given thread-local slot.
 *
 * The slot is expected to hold a SoftReference (as stored by set()).
 * Yields null when the slot is empty or the referent has been cleared.
 */
private static Object deref(ThreadLocal tl) {
    SoftReference ref = (SoftReference) tl.get();
    return (ref != null) ? ref.get() : null;
}
/**
 * Stores the given object in the thread-local slot, wrapped in a
 * SoftReference so that deref() can later unwrap it.
 */
private static void set(ThreadLocal tl, Object ob) {
    SoftReference holder = new SoftReference(ob);
    tl.set(holder);
}
// Trim the given byte array to the given length // private static byte[] trim(byte[] ba, int len) { if (len == ba.length) return ba; byte[] tba = new byte[len]; System.arraycopy(ba, 0, tba, 0, len); return tba; }
// Trim the given char array to the given length // private static char[] trim(char[] ca, int len) { if (len == ca.length) return ca; char[] tca = new char[len]; System.arraycopy(ca, 0, tca, 0, len); return tca; }
/**
 * Resolves a charset name to a Charset.
 *
 * Returns null when Charset.isSupported reports the name as unsupported.
 * An IllegalCharsetNameException raised for a syntactically illegal name
 * is not caught here and propagates to the caller.
 */
private static Charset lookupCharset(String csn) {
    if (!Charset.isSupported(csn)) {
        return null;
    }
    try {
        return Charset.forName(csn);
    } catch (UnsupportedCharsetException x) {
        // isSupported() just said yes, so this indicates an inconsistency
        throw new Error(x);
    }
}
private static void warnUnsupportedCharset(String csn) { if (warnUnsupportedCharset) { // Use sun.misc.MessageUtils rather than the Logging API or // System.err since this method may be called during VM // initialization before either is available. MessageUtils.err("WARNING: Default charset " + csn + " not supported, using ISO-8859-1 instead"); warnUnsupportedCharset = false; } }
// -- Decoding --
// Encapsulates either a ByteToCharConverter or a CharsetDecoder // private static abstract class StringDecoder { private final String requestedCharsetName; protected StringDecoder(String requestedCharsetName) { this.requestedCharsetName = requestedCharsetName; } final String requestedCharsetName() { return requestedCharsetName; } abstract String charsetName(); abstract char[] decode(byte[] ba, int off, int len); }
// A string decoder based upon a ByteToCharConverter // private static class ConverterSD extends StringDecoder { private ByteToCharConverter btc;
private ConverterSD(ByteToCharConverter btc, String rcn) { super(rcn); this.btc = btc; }
String charsetName() { return btc.getCharacterEncoding(); }
char[] decode(byte[] ba, int off, int len) { int en = btc.getMaxCharsPerByte() * len; char[] ca = new char[en]; if (len == 0) return ca; btc.reset(); int n = 0; try { n = btc.convert(ba, off, off + len, ca, 0, en); n += btc.flush(ca, btc.nextCharIndex(), en); } catch (CharConversionException x) { // Yes, this is what we've always done n = btc.nextCharIndex(); } return trim(ca, n); }
}
// A string decoder based upon a CharsetDecoder // private static class CharsetSD extends StringDecoder { private final Charset cs; private final CharsetDecoder cd;
private CharsetSD(Charset cs, String rcn) { super(rcn); this.cs = cs; this.cd = cs.newDecoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); }
String charsetName() { if (cs instanceof HistoricallyNamedCharset) return ((HistoricallyNamedCharset)cs).historicalName(); return cs.name(); }
char[] decode(byte[] ba, int off, int len) { int en = (int)(cd.maxCharsPerByte() * len); char[] ca = new char[en]; if (len == 0) return ca; cd.reset(); ByteBuffer bb = ByteBuffer.wrap(ba, off, len); CharBuffer cb = CharBuffer.wrap(ca); try { CoderResult cr = cd.decode(bb, cb, true); if (!cr.isUnderflow()) cr.throwException(); cr = cd.flush(cb); if (!cr.isUnderflow()) cr.throwException(); } catch (CharacterCodingException x) { // Substitution is always enabled, // so this shouldn't happen throw new Error(x); } return trim(ca, cb.position()); }
}
static char[] decode(String charsetName, byte[] ba, int off, int len) throws UnsupportedEncodingException { StringDecoder sd = (StringDecoder)deref(decoder); String csn = (charsetName == null) ? "ISO-8859-1" : charsetName; if ((sd == null) || !(csn.equals(sd.requestedCharsetName()) || csn.equals(sd.charsetName()))) { sd = null; try { Charset cs = lookupCharset(csn); if (cs != null) sd = new CharsetSD(cs, csn); else sd = null; } catch (IllegalCharsetNameException x) { // FALL THROUGH to ByteToCharConverter, for compatibility } if (sd == null) sd = new ConverterSD(ByteToCharConverter.getConverter(csn), csn); set(decoder, sd); } return sd.decode(ba, off, len); }
static char[] decode(byte[] ba, int off, int len) { String csn = Converters.getDefaultEncodingName(); try { return decode(csn, ba, off, len); } catch (UnsupportedEncodingException x) { Converters.resetDefaultEncodingName(); warnUnsupportedCharset(csn); } try { return decode("ISO-8859-1", ba, off, len); } catch (UnsupportedEncodingException x) { // If this code is hit during VM initialization, MessageUtils is // the only way we will be able to get any kind of error message. MessageUtils.err("ISO-8859-1 charset not available: " + x.toString()); // If we can not find ISO-8859-1 (a required encoding) then things // are seriously wrong with the installation. System.exit(1); return null; } }
// -- Encoding --
// Encapsulates either a CharToByteConverter or a CharsetEncoder // private static abstract class StringEncoder { private final String requestedCharsetName; protected StringEncoder(String requestedCharsetName) { this.requestedCharsetName = requestedCharsetName; } final String requestedCharsetName() { return requestedCharsetName; } abstract String charsetName(); abstract byte[] encode(char[] cs, int off, int len); }
// A string encoder based upon a CharToByteConverter // private static class ConverterSE extends StringEncoder { private CharToByteConverter ctb;
private ConverterSE(CharToByteConverter ctb, String rcn) { super(rcn); this.ctb = ctb; }
String charsetName() { return ctb.getCharacterEncoding(); }
byte[] encode(char[] ca, int off, int len) { int en = ctb.getMaxBytesPerChar() * len; byte[] ba = new byte[en]; if (len == 0) return ba;
ctb.reset(); int n; try { n = ctb.convertAny(ca, off, (off + len), ba, 0, en); n += ctb.flushAny(ba, ctb.nextByteIndex(), en); } catch (CharConversionException x) { throw new Error("Converter malfunction: " + ctb.getClass().getName(), x); } return trim(ba, n); }
}
// A string encoder based upon a CharsetEncoder // private static class CharsetSE extends StringEncoder { private Charset cs; private CharsetEncoder ce;
private CharsetSE(Charset cs, String rcn) { super(rcn); this.cs = cs; this.ce = cs.newEncoder() .onMalformedInput(CodingErrorAction.REPLACE) .onUnmappableCharacter(CodingErrorAction.REPLACE); }
String charsetName() { if (cs instanceof HistoricallyNamedCharset) return ((HistoricallyNamedCharset)cs).historicalName(); return cs.name(); }
byte[] encode(char[] ca, int off, int len) { int en = (int)(ce.maxBytesPerChar() * len); byte[] ba = new byte[en]; if (len == 0) return ba;
ce.reset(); ByteBuffer bb = ByteBuffer.wrap(ba); CharBuffer cb = CharBuffer.wrap(ca, off, len); try { CoderResult cr = ce.encode(cb, bb, true); if (!cr.isUnderflow()) cr.throwException(); cr = ce.flush(bb); if (!cr.isUnderflow()) cr.throwException(); } catch (CharacterCodingException x) { // Substitution is always enabled, // so this shouldn't happen throw new Error(x); } return trim(ba, bb.position()); }
}
static byte[] encode(String charsetName, char[] ca, int off, int len) throws UnsupportedEncodingException { StringEncoder se = (StringEncoder)deref(encoder); String csn = (charsetName == null) ? "ISO-8859-1" : charsetName; if ((se == null) || !(csn.equals(se.requestedCharsetName()) || csn.equals(se.charsetName()))) { se = null; try { Charset cs = lookupCharset(csn); if (cs != null) se = new CharsetSE(cs, csn); } catch (IllegalCharsetNameException x) { // FALL THROUGH to CharToByteConverter, for compatibility } if (se == null) se = new ConverterSE(CharToByteConverter.getConverter(csn), csn); set(encoder, se); } return se.encode(ca, off, len); }
static byte[] encode(char[] ca, int off, int len) { String csn = Converters.getDefaultEncodingName(); try { return encode(csn, ca, off, len); } catch (UnsupportedEncodingException x) { Converters.resetDefaultEncodingName(); warnUnsupportedCharset(csn); } try { return encode("ISO-8859-1", ca, off, len); } catch (UnsupportedEncodingException x) { // If this code is hit during VM initialization, MessageUtils is // the only way we will be able to get any kind of error message. MessageUtils.err("ISO-8859-1 charset not available: " + x.toString()); // If we can not find ISO-8859-1 (a required encoding) then things // are seriously wrong with the installation. System.exit(1); return null; } }
}