android开发 WriteUTF与readUTF 原理

最新推荐文章于 2023-07-31 14:37:56 发布

weixin_34344677

最新推荐文章于 2023-07-31 14:37:56 发布

阅读量195

点赞数

文章标签：移动开发 java 操作系统

今晚上写代码玩，用到java.io.RandomAccessFile.writeUTF(String)函数，而文件默认保存为gbk，显然是乱码。突然想起来去看看存储编码规则，就去找了些文章了解writeUTF(String)的原理,在此记录。
首先需要弄明白Unicode与UTF-8的表示规则。搜到@Feng哥的一篇文章《字符编码笔记：ASCII，Unicode和UTF-8》，写得很明白，在此摘录一段：

| Unicode符号范围 | UTF-8编码方式 |
| --- | --- |
| 0000 0000-0000 007F | 0xxxxxxx |
| 0000 0080-0000 07FF | 110xxxxx 10xxxxxx |
| 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx |
| 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
下面，还是以汉字"严"为例，演示如何实现UTF-8编码。
已知"严"的Unicode是4E25（100111000100101），根据上表，可以发现4E25处在第三行的范围内（0000 0800-0000 FFFF），因此"严"的UTF-8编码需要三个字节，即格式是"1110xxxx 10xxxxxx 10xxxxxx"。然后，从"严"的最后一个二进制位开始，依次从后向前填入格式中的x，多出的位补0。这样就得到了，"严"的UTF-8编码是"11100100 10111000 10100101"，转换成十六进制就是E4B8A5。

也就是将4E25(100 111000 100101)依次填充到(1110xxxx 10xxxxxx 10xxxxxx)的x位置里面！

文章中还强调的一点思想就是：Unicode是为统一世界多种编码问题而制定的统一编码，而UTF-8只是Unicode的一种实现方式。

执行 System.out.println(Integer.toHexString('严')); 的打印结果为4e25，即"严"的Unicode编码。当使用RandomAccessFile.writeUTF(String)时，"严"以UTF-8格式写入文件。

下面是源码，以及我做的一些备注，以后回忆时候好用：

java.io.DataOutputStream.writeUTF(String, DataOutput)

/**
 * Writes {@code str} to {@code out} as a two-byte big-endian length prefix
 * followed by the encoded characters.
 *
 * <p>Each UTF-16 code unit of the string is encoded on its own:
 * <ul>
 *   <li>U+0001..U+007F: one byte, {@code 0xxxxxxx}</li>
 *   <li>U+0000 and U+0080..U+07FF: two bytes, {@code 110xxxxx 10xxxxxx}</li>
 *   <li>U+0800..U+FFFF: three bytes, {@code 1110xxxx 10xxxxxx 10xxxxxx}</li>
 * </ul>
 *
 * @param str the string to encode
 * @param out the destination the encoded bytes are written to
 * @return the total number of bytes written, i.e. the encoded length plus
 *         the two length-prefix bytes
 * @throws UTFDataFormatException if the encoded form is longer than 65535 bytes
 * @throws IOException if {@code out.write} fails
 */
static int writeUTF(String str, DataOutput out) throws IOException {
    final int strlen = str.length();

    // Count in a long: with up to three bytes per char an int counter can
    // wrap around for very long strings and then slip past the limit check
    // below as a negative or small positive value.
    long encodedLength = 0;
    for (int i = 0; i < strlen; i++) {
        final int c = str.charAt(i);
        if ((c >= 0x0001) && (c <= 0x007F)) {
            encodedLength++;
        } else if (c > 0x07FF) {
            encodedLength += 3;
        } else {
            encodedLength += 2;
        }
    }

    // The length prefix is only two bytes wide, so 65535 bytes is the maximum.
    if (encodedLength > 65535) {
        throw new UTFDataFormatException(
            "encoded string too long: " + encodedLength + " bytes");
    }
    final int utflen = (int) encodedLength;

    // Obtain a buffer of sufficient size; the "+ 2" leaves room for the
    // length prefix stored in the first two bytes.
    final byte[] bytearr;
    if (out instanceof DataOutputStream) {
        // Reuse the stream's scratch buffer, growing it when it is too small.
        DataOutputStream dos = (DataOutputStream) out;
        if (dos.bytearr == null || (dos.bytearr.length < (utflen + 2))) {
            dos.bytearr = new byte[(utflen * 2) + 2];
        }
        bytearr = dos.bytearr;
    } else {
        bytearr = new byte[utflen + 2];
    }

    int count = 0;
    bytearr[count++] = (byte) ((utflen >>> 8) & 0xFF);
    bytearr[count++] = (byte) ((utflen >>> 0) & 0xFF);

    // Fast path: copy the leading run of single-byte characters directly,
    // without going through the branch chain of the general loop below.
    int i = 0;
    for (; i < strlen; i++) {
        final int c = str.charAt(i);
        if (!((c >= 0x0001) && (c <= 0x007F))) {
            break;
        }
        bytearr[count++] = (byte) c;
    }

    // General path for the rest of the string, starting at the first
    // character the fast path could not handle.
    for (; i < strlen; i++) {
        final int c = str.charAt(i);
        if ((c >= 0x0001) && (c <= 0x007F)) {
            // One byte: 0xxxxxxx
            bytearr[count++] = (byte) c;
        } else if (c > 0x07FF) {
            // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
            // e.g. U+4E25 (100 111000 100101) -> 11100100 10111000 10100101
            bytearr[count++] = (byte) (0xE0 | ((c >> 12) & 0x0F));
            bytearr[count++] = (byte) (0x80 | ((c >> 6) & 0x3F));
            bytearr[count++] = (byte) (0x80 | ((c >> 0) & 0x3F));
        } else {
            // Two bytes: 110xxxxx 10xxxxxx
            bytearr[count++] = (byte) (0xC0 | ((c >> 6) & 0x1F));
            bytearr[count++] = (byte) (0x80 | ((c >> 0) & 0x3F));
        }
    }

    out.write(bytearr, 0, utflen + 2);
    return utflen + 2;
}

明白了写规则，然后就是读的规则了,反向理解就好。

java.io.DataInputStream.readUTF(DataInput)

public final static String readUTF(DataInput in) throws IOException {
    int utflen = in.readUnsignedShort(); byte[] bytearr = null; char[] chararr = null; if (in instanceof DataInputStream) { DataInputStream dis = (DataInputStream)in; if (dis.bytearr.length < utflen){ dis.bytearr = new byte[utflen*2]; dis.chararr = new char[utflen*2]; } chararr = dis.chararr; bytearr = dis.bytearr; } else { bytearr = new byte[utflen]; chararr = new char[utflen]; } int c, char2, char3; int count = 0; int chararr_count=0; in.readFully(bytearr, 0, utflen); while (count < utflen) { c = (int) bytearr[count] & 0xff; if (c > 127) break; count++; chararr[chararr_count++]=(char)c; } while (count < utflen) { c = (int) bytearr[count] & 0xff; switch (c >> 4) { case 0: case 1: case 2: case 3: case 4: case 5: case 6: case 7: /* 0xxxxxxx*/ count++; chararr[chararr_count++]=(char)c; break; case 12: case 13: /* 110x xxxx 10xx xxxx*/ count += 2; if (count > utflen) throw new UTFDataFormatException( "malformed input: partial character at end"); char2 = (int) bytearr[count-1]; if ((char2 & 0xC0) != 0x80) throw new UTFDataFormatException( "malformed input around byte " + count); chararr[chararr_count++]=(char)(((c & 0x1F) << 6) | (char2 & 0x3F)); break; case 14: /* 1110 xxxx 10xx xxxx 10xx xxxx */ count += 3; if (count > utflen) throw new UTFDataFormatException( "malformed input: partial character at end"); char2 = (int) bytearr[count-2]; char3 = (int) bytearr[count-1]; if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) throw new UTFDataFormatException( "malformed input around byte " + (count-1)); chararr[chararr_count++]=(char)(((c & 0x0F) << 12) | ((char2 & 0x3F) << 6) | ((char3 & 0x3F) << 0)); break; default: /* 10xx xxxx, 1111 xxxx */ throw new UTFDataFormatException( "malformed input around byte " + count); } } // The number of chars produced may be less than utflen return new String(chararr, 0, chararr_count); }