手写UTF-8编码与解码

最新推荐文章于 2024-02-22 22:22:45 发布

天天++

最新推荐文章于 2024-02-22 22:22:45 发布

阅读量1k

点赞数

分类专栏： JAVA 文章标签： java Powered by 金山文档

本文链接：https://blog.csdn.net/m0_56090275/article/details/128925294

版权

JAVA 专栏收录该内容

9 篇文章 0 订阅

订阅专栏

1.UTF-8介绍

UTF-8，是UNICODE的一种变长字符编码，由Ken Thompson于1992年创建。现在已经标准化为RFC 3629。最初的设计允许用1到6个字节编码一个UNICODE字符，但RFC 3629已将其限制为1到4个字节（码点上限为U+10FFFF）。如果UNICODE字符在UTF-16中由2个字节表示，则编码成UTF-8很可能需要3个字节；而如果UNICODE字符在UTF-16中由4个字节（代理对）表示，则编码成UTF-8需要4个字节。用4个字节去编码一个UNICODE字符可能显得太多了，但很少会遇到那样的UNICODE字符。

UTF-8编码的优点：

UTF-8编码可以通过屏蔽位和移位操作快速读写。

字符串比较时strcmp()和wcscmp()的返回结果相同，因此使排序变得更加容易。

字节FF和FE在UTF-8编码中永远不会出现，因此它们可以用来表明UTF-16或UTF-32文本（见BOM）

UTF-8 是字节顺序无关的。它的字节顺序在所有系统中都是一样的，因此它实际上并不需要BOM。

UTF-8编码的缺点：

你无法从UNICODE字符数判断出UTF-8文本的字节数，因为UTF-8是一种变长编码

它需要用2个字节编码那些用扩展ASCII字符集只需1个字节的字符

ISO Latin-1 是UNICODE的子集，但不是UTF-8的子集

8位字符的UTF-8编码会被email网关过滤，因为internet信息最初设计为7位ASCII码。因此产生了UTF-7编码。

UTF-8 在它的表示中使用值100xxxxx的几率超过50%，而现存的实现如ISO 2022， 4873， 6429，和8859系统，会把它错认为是C1 控制码。因此产生了UTF-7.5编码。

2.实现

/**
 * Hand-written UTF-8 encoder and decoder.
 *
 * <p>Both directions work on Unicode code points rather than on UTF-16 code units, so a
 * character outside the Basic Multilingual Plane (held in a {@code String} as a surrogate
 * pair) is written as one four-byte sequence and is read back as a surrogate pair.
 *
 * @author ttsuccess
 *
 */
public class Utf8Coding {
    /*
       Char. number range  |        UTF-8 octet sequence
      (hexadecimal)    |              (binary)
   --------------------+---------------------------------------------
   0000 0000-0000 007F | 0xxxxxxx
   0000 0080-0000 07FF | 110xxxxx 10xxxxxx
   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
   0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     */

    /**
     * Encodes a string as a UTF-8 byte array.
     *
     * <p>The string is walked code point by code point, so a valid surrogate pair produces a
     * single four-byte sequence. An unpaired surrogate is returned by
     * {@link String#codePointAt(int)} as its own value and is therefore written as a
     * three-byte sequence.
     *
     * @param str the string to encode
     * @return the UTF-8 encoded bytes; an empty array for an empty string
     */
    public static byte[] getBytes(String str) {
        // Worst case: a single char needs at most 3 bytes, and a surrogate pair (2 chars)
        // needs 4 bytes, so 3 bytes per char is always enough.
        byte[] bytes = new byte[str.length() * 3];
        // Next free position in bytes.
        int index = 0;
        for (int i = 0; i < str.length(); ) {
            int cp = str.codePointAt(i);
            // Advance by 2 when a surrogate pair was consumed, otherwise by 1.
            i += Character.charCount(cp);
            if (cp <= 0x7F) {
                // 0xxxxxxx
                bytes[index++] = (byte) cp;
            } else if (cp <= 0x7FF) {
                // 110xxxxx 10xxxxxx
                bytes[index++] = (byte) (((cp >>> 6) & 0x1F) | 0b110_00000);
                bytes[index++] = (byte) ((cp & 0x3F) | 0b10_000000);
            } else if (cp <= 0xFFFF) {
                // 1110xxxx 10xxxxxx 10xxxxxx
                bytes[index++] = (byte) (((cp >>> 12) & 0xF) | 0b1110_0000);
                bytes[index++] = (byte) (((cp >>> 6) & 0x3F) | 0b10_000000);
                bytes[index++] = (byte) ((cp & 0x3F) | 0b10_000000);
            } else {
                // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                bytes[index++] = (byte) (((cp >>> 18) & 0x7) | 0b11110_000);
                bytes[index++] = (byte) (((cp >>> 12) & 0x3F) | 0b10_000000);
                bytes[index++] = (byte) (((cp >>> 6) & 0x3F) | 0b10_000000);
                bytes[index++] = (byte) ((cp & 0x3F) | 0b10_000000);
            }
        }
        // Trim the over-allocated buffer to the bytes actually written.
        return java.util.Arrays.copyOf(bytes, index);
    }

    /**
     * Decodes a UTF-8 byte array into a string.
     *
     * <p>The lead byte of each sequence selects its length: {@code 0xxxxxxx} is one byte,
     * {@code 110xxxxx} two, {@code 1110xxxx} three and {@code 11110xxx} four. A byte in lead
     * position that matches none of these patterns is skipped. Continuation bytes are not
     * validated; only their low six bits are used. A four-byte sequence whose value exceeds
     * U+10FFFF decodes to the replacement character U+FFFD.
     *
     * @param bytes the UTF-8 encoded bytes
     * @return the decoded string
     * @throws ArrayIndexOutOfBoundsException if the array ends in the middle of a
     *         multi-byte sequence
     */
    public static String decode(byte[] bytes) {
        // Every sequence yields no more chars than it has bytes (4 bytes give at most
        // 2 chars), so bytes.length chars is always enough.
        char[] chs = new char[bytes.length];
        int index = 0;
        for (int i = 0; i < bytes.length; ) {
            int b1 = bytes[i++] & 0xff;
            if ((b1 >>> 7) == 0) {
                // Single byte.
                chs[index++] = (char) b1;
            } else if ((b1 >>> 5) == 0b110) {
                int b2 = bytes[i++] & 0xff;
                // Strip the marker bits and merge the payload bits.
                int c = ((b1 & 0b11111) << 6) | (b2 & 0b111111);
                chs[index++] = (char) c;
            } else if ((b1 >>> 4) == 0b1110) {
                int b2 = bytes[i++] & 0xff;
                int b3 = bytes[i++] & 0xff;
                int c = ((b1 & 0b1111) << 12) | ((b2 & 0b111111) << 6) | (b3 & 0b111111);
                chs[index++] = (char) c;
            } else if ((b1 >>> 3) == 0b11110) {
                int b2 = bytes[i++] & 0xff;
                int b3 = bytes[i++] & 0xff;
                int b4 = bytes[i++] & 0xff;
                int cp = ((b1 & 0b111) << 18) | ((b2 & 0b111111) << 12)
                        | ((b3 & 0b111111) << 6) | (b4 & 0b111111);
                if (Character.isSupplementaryCodePoint(cp)) {
                    // A char holds only 16 bits, so split into a surrogate pair.
                    chs[index++] = Character.highSurrogate(cp);
                    chs[index++] = Character.lowSurrogate(cp);
                } else if (cp <= 0xFFFF) {
                    // Overlong form of a BMP character; fits in one char.
                    chs[index++] = (char) cp;
                } else {
                    // Beyond U+10FFFF: not a Unicode code point.
                    chs[index++] = '\uFFFD';
                }
            }
            // Any other lead byte (10xxxxxx or 11111xxx) is skipped.
        }
        return new String(chs, 0, index);
    }


    /**
     * Demo: prints the binary value of each char of a sample string, then each byte of its
     * UTF-8 encoding, then the string obtained by decoding those bytes.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String str = "你好，Java";
        System.out.println("Unicode:");
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            System.out.print(c);
            System.out.print(":");
            System.out.println(Integer.toBinaryString(c));
        }

        // Encode with the hand-written encoder.
        byte[] bytes = getBytes(str);
        for (byte b : bytes) {
            System.out.println(Integer.toBinaryString(b & 0xff));
        }

        // Decode with the hand-written decoder.
        String s = decode(bytes);
        System.out.println(s);
    }
}