手工编写UTF-8编码和解码

最新推荐文章于 2023-04-10 10:03:01 发布

FromZero_ZC

最新推荐文章于 2023-04-10 10:03:01 发布

阅读量409

点赞数

文章标签： java

本文链接：https://blog.csdn.net/m0_65470619/article/details/123597894

版权

/**
 * Hand-written UTF-8 encoder and decoder.
 *
 * @author Bearing
 * @create 2022-03-19
 */
public class Utf8Coding {
    /*
       Char. number range  |        UTF-8 octet sequence
      (hexadecimal)    |              (binary)
   --------------------+---------------------------------------------
   0000 0000-0000 007F | 0xxxxxxx
   0000 0080-0000 07FF | 110xxxxx 10xxxxxx
   0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
   0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
     */

    /**
     * Encodes a string as a UTF-8 byte array.
     *
     * <p>The string is walked by Unicode code point rather than by UTF-16
     * {@code char}, so a surrogate pair is encoded as a single four-byte
     * sequence. An unpaired surrogate is encoded as-is using the three-byte
     * form.
     *
     * @param str the string to encode
     * @return the UTF-8 encoded bytes
     * @throws NullPointerException if {@code str} is {@code null}
     */
    public static byte[] getBytes(String str) {
        final int length = str.length();
        // Worst case is 3 bytes per UTF-16 code unit: a BMP char needs at most
        // 3 bytes, and a surrogate pair (2 code units) needs 4 bytes.
        byte[] bytes = new byte[length * 3];
        // index is the next write position in bytes
        int index = 0;
        for (int i = 0; i < length; ) {
            int cp = str.codePointAt(i);
            // Advance by 2 for a surrogate pair, by 1 otherwise.
            i += Character.charCount(cp);
            if (cp <= 0x7F) {
                // 0xxxxxxx
                bytes[index++] = (byte) cp;
            } else if (cp <= 0x7FF) {
                // 110xxxxx 10xxxxxx
                bytes[index++] = (byte) (0b110_00000 | ((cp >>> 6) & 0x1F));
                bytes[index++] = (byte) (0b10_000000 | (cp & 0x3F));
            } else if (cp <= 0xFFFF) {
                // 1110xxxx 10xxxxxx 10xxxxxx
                bytes[index++] = (byte) (0b1110_0000 | ((cp >>> 12) & 0xF));
                bytes[index++] = (byte) (0b10_000000 | ((cp >>> 6) & 0x3F));
                bytes[index++] = (byte) (0b10_000000 | (cp & 0x3F));
            } else {
                // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                bytes[index++] = (byte) (0b11110_000 | ((cp >>> 18) & 0x7));
                bytes[index++] = (byte) (0b10_000000 | ((cp >>> 12) & 0x3F));
                bytes[index++] = (byte) (0b10_000000 | ((cp >>> 6) & 0x3F));
                bytes[index++] = (byte) (0b10_000000 | (cp & 0x3F));
            }
        }
        // Fully qualified because this file declares no imports.
        return java.util.Arrays.copyOf(bytes, index);
    }

    /**
     * Decodes a UTF-8 byte array into a string.
     *
     * <p>Decoding is lenient: a byte that is not a valid lead byte is skipped,
     * a multi-byte sequence cut off by the end of the array is dropped, and a
     * four-byte sequence whose value exceeds U+10FFFF is skipped. Continuation
     * bytes are not checked for the {@code 10xxxxxx} prefix; only their low
     * six bits are used.
     *
     * @param bytes the UTF-8 encoded bytes
     * @return the decoded string
     * @throws NullPointerException if {@code bytes} is {@code null}
     */
    public static String decode(byte[] bytes) {
        // Every sequence yields at most as many chars as it has bytes
        // (4 bytes -> at most 2 chars), so bytes.length is a safe capacity.
        char[] chs = new char[bytes.length];
        int index = 0;
        int i = 0;
        while (i < bytes.length) {
            int b1 = bytes[i++] & 0xff;
            if ((b1 >>> 7) == 0) {
                // Leading 0: single-byte sequence
                chs[index++] = (char) b1;
            } else if ((b1 >>> 5) == 0b110) {
                // Leading 110: two-byte sequence
                if (bytes.length - i < 1) {
                    break; // truncated trailing sequence
                }
                int b2 = bytes[i++] & 0xff;
                // Strip the marker bits and merge the payload bits
                chs[index++] = (char) (((b1 & 0b11111) << 6) | (b2 & 0b111111));
            } else if ((b1 >>> 4) == 0b1110) {
                // Leading 1110: three-byte sequence
                if (bytes.length - i < 2) {
                    break; // truncated trailing sequence
                }
                int b2 = bytes[i++] & 0xff;
                int b3 = bytes[i++] & 0xff;
                chs[index++] = (char) (((b1 & 0b1111) << 12)
                        | ((b2 & 0b111111) << 6)
                        | (b3 & 0b111111));
            } else if ((b1 >>> 3) == 0b11110) {
                // Leading 11110: four-byte sequence
                if (bytes.length - i < 3) {
                    break; // truncated trailing sequence
                }
                int b2 = bytes[i++] & 0xff;
                int b3 = bytes[i++] & 0xff;
                int b4 = bytes[i++] & 0xff;
                int cp = ((b1 & 0b111) << 18)
                        | ((b2 & 0b111111) << 12)
                        | ((b3 & 0b111111) << 6)
                        | (b4 & 0b111111);
                // A code point above U+FFFF does not fit in one char; write it
                // as a surrogate pair. Values above U+10FFFF are not Unicode
                // and are skipped.
                if (Character.isValidCodePoint(cp)) {
                    index += Character.toChars(cp, chs, index);
                }
            }
            // Any other lead byte (10xxxxxx or 11111xxx) is skipped.
        }
        return new String(chs, 0, index);
    }


    /**
     * Demo: prints the binary form of each char of a sample string, then its
     * hand-encoded UTF-8 bytes, then the string decoded back from those bytes.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        String str = "Javaλ表达式";
        System.out.println("Unicode:");
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            System.out.print(c);
            System.out.print(":");
            System.out.println(Integer.toBinaryString(c));
        }

        // Encode with the hand-written UTF-8 encoder
        byte[] bytes = getBytes(str);
        for (byte b : bytes) {
            System.out.println(Integer.toBinaryString(b & 0xff));
        }

        // Decode with the hand-written UTF-8 decoder
        String s = decode(bytes);
        System.out.println(s);
    }
}