/**
 * Hand-written UTF-8 encoder and decoder.
 *
 * <pre>
 * Char. number range  |        UTF-8 octet sequence
 *    (hexadecimal)    |              (binary)
 * --------------------+---------------------------------------------
 * 0000 0000-0000 007F | 0xxxxxxx
 * 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
 * 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
 * 0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * </pre>
 *
 * @author Bearing
 * @create 2022-03-19
 */
public class Utf8Coding {

    /** Substituted by {@link #decode(byte[])} for every byte that cannot be decoded (U+FFFD). */
    private static final char REPLACEMENT = '\uFFFD';

    /** Marker bits of a continuation byte: {@code 10xxxxxx}. */
    private static final int CONTINUATION = 0b10_000000;

    /** Mask selecting the six payload bits of a continuation byte. */
    private static final int PAYLOAD_MASK = 0b00_111111;

    /**
     * Encodes a string as a UTF-8 byte array.
     *
     * <p>The string is walked by Unicode code point, not by {@code char}, so a surrogate
     * pair is emitted as a single four-byte sequence. An unpaired surrogate is encoded
     * leniently as a three-byte sequence so that {@link #decode(byte[])} can restore it.
     *
     * @param str the string to encode; must not be {@code null}
     * @return the UTF-8 encoding of {@code str}
     * @throws NullPointerException if {@code str} is {@code null}
     */
    public static byte[] getBytes(String str) {
        // Worst case is 3 bytes per UTF-16 code unit: a BMP char needs at most 3 bytes,
        // and a surrogate pair (2 code units) needs 4 bytes.
        byte[] bytes = new byte[str.length() * 3];
        // index is the next free position in bytes
        int index = 0;
        for (int i = 0; i < str.length(); ) {
            int cp = str.codePointAt(i);
            // Advance by 2 code units when the code point was a surrogate pair.
            i += Character.charCount(cp);

            if (cp <= 0x7F) {
                // One byte: 0xxxxxxx
                bytes[index++] = (byte) cp;
            } else if (cp <= 0x7FF) {
                // Two bytes: 110xxxxx 10xxxxxx
                bytes[index++] = (byte) (0b110_00000 | (cp >>> 6));
                bytes[index++] = (byte) (CONTINUATION | (cp & PAYLOAD_MASK));
            } else if (cp <= 0xFFFF) {
                // Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
                bytes[index++] = (byte) (0b1110_0000 | (cp >>> 12));
                bytes[index++] = (byte) (CONTINUATION | ((cp >>> 6) & PAYLOAD_MASK));
                bytes[index++] = (byte) (CONTINUATION | (cp & PAYLOAD_MASK));
            } else {
                // Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
                bytes[index++] = (byte) (0b11110_000 | (cp >>> 18));
                bytes[index++] = (byte) (CONTINUATION | ((cp >>> 12) & PAYLOAD_MASK));
                bytes[index++] = (byte) (CONTINUATION | ((cp >>> 6) & PAYLOAD_MASK));
                bytes[index++] = (byte) (CONTINUATION | (cp & PAYLOAD_MASK));
            }
        }
        // Fully qualified so the class does not depend on a file-level import.
        return java.util.Arrays.copyOf(bytes, index);
    }

    /**
     * Decodes a UTF-8 byte array into a string.
     *
     * <p>Code points above U+FFFF are returned as surrogate pairs. A byte that cannot
     * start a sequence, a sequence cut short by the end of the array, a sequence whose
     * continuation bytes are not of the form {@code 10xxxxxx}, or a value above U+10FFFF
     * produces one U+FFFD replacement character for the offending byte; decoding then
     * resumes at the following byte. Overlong forms and encoded surrogates are accepted
     * as-is.
     *
     * @param bytes the UTF-8 encoded bytes; must not be {@code null}
     * @return the decoded string
     * @throws NullPointerException if {@code bytes} is {@code null}
     */
    public static String decode(byte[] bytes) {
        // Each input byte yields at most one char (a 4-byte sequence yields at most 2),
        // so bytes.length chars is always enough.
        char[] chs = new char[bytes.length];
        int index = 0;
        int i = 0;
        while (i < bytes.length) {
            int b1 = bytes[i] & 0xff;
            int extra; // number of continuation bytes expected after the lead byte
            int cp;    // code point being assembled, seeded with the lead byte's payload
            if ((b1 >>> 7) == 0) {
                // 0xxxxxxx: single byte
                extra = 0;
                cp = b1;
            } else if ((b1 >>> 5) == 0b110) {
                // 110xxxxx: two-byte sequence
                extra = 1;
                cp = b1 & 0b11111;
            } else if ((b1 >>> 4) == 0b1110) {
                // 1110xxxx: three-byte sequence
                extra = 2;
                cp = b1 & 0b1111;
            } else if ((b1 >>> 3) == 0b11110) {
                // 11110xxx: four-byte sequence (only 3 payload bits in the lead byte)
                extra = 3;
                cp = b1 & 0b111;
            } else {
                // Stray continuation byte or a lead byte that UTF-8 does not define.
                extra = -1;
                cp = 0;
            }

            // The last continuation byte sits at i + extra; it must be inside the array.
            boolean valid = extra >= 0 && i + extra < bytes.length;
            for (int k = 1; valid && k <= extra; k++) {
                int next = bytes[i + k] & 0xff;
                if ((next >>> 6) != 0b10) {
                    valid = false;
                } else {
                    cp = (cp << 6) | (next & PAYLOAD_MASK);
                }
            }

            if (!valid || cp > Character.MAX_CODE_POINT) {
                // Replace only the offending byte so decoding can resynchronize.
                chs[index++] = REPLACEMENT;
                i++;
                continue;
            }

            // Writes one char for BMP code points, a surrogate pair otherwise.
            index += Character.toChars(cp, chs, index);
            i += extra + 1;
        }
        return new String(chs, 0, index);
    }

    /**
     * Demo: prints each char of a sample string in binary, then the bytes produced by
     * {@link #getBytes(String)} in binary, then the string recovered by
     * {@link #decode(byte[])}.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        String str = "Javaλ表达式";
        System.out.println("Unicode:");
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            System.out.print(c);
            System.out.print(":");
            System.out.println(Integer.toBinaryString(c));
        }
        // Encode with the hand-written UTF-8 encoder
        byte[] bytes = getBytes(str);
        for (byte b : bytes) {
            System.out.println(Integer.toBinaryString(b & 0xff));
        }
        // Decode with the hand-written UTF-8 decoder
        String s = decode(bytes);
        System.out.println(s);
    }
}
手工编写UTF-8编码和解码
最新推荐文章于 2023-04-10 10:03:01 发布