Java处理带BOM标记的数据流

最新推荐文章于 2021-07-20 15:49:39 发布

不恋水的雨

最新推荐文章于 2021-07-20 15:49:39 发布

阅读量429

点赞数 1

分类专栏： java 文章标签： BOM Java

本文链接：https://blog.csdn.net/qq_36635569/article/details/103908060

版权

java 专栏收录该内容

24 篇文章 0 订阅

订阅专栏

BOM —— 字节顺序标记(Byte Order Mark)

BOM标记的作用是告诉编辑器当前文件使用的编码方式，方便编辑器识别，一般编辑器中不会显示这个标记，但是这个标记是占用了几个字节的空间。

多字节数据有"大端字节序"(BE)和"小端字节序"(LE)的区别（单个字节不存在字节序问题）。比如0x2138这个值占两个字节：0x21和0x38。计算机读取数据的时候，并不知道哪个是高位字节、哪个是低位字节，它只会按物理顺序依次读取字节，先读0x21，再读0x38；无论大端还是小端，物理上的读取方向都是一样的。

如果是大端字节序，先读到的就是高位字节，后读到的就是低位字节，即0x21高位而0x38低位，小端字节序正好相反0x21低位0x38高位，这个是根据不同CPU的喜好决定的。

至于为什么使用0xFE、0xFF来作为标记，网上的说法是：在Unicode编码中有一个叫做“ZERO WIDTH NO-BREAK SPACE”的字符(U+FEFF，十进制65279)，它是一个不可见的零宽字符；而把字节顺序颠倒后得到的U+FFFE在Unicode中是保证不会被分配的非字符(noncharacter)。因此读到FE FF就说明是大端序，读到FF FE就说明是小端序，而且这个标记本身不会影响文本内容。

BOM最多占4个字节，具体长度和编码方式有关：

编码(+BOM)	占用字节数	bom字节顺序
`UTF-8`	3	`(byte) 0xEF, (byte) 0xBB, (byte) 0xBF`
`UTF-16BE`	2	`(byte) 0xFE, (byte) 0xFF`
`UTF-16LE`	2	`(byte) 0xFF, (byte) 0xFE`
`UTF-32BE`	4	`(byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF`
`UTF-32LE`	4	`(byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00`

BOM标记位于文件起始的几个字节，读取一个带BOM标记的数据流时，只需要获取到前4个字节，即可判断出使用的哪种BOM标记，也可以顺便判断出文件使用的是哪种编码。下面是一个读取BOM标记的工具类：

import java.io.*;

/**
 * Reader that looks for a Unicode byte order mark (BOM) at the start of a byte
 * stream, selects the matching charset and, by default, skips the BOM bytes.
 *
 * <p>Recognized BOMs: UTF-8, UTF-16BE, UTF-16LE, UTF-32BE and UTF-32LE. When no
 * BOM is found, the caller-supplied default encoding is used for decoding.
 *
 * @author lang.zhou
 * @date 2020/1/1
 */
public class UnicodeReader extends Reader {
    /** A BOM is never longer than 4 bytes. */
    private static final int BOM_SIZE = 4;

    private static final byte[] UTF32_BE_BOM = {(byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF};
    private static final byte[] UTF32_LE_BOM = {(byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00};
    private static final byte[] UTF8_BOM = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
    private static final byte[] UTF16_BE_BOM = {(byte) 0xFE, (byte) 0xFF};
    private static final byte[] UTF16_LE_BOM = {(byte) 0xFF, (byte) 0xFE};

    /** Charset name used for decoding; {@code null} selects the platform default. */
    private String encode = null;
    private InputStreamReader reader = null;
    /** Whether the BOM bytes are skipped instead of being handed to the decoder. */
    private boolean removeBom = true;
    /** Number of bytes occupied by the detected BOM (0 when there is none). */
    private int bomCount = 0;
    /** Copy of the detected BOM bytes (empty when there is none). */
    private byte[] bom = null;

    /** Returns the {@code sun.jnu.encoding} system property, used as the fallback charset name. */
    private String getDefaultEnc() {
        return System.getProperty("sun.jnu.encoding");
    }

    /**
     * Creates a reader that skips the BOM.
     *
     * @param in         input stream
     * @param defaultEnc charset name used when the stream has no BOM
     * @throws IOException if the stream cannot be read or the charset is unsupported
     */
    public UnicodeReader(InputStream in, String defaultEnc) throws IOException {
        init(in, defaultEnc);
    }

    /**
     * Creates a reader that falls back to the {@code sun.jnu.encoding} charset.
     *
     * @param in        input stream
     * @param removeBom {@code true} to skip the BOM bytes, {@code false} to leave them in the stream
     * @throws IOException if the stream cannot be read or the charset is unsupported
     */
    public UnicodeReader(InputStream in, boolean removeBom) throws IOException {
        this.removeBom = removeBom;
        init(in, getDefaultEnc());
    }

    /**
     * @param in         input stream
     * @param defaultEnc charset name used when the stream has no BOM
     * @param removeBom  {@code true} to skip the BOM bytes so reading starts right after them,
     *                   {@code false} to leave them in the stream
     * @throws IOException if the stream cannot be read or the charset is unsupported
     */
    public UnicodeReader(InputStream in, String defaultEnc, boolean removeBom) throws IOException {
        this.removeBom = removeBom;
        init(in, defaultEnc);
    }

    /**
     * Creates a reader that skips the BOM and falls back to the {@code sun.jnu.encoding} charset.
     *
     * @param in input stream
     * @throws IOException if the stream cannot be read or the charset is unsupported
     */
    public UnicodeReader(InputStream in) throws IOException {
        init(in, getDefaultEnc());
    }

    /**
     * Returns the charset name chosen for the stream: the BOM's charset when a BOM
     * was detected, otherwise the default encoding passed in (possibly {@code null}).
     */
    public String getEncoding() {
        return encode;
    }

    /**
     * Reads up to {@link #BOM_SIZE} leading bytes, detects the BOM, and builds the
     * decoding reader on top of the remaining bytes.
     *
     * @param in         input stream
     * @param defaultEnc charset name used when no BOM is found; {@code null} means platform default
     * @throws IOException if the stream cannot be read or the charset is unsupported
     */
    protected void init(InputStream in, String defaultEnc) throws IOException {
        PushbackInputStream internalIn = new PushbackInputStream(in, BOM_SIZE);
        byte[] head = new byte[BOM_SIZE];

        // A single read() may legally return fewer bytes than requested, so keep
        // reading until the buffer is full or the stream ends.
        int n = 0;
        while (n < head.length) {
            int r = internalIn.read(head, n, head.length - n);
            if (r < 0) {
                break;
            }
            n += r;
        }

        // The 4-byte UTF-32 marks must be tested before UTF-16: the UTF-32LE mark
        // begins with the UTF-16LE mark. Only the n bytes actually read are compared,
        // so zero padding in the buffer can never complete a match.
        int bomLength;
        if (hasPrefix(head, n, UTF32_BE_BOM)) {
            encode = "UTF-32BE";
            bomLength = UTF32_BE_BOM.length;
        } else if (hasPrefix(head, n, UTF32_LE_BOM)) {
            encode = "UTF-32LE";
            bomLength = UTF32_LE_BOM.length;
        } else if (hasPrefix(head, n, UTF8_BOM)) {
            encode = "UTF-8";
            bomLength = UTF8_BOM.length;
        } else if (hasPrefix(head, n, UTF16_BE_BOM)) {
            encode = "UTF-16BE";
            bomLength = UTF16_BE_BOM.length;
        } else if (hasPrefix(head, n, UTF16_LE_BOM)) {
            encode = "UTF-16LE";
            bomLength = UTF16_LE_BOM.length;
        } else {
            encode = defaultEnc;
            bomLength = 0;
        }

        // Push back everything that was read except the bytes to be skipped.
        int skipped = removeBom ? bomLength : 0;
        if (n > skipped) {
            internalIn.unread(head, skipped, n - skipped);
        }

        if (encode == null) {
            reader = new InputStreamReader(internalIn);
        } else {
            reader = new InputStreamReader(internalIn, encode);
        }

        this.bomCount = bomLength;
        this.bom = new byte[bomLength];
        System.arraycopy(head, 0, this.bom, 0, bomLength);
    }

    /** Returns whether the first {@code available} bytes of {@code data} start with {@code prefix}. */
    private static boolean hasPrefix(byte[] data, int available, byte[] prefix) {
        if (available < prefix.length) {
            return false;
        }
        for (int i = 0; i < prefix.length; i++) {
            if (data[i] != prefix[i]) {
                return false;
            }
        }
        return true;
    }

    /** Returns the length in bytes of the detected BOM, or 0 when the stream has none. */
    public int getBomCount() {
        return bomCount;
    }

    /** Returns a copy of the detected BOM bytes; the array is empty when the stream has none. */
    public byte[] getBom() {
        return bom == null ? null : bom.clone();
    }

    @Override
    public int read(char c[], int offset, int length) throws IOException {
        return reader.read(c, offset, length);
    }

    @Override
    public void close() throws IOException {
        if (reader != null) {
            reader.close();
        }
    }
}

下面是输出带BOM数据流的工具类：

import java.io.*;
import java.util.HashMap;
import java.util.Map;

/**
 * Writer that emits the byte order mark (BOM) of the chosen Unicode encoding at
 * the start of the output stream and then encodes characters with that encoding.
 *
 * <p>A BOM is written for UTF-8, UTF-16BE, UTF-16LE, UTF-32BE and UTF-32LE
 * (matched case-insensitively). Any other encoding is written without a BOM.
 *
 * @author lang.zhou
 * @date 2020/1/1
 */
public class UnicodeWriter extends Writer {
    /** BOM bytes keyed by upper-case charset name. */
    private static final Map<String, byte[]> BOM = initMap();

    /** Upper-cased charset name used for encoding; {@code null} selects the platform default. */
    private String encode = null;
    private OutputStreamWriter writer = null;
    /** Number of BOM bytes written (0 when none). */
    private int bomCount = 0;
    /** The BOM bytes written (empty when none). */
    private byte[] bom = null;

    /** Returns the {@code sun.jnu.encoding} system property, used as the default charset name. */
    private String getDefaultEnc() {
        return System.getProperty("sun.jnu.encoding");
    }

    private static Map<String, byte[]> initMap() {
        Map<String, byte[]> m = new HashMap<>(8);
        m.put("UTF-8", new byte[]{(byte) 0xEF, (byte) 0xBB, (byte) 0xBF});
        m.put("UTF-16BE", new byte[]{(byte) 0xFE, (byte) 0xFF});
        m.put("UTF-16LE", new byte[]{(byte) 0xFF, (byte) 0xFE});
        m.put("UTF-32BE", new byte[]{0x00, 0x00, (byte) 0xFE, (byte) 0xFF});
        m.put("UTF-32LE", new byte[]{(byte) 0xFF, (byte) 0xFE, 0x00, 0x00});
        return m;
    }

    /**
     * @param out        output stream that receives the BOM and the encoded text
     * @param defaultEnc charset name to encode with; {@code null} means platform default
     * @throws IOException if the BOM cannot be written or the charset is unsupported
     */
    public UnicodeWriter(OutputStream out, String defaultEnc) throws IOException {
        init(out, defaultEnc);
    }

    /**
     * Creates a writer that uses the {@code sun.jnu.encoding} charset.
     *
     * @param out output stream that receives the BOM and the encoded text
     * @throws IOException if the BOM cannot be written or the charset is unsupported
     */
    public UnicodeWriter(OutputStream out) throws IOException {
        init(out, getDefaultEnc());
    }

    /** Returns the upper-cased charset name in use, or {@code null} for the platform default. */
    public String getEncoding() {
        return encode;
    }

    /** Returns the number of BOM bytes written, or 0 when the encoding has no BOM. */
    public int getBomCount() {
        return bomCount;
    }

    /** Returns a copy of the BOM bytes written; the array is empty when none were written. */
    public byte[] getBom() {
        // Copy so callers cannot modify the shared BOM table.
        return bom == null ? null : bom.clone();
    }

    /**
     * Writes the BOM (if the encoding has one) straight to {@code out} and then
     * creates the encoding writer on top of it.
     *
     * @param out      output stream
     * @param encoding charset name; {@code null} means platform default, written without a BOM
     * @throws IOException if the BOM cannot be written or the charset is unsupported
     */
    protected void init(OutputStream out, String encoding) throws IOException {
        byte[] mark = null;
        if (encoding != null) {
            // Locale.ROOT keeps the result independent of the default locale's case rules.
            this.encode = encoding.toUpperCase(java.util.Locale.ROOT);
            // Look up with the normalized name so "utf-8" finds the "UTF-8" entry.
            mark = BOM.get(this.encode);
        }
        if (mark != null) {
            out.write(mark);
            this.bom = mark;
            this.bomCount = mark.length;
        } else {
            // Encodings without a BOM entry are simply written without one.
            this.bom = new byte[0];
            this.bomCount = 0;
        }
        if (encode == null) {
            writer = new OutputStreamWriter(out);
        } else {
            writer = new OutputStreamWriter(out, encode);
        }
    }

    @Override
    public void write(char[] c, int off, int len) throws IOException {
        writer.write(c, off, len);
    }

    @Override
    public void flush() throws IOException {
        writer.flush();
    }

    @Override
    public void close() throws IOException {
        if (writer != null) {
            writer.close();
        }
    }
}

测试工具：

public static void main(String[] args) throws Exception {
    // Write a file that starts with a UTF-8 BOM.
    // try-with-resources closes the writer (which flushes it) and then the file
    // stream, even when write() throws, so no explicit close calls are needed.
    try (FileOutputStream out = new FileOutputStream("C:\\Users\\Administrator\\Desktop\\1.txt");
         UnicodeWriter writer = new UnicodeWriter(out, "UTF-8")) {
        writer.write("哈哈哈");
        writer.flush();
    }
}

用Notepad++打开文件后：

编辑器识别出了bom标记，并且没有显示bom标记

public static void main(String[] args) throws Exception {
    // Read the file back; this constructor skips the BOM, so decoding starts at
    // the first byte after it. Both resources are closed when the block exits.
    try (FileInputStream in = new FileInputStream("C:\\Users\\Administrator\\Desktop\\1.txt");
         UnicodeReader re = new UnicodeReader(in)) {
        // The detected BOM bytes are available through getBom().
        re.getBom();
        char[] cf = new char[3];
        // read() may return fewer characters than requested, so fill the buffer
        // in a loop and print only what was actually read.
        int filled = 0;
        while (filled < cf.length) {
            int r = re.read(cf, filled, cf.length - filled);
            if (r < 0) {
                break;
            }
            filled += r;
        }
        System.out.println(new String(cf, 0, filled));
    }
}

控制台输出：

正确的读取并跳过了BOM标记

不恋水的雨

关注

1
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Java处理带BOM标记的数据流

BOM —— 字节顺序标记(Byte Order Mark)BOM标记的作用是告诉编辑器当前文件使用的编码方式，方便编辑器识别，一般编辑器中不会显示这个标记，但是这个标记是占用了几个字节的空间。一个字节有"大字节序"(BE)和"小字节序"(LE)的区别，比如0x2138是两个字节的字符，0x21和0x38，计算机处理字节序的时候，不知道什么是高位字节，什么是低位字节。它只知道按顺序读...
复制链接

扫一扫