BOM —— 字节顺序标记(Byte Order Mark)
BOM标记的作用是告诉编辑器当前文件使用的编码方式,方便编辑器识别,一般编辑器中不会显示这个标记,但是这个标记是占用了几个字节的空间。
多字节的数据有"大端字节序"(BE)和"小端字节序"(LE)的区别,比如0x2138是一个占两个字节的字符,由0x21和0x38组成。计算机读取的时候,并不知道哪个是高位字节、哪个是低位字节,它只会按顺序读取字节,先读0x21,再读0x38;无论大端还是小端,物理上都是沿同一个方向读取的。
如果是大端字节序,先读到的就是高位字节,后读到的就是低位字节,即0x21高位而0x38低位,小端字节序正好相反0x21低位0x38高位,这个是根据不同CPU的喜好决定的。
至于为什么使用0xFE、0xFF来作为标记,网上的说法是:在Unicode编码中有一个叫做"ZERO WIDTH NO-BREAK SPACE"的字符(十进制65279),它的编码是U+FEFF;而字节序颠倒后得到的U+FFFE在Unicode中是非字符(noncharacter),不会出现在正常文本里。因此读到的是FE FF还是FF FE就能区分字节序,而这个字符本身对文本内容没有实际意义。
BOM最大占4个字节,和编码方式有关系:
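| 编码(+BOM) | 占用字节数 | bom字节顺序 |
| --- | --- | --- |
| UTF-8 | 3 | EF BB BF |
| UTF-16BE | 2 | FE FF |
| UTF-16LE | 2 | FF FE |
| UTF-32BE | 4 | 00 00 FE FF |
| UTF-32LE | 4 | FF FE 00 00 |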
BOM标记位于文件起始的几个字节,读取一个带BOM标记的数据流时,只需要获取到前4个字节,即可判断出使用的哪种BOM标记,也可以顺便判断出文件使用的是哪种编码。下面是一个读取BOM标记的工具类:
import java.io.*;
/**
 * A {@link Reader} that inspects the first bytes of a stream for a Unicode
 * byte order mark (BOM), picks the matching charset, and then decodes the
 * rest of the stream with it.
 *
 * <p>Recognised marks: UTF-8 (EF BB BF), UTF-16BE (FE FF), UTF-16LE (FF FE),
 * UTF-32BE (00 00 FE FF) and UTF-32LE (FF FE 00 00). When no mark is found
 * the caller-supplied default encoding is used.
 *
 * @author lang.zhou
 * @date 2020/1/1
 */
public class UnicodeReader extends Reader {
    /** A BOM is never longer than 4 bytes. */
    private static final int BOM_SIZE = 4;

    /** Charset name used to decode the stream; may be null (platform default). */
    private String encode = null;
    private InputStreamReader reader = null;
    /** Whether the BOM bytes are skipped (true) or left in the stream (false). */
    private boolean removeBom = true;
    /** Number of bytes occupied by the detected BOM; 0 when there is none. */
    private int bomCount = 0;
    /** The detected BOM bytes; empty when there is none. */
    private byte[] bom = null;

    /** Returns the fallback encoding taken from the {@code sun.jnu.encoding} system property. */
    private String getDefaultEnc() {
        return System.getProperty("sun.jnu.encoding");
    }

    /**
     * @param in         input stream
     * @param defaultEnc encoding to use when the stream has no BOM
     * @throws IOException if reading the stream head fails
     */
    public UnicodeReader(InputStream in, String defaultEnc) throws IOException {
        init(in, defaultEnc);
    }

    /**
     * @param in        input stream
     * @param removeBom whether to skip the BOM bytes when reading
     * @throws IOException if reading the stream head fails
     */
    public UnicodeReader(InputStream in, boolean removeBom) throws IOException {
        this.removeBom = removeBom;
        init(in, getDefaultEnc());
    }

    /**
     * @param in         input stream
     * @param defaultEnc encoding to use when the stream has no BOM
     * @param removeBom  whether to skip the BOM bytes when reading
     * @throws IOException if reading the stream head fails
     */
    public UnicodeReader(InputStream in, String defaultEnc, boolean removeBom) throws IOException {
        this.removeBom = removeBom;
        init(in, defaultEnc);
    }

    /**
     * @param in input stream
     * @throws IOException if reading the stream head fails
     */
    public UnicodeReader(InputStream in) throws IOException {
        init(in, getDefaultEnc());
    }

    /**
     * Returns the charset name chosen for the stream: the one implied by the
     * BOM, or the default encoding when no BOM was found.
     */
    public String getEncoding() {
        return encode;
    }

    /**
     * Reads up to {@link #BOM_SIZE} bytes from the stream, detects the BOM,
     * pushes back the bytes that are not to be skipped, and creates the
     * decoding reader.
     *
     * @param in         input stream
     * @param defaultEnc encoding to use when the stream has no BOM
     * @throws IOException if reading fails or the chosen encoding is unsupported
     */
    protected void init(InputStream in, String defaultEnc) throws IOException {
        PushbackInputStream internalIn = new PushbackInputStream(in, BOM_SIZE);
        byte[] head = new byte[BOM_SIZE];
        int n = readHead(internalIn, head);

        // Every comparison is guarded by the number of bytes really read, so
        // the zero padding of a short stream can never be mistaken for BOM bytes.
        // UTF-32LE must be tested before UTF-16LE because it starts with FF FE too.
        int bomLength;
        if (n >= 4 && head[0] == (byte) 0x00 && head[1] == (byte) 0x00
                && head[2] == (byte) 0xFE && head[3] == (byte) 0xFF) {
            encode = "UTF-32BE";
            bomLength = 4;
        } else if (n >= 4 && head[0] == (byte) 0xFF && head[1] == (byte) 0xFE
                && head[2] == (byte) 0x00 && head[3] == (byte) 0x00) {
            encode = "UTF-32LE";
            bomLength = 4;
        } else if (n >= 3 && head[0] == (byte) 0xEF && head[1] == (byte) 0xBB
                && head[2] == (byte) 0xBF) {
            encode = "UTF-8";
            bomLength = 3;
        } else if (n >= 2 && head[0] == (byte) 0xFE && head[1] == (byte) 0xFF) {
            encode = "UTF-16BE";
            bomLength = 2;
        } else if (n >= 2 && head[0] == (byte) 0xFF && head[1] == (byte) 0xFE) {
            encode = "UTF-16LE";
            bomLength = 2;
        } else {
            encode = defaultEnc;
            bomLength = 0;
        }

        // Push back only bytes that were actually read: everything when the
        // BOM is kept, otherwise just the payload bytes that follow it.
        int keepFrom = this.removeBom ? bomLength : 0;
        if (n > keepFrom) {
            internalIn.unread(head, keepFrom, n - keepFrom);
        }

        if (encode == null) {
            reader = new InputStreamReader(internalIn);
        } else {
            reader = new InputStreamReader(internalIn, encode);
        }

        this.bomCount = bomLength;
        this.bom = new byte[bomLength];
        System.arraycopy(head, 0, this.bom, 0, bomLength);
    }

    /**
     * Fills {@code buf} from {@code in} as far as possible, because a single
     * {@code read} call may return fewer bytes than requested.
     *
     * @return the number of bytes read, between 0 and {@code buf.length}
     */
    private static int readHead(InputStream in, byte[] buf) throws IOException {
        int total = 0;
        while (total < buf.length) {
            int r = in.read(buf, total, buf.length - total);
            if (r <= 0) {
                // End of stream (or a stream that makes no progress).
                break;
            }
            total += r;
        }
        return total;
    }

    /** Returns the number of bytes occupied by the detected BOM, or 0 if none. */
    public int getBomCount() {
        return bomCount;
    }

    /** Returns a copy of the detected BOM bytes; empty if the stream had no BOM. */
    public byte[] getBom() {
        return bom == null ? null : bom.clone();
    }

    @Override
    public int read(char c[], int offset, int length) throws IOException {
        return reader.read(c, offset, length);
    }

    @Override
    public void close() throws IOException {
        if (reader != null) {
            reader.close();
        }
    }
}
下面是输出带BOM数据流的工具类:
import java.io.*;
import java.util.HashMap;
import java.util.Map;
/**
 * A {@link Writer} that first emits the byte order mark (BOM) belonging to
 * the requested encoding and then encodes all written characters with it.
 *
 * <p>A BOM is written for UTF-8, UTF-16BE, UTF-16LE, UTF-32BE and UTF-32LE
 * (matched case-insensitively). For any other encoding, or when no encoding
 * is given, no BOM is written.
 *
 * @author lang.zhou
 * @date 2020/1/1
 */
public class UnicodeWriter extends Writer {
    /** BOM bytes keyed by upper-case charset name. */
    private static final Map<String, byte[]> BOM = initMap();

    /** Upper-cased charset name used for encoding; null means platform default. */
    private String encode = null;
    private OutputStreamWriter writer = null;
    /** Number of BOM bytes written; 0 when none was written. */
    private int bomCount = 0;
    /** The BOM bytes written; empty when none was written. */
    private byte[] bom = null;

    /** Returns the fallback encoding taken from the {@code sun.jnu.encoding} system property. */
    private String getDefaultEnc() {
        return System.getProperty("sun.jnu.encoding");
    }

    /** Builds the table of known BOMs. */
    private static Map<String, byte[]> initMap() {
        Map<String, byte[]> m = new HashMap<String, byte[]>(5);
        m.put("UTF-8", new byte[]{(byte) 0xEF, (byte) 0xBB, (byte) 0xBF});
        m.put("UTF-16BE", new byte[]{(byte) 0xFE, (byte) 0xFF});
        m.put("UTF-16LE", new byte[]{(byte) 0xFF, (byte) 0xFE});
        m.put("UTF-32BE", new byte[]{0x00, 0x00, (byte) 0xFE, (byte) 0xFF});
        m.put("UTF-32LE", new byte[]{(byte) 0xFF, (byte) 0xFE, 0x00, 0x00});
        return m;
    }

    /**
     * @param out        output stream
     * @param defaultEnc encoding to write with; decides which BOM is emitted
     * @throws IOException if writing the BOM fails or the encoding is unsupported
     */
    public UnicodeWriter(OutputStream out, String defaultEnc) throws IOException {
        init(out, defaultEnc);
    }

    /**
     * Uses the encoding named by the {@code sun.jnu.encoding} system property.
     *
     * @param out output stream
     * @throws IOException if writing the BOM fails or the encoding is unsupported
     */
    public UnicodeWriter(OutputStream out) throws IOException {
        init(out, getDefaultEnc());
    }

    /** Returns the upper-cased charset name in use, or null for the platform default. */
    public String getEncoding() {
        return encode;
    }

    /** Returns the number of BOM bytes written, or 0 if none. */
    public int getBomCount() {
        return bomCount;
    }

    /** Returns a copy of the BOM bytes written; empty if none was written. */
    public byte[] getBom() {
        // Copy so callers cannot modify the shared BOM table.
        return bom == null ? null : bom.clone();
    }

    /**
     * Writes the BOM for {@code encoding} (if it has one) and creates the
     * encoding writer.
     *
     * @param out      output stream
     * @param encoding charset name, or null for the platform default
     * @throws IOException if writing the BOM fails or the encoding is unsupported
     */
    protected void init(OutputStream out, String encoding) throws IOException {
        byte[] mark = null;
        if (encoding != null) {
            // Locale.ROOT keeps the result independent of the default locale.
            this.encode = encoding.toUpperCase(java.util.Locale.ROOT);
            // Look up with the normalised name so "utf-8" finds "UTF-8".
            mark = BOM.get(this.encode);
        }
        if (mark == null) {
            // Encoding without a BOM (or no encoding): write nothing.
            this.bom = new byte[0];
        } else {
            this.bom = mark.clone();
            out.write(mark);
        }
        this.bomCount = this.bom.length;

        if (encode == null) {
            writer = new OutputStreamWriter(out);
        } else {
            writer = new OutputStreamWriter(out, encode);
        }
    }

    @Override
    public void write(char[] c, int off, int len) throws IOException {
        writer.write(c, off, len);
    }

    @Override
    public void flush() throws IOException {
        writer.flush();
    }

    @Override
    public void close() throws IOException {
        if (writer != null) {
            writer.close();
        }
    }
}
测试工具:
/**
 * Demo: writes a file that starts with a UTF-8 BOM followed by sample text.
 *
 * @param args unused
 * @throws Exception if the file cannot be created or written
 */
public static void main(String[] args) throws Exception {
    // try-with-resources closes the writer (flushing it) and then the file
    // stream, even when writing fails part-way through.
    try (FileOutputStream out = new FileOutputStream("C:\\Users\\Administrator\\Desktop\\1.txt");
         UnicodeWriter writer = new UnicodeWriter(out, "UTF-8")) {
        writer.write("哈哈哈");
        writer.flush();
    }
}
用Notepad++打开文件后:
编辑器识别出了bom标记,并且没有显示bom标记
/**
 * Demo: reads a file that starts with a BOM; the BOM is skipped by default
 * and the first characters after it are printed.
 *
 * @param args unused
 * @throws Exception if the file cannot be opened or read
 */
public static void main(String[] args) throws Exception {
    // try-with-resources closes the reader and the underlying file stream.
    try (UnicodeReader re = new UnicodeReader(new FileInputStream("C:\\Users\\Administrator\\Desktop\\1.txt"))) {
        // The detected BOM bytes are available here if needed.
        re.getBom();
        char[] cf = new char[3];
        int count = re.read(cf);
        // Print only the characters actually read, so no NUL padding is shown.
        System.out.println(count > 0 ? new String(cf, 0, count) : "");
    }
}
控制台输出:
正确的读取并跳过了BOM标记