源码：处理文件格式和字符集的相关代码（3-1）-CSDN博客

	// Overview listing of the Function helper class: global output options, the
	// FileResult record describing one inspected file, and the signatures of the
	// detection / conversion entry points. The methods are listed here as
	// signatures only (no bodies).
	internal class Function
	{
		public static int output_format = 0;// 0 = unchanged, 1 = DOS, 2 = Unix, 3 = Mac
		public static int output_bom = 0;// 0 = unchanged, 1 = add, 2 = remove
		public static Encoding output_encoding = null;// output encoding
		// NOTE(review): presumably the number of files processed so far; it is not updated in this listing.
		public static long file_procceed_count;
		// Detection result for a single file: text/binary flag, line-break counts,
		// encoding flags, BOM information and a processing status string.
		public class FileResult
		{
			public bool isText = true;// is a text file

			public int count_CR = 0;
			public int count_CRLF = 0;
			public int count_LF = 0;

			public bool isASCII = false;// pure ASCII text (1-127)
			public bool isLocal = false;// text in the local encoding (contains no 0)
			public bool isUTF8 = false;// UTF-8 (contains no 0, may carry a BOM)
			public bool isUTF16 = false;// UTF-16 (may carry a BOM)
			public bool isUTF32 = false;// UTF-32 (may carry a BOM)

			public bool withBOM = false;// whether a BOM is present (every Unicode format may carry one)
			public bool isBigEndian = false;// Windows defaults to little-endian
			public string BOM = "";

			public string status = "";// processing status

			// Returns "Binary" for non-text files; otherwise "TEXT" followed by one
			// suffix per line-break style whose counter is non-zero
			// (_UNIX for LF, _DOS for CRLF, _MAC for CR), so mixed files get several suffixes.
			public string ShowType()
			{
				StringBuilder sb = new StringBuilder();
				if (isText)
				{
					sb.Append("TEXT");
					if (0 != count_LF) sb.Append("_UNIX");
					if (0 != count_CRLF) sb.Append("_DOS");
					if (0 != count_CR) sb.Append("_MAC");
				}
				else
				{
					sb.Append("Binary");
				}
				return sb.ToString();
			}
			// Returns the BOM string when withBOM is set, otherwise an empty string.
			public string ShowBOM()
			{
				return withBOM ? BOM : "";
			}
			// Returns an empty string for non-text files; otherwise the concatenation
			// of the names of every encoding flag that is set (the names are appended
			// back to back with no separator).
			public string ShowEncode()
			{
				if (!isText) return "";
				else
				{
					StringBuilder sb = new StringBuilder();

					if (isASCII) sb.Append("ASCII");
					if (isLocal) sb.Append("LOCAL");
					if (isUTF8) sb.Append("UTF-8");
					if (isUTF16) sb.Append("UTF16-" + (isBigEndian ? "BE" : "LE"));
					if (isUTF32) sb.Append("UTF32-" + (isBigEndian ? "BE" : "LE"));
					return sb.ToString();
				}
			}
		}
		// Checks whether the data is a text file; returns whether it is a text file
		public static bool checkUTF(byte[] data, int charWidth, bool bBE, ref int count_CR, ref int count_CRLF, ref int count_LF);

		// Checks whether the data is UTF-8; the BOM is not included, only conformance to the UTF-8 encoding rules is checked
		public static bool checkUTF8(byte[] data, int start);

        // Tests whether a character is the given one (used to detect CR / LF in UTF-16 and UTF-32)
		public static bool IsTheChar(byte[] data, int charWidth, int thePos, int start, char theChar);

        // Processes one file
		public static FileResult ProcessFile(string file, bool checkonly);

        // Builds the file preview
		public static string makeFileContent(string file, bool bHex, Encoding encoding, ref string state, ref Encoding realEncoding);
	}

前面是三个输出控制参数和一个文件数统计，然后是FileResult结构，记录文件信息（是否是文本文件、是哪一种文本文件、有没有BOM、字节序、回车换行统计）。

再往后有三个辅助函数，最后两个是入口函数，ProcessFile检测并转换文件，makeFileContent显示文件预览。

二、判断是否是utf-16、utf-32的回车换行

主要用来检查某个字符是否是回车、换行或0。对于utf-16和utf-32，回车换行的字节表示随字节序不同而不同，比如“00 0d”（大端）和“0d 00”（小端）。

		/// <summary>
		/// Tests whether the fixed-width code unit that begins at <paramref name="start"/>
		/// encodes exactly <paramref name="theChar"/>.
		/// </summary>
		/// <param name="data">Raw bytes being inspected.</param>
		/// <param name="charWidth">Number of bytes per code unit.</param>
		/// <param name="thePos">Offset, inside the code unit, of the byte expected to carry the character value.</param>
		/// <param name="start">Index in <paramref name="data"/> of the first byte of the code unit.</param>
		/// <param name="theChar">Character to compare against.</param>
		/// <returns>
		/// True when the bytes of the code unit add up to <paramref name="theChar"/> and the byte
		/// at <paramref name="thePos"/> equals it; with <paramref name="thePos"/> inside the code
		/// unit this means every other byte is zero.
		/// </returns>
		public static bool IsTheChar(byte[] data, int charWidth, int thePos, int start, char theChar)
		{
			// Bytes are non-negative, so a total equal to the value of one byte
			// forces all remaining bytes of the code unit to be zero.
			int total = 0;
			int offset = 0;
			while (offset < charWidth)
			{
				total += data[start + offset];
				offset++;
			}

			return total == theChar && data[start + thePos] == theChar;
		}

这个函数的参数：

data 文件数据
charWidth 字符宽度，1、2、4，你懂吧
thePos 要判断的字节在字符中的位置，字节序挺马拐的，反正逻辑是isBigEndian ? charWidth - 1 : 0
start 数据起始位置
theChar 要判断的字符

这个函数的逻辑就是先把data的start开始的charWidth个字节加起来，如果和等于要判断的字符并且thePos位置的字节也等于要判断的字符就返回true。

其实就是判断其余几个字节必须是0。

三、检查是否是UTF16或UTF-32

判断逻辑为不能包含0，同时会统计回车换行数量。

		/// <summary>
		/// Scans <paramref name="data"/> as a sequence of fixed-width code units, rejecting it
		/// when a NUL character is found and counting the line breaks otherwise.
		/// </summary>
		/// <param name="data">File bytes to scan (per the article, without the BOM).</param>
		/// <param name="charWidth">Number of bytes per code unit.</param>
		/// <param name="bBE">True for big-endian byte order, false for little-endian.</param>
		/// <param name="count_CR">Receives the number of lone CR characters.</param>
		/// <param name="count_CRLF">Receives the number of CR LF pairs.</param>
		/// <param name="count_LF">Receives the number of lone LF characters.</param>
		/// <returns>
		/// False when the length is not a multiple of <paramref name="charWidth"/> (the counters
		/// are left untouched in that case) or when a NUL character is met; true otherwise.
		/// </returns>
		public static bool checkUTF(byte[] data, int charWidth, bool bBE, ref int count_CR, ref int count_CRLF, ref int count_LF)
		{
			bool wholeUnitsOnly = (data.Length % charWidth) == 0;
			if (!wholeUnitsOnly)
			{
				return false;
			}

			count_CR = 0;
			count_CRLF = 0;
			count_LF = 0;

			// The byte carrying the character value is the last one of the unit
			// for big-endian data and the first one for little-endian data.
			int valueOffset = 0;
			if (bBE)
			{
				valueOffset = charWidth - 1;
			}

			int index = 0;
			while (index < data.Length)
			{
				if (IsTheChar(data, charWidth, valueOffset, index, '\0'))
				{
					return false;
				}

				if (IsTheChar(data, charWidth, valueOffset, index, '\r'))
				{
					int next = index + charWidth;
					bool followedByLF = next < data.Length && IsTheChar(data, charWidth, valueOffset, next, '\n');
					if (followedByLF)
					{
						++count_CRLF;
						index = next; // consume the LF together with the CR
					}
					else
					{
						++count_CR;
					}
				}
				else if (IsTheChar(data, charWidth, valueOffset, index, '\n'))
				{
					++count_LF;
				}

				index += charWidth;
			}

			return true;
		}

参数：

data 文件数据，不包含BOM
charWidth 字符宽度，2、4，也能用于utf-8，但是utf-8有专门的判断方法
bBE 是否是BigEndian
count_ 回车换行统计

四、检测UTF-8

UTF-8是有编码规则的，下面的函数判断是否所有字节都符合编码规则。

		/// <summary>
		/// Checks whether every byte from <paramref name="start"/> to the end of
		/// <paramref name="data"/> forms well-formed UTF-8 as defined by RFC 3629,
		/// and contains no NUL byte.
		/// </summary>
		/// <param name="data">Bytes to validate.</param>
		/// <param name="start">Index of the first byte to validate (used to skip a BOM).</param>
		/// <returns>
		/// True when the range is empty or consists only of non-NUL ASCII bytes and
		/// well-formed 2- to 4-byte sequences; false otherwise.
		/// </returns>
		/// <remarks>
		/// Rejected as malformed: stray continuation bytes, truncated sequences,
		/// overlong encodings (lead bytes 0xC0/0xC1, E0 80..9F, F0 80..8F),
		/// UTF-16 surrogates (ED A0..BF), code points above U+10FFFF (F4 90.., 0xF5..0xF7)
		/// and the obsolete 5-, 6- and 7-byte forms (lead bytes 0xF8..0xFE) as well as 0xFF.
		/// </remarks>
		public static bool checkUTF8(byte[] data, int start)
		{
			int i = start;
			while (i < data.Length)
			{
				byte lead = data[i];
				if (0 == lead) return false; // a NUL byte marks the data as non-text
				if (lead <= 0x7F)
				{
					++i;
					continue;
				}

				// Length of the sequence and the allowed range of its second byte.
				// The second byte range is narrowed for the lead bytes that could
				// otherwise start an overlong form, a surrogate or a value > U+10FFFF.
				int charBytes;
				byte secondMin = 0x80;
				byte secondMax = 0xBF;
				if (lead >= 0xC2 && lead <= 0xDF)
				{
					charBytes = 2;
				}
				else if (lead >= 0xE0 && lead <= 0xEF)
				{
					charBytes = 3;
					if (lead == 0xE0) secondMin = 0xA0;      // below this is an overlong 3-byte form
					else if (lead == 0xED) secondMax = 0x9F; // above this is a UTF-16 surrogate
				}
				else if (lead >= 0xF0 && lead <= 0xF4)
				{
					charBytes = 4;
					if (lead == 0xF0) secondMin = 0x90;      // below this is an overlong 4-byte form
					else if (lead == 0xF4) secondMax = 0x8F; // above this exceeds U+10FFFF
				}
				else
				{
					// 0x80..0xBF (stray continuation), 0xC0/0xC1 (overlong), 0xF5..0xFF (never valid)
					return false;
				}

				if (i + charBytes > data.Length) return false; // sequence is cut off by the end of the data

				byte second = data[i + 1];
				if (second < secondMin || second > secondMax) return false;

				for (int j = 2; j < charBytes; ++j)
				{
					if ((data[i + j] & 0b1100_0000) != 0b1000_0000) return false;
				}
				i += charBytes;
			}
			return true;
		}

参数start用于跳过BOM（UTF-8的BOM是三个字节，UNIX上默认是没有的）。