// P/Invoke binding for IsBadReadPtr exported by kernel32.dll (Windows only).
// Callers in this file treat a false return as "the ucb bytes starting at lp can be read".
// NOTE(review): Microsoft documents IsBadReadPtr as obsolete and unsafe to rely on
// (it probes memory and is not thread-safe) - confirm whether callers can be given an
// explicit length instead. The native parameter is pointer-sized (UINT_PTR); it is
// declared as uint here, which is fine for the small sizes passed in this file.
[DllImport("kernel32.dll", CallingConvention = CallingConvention.StdCall)]
private extern static bool IsBadReadPtr(void* lp, uint ucb);
// Single-bit masks for the five high-order bits of a byte, most significant bit first.
// The trailing comment on each line is the mask written as an 8-digit binary number.
private const byte kFirstBitMask = 0x80; // 10000000
private const byte kSecondBitMask = 0x40; // 01000000
private const byte kThirdBitMask = 0x20; // 00100000
private const byte kFourthBitMask = 0x10; // 00010000
private const byte kFifthBitMask = 0x08; // 00001000
/// <summary>
/// Returns the number of bytes occupied by the UTF-8 sequence that starts with
/// the given byte.
/// </summary>
/// <param name="character">The first byte of a UTF-8 encoded sequence.</param>
/// <returns>
/// 1 for an ASCII byte (0xxxxxxx), 2 for 110xxxxx, 3 for 1110xxxx and 4 for 11110xxx.
/// Bytes that cannot start a sequence (continuation bytes 10xxxxxx and the values
/// 0xF8-0xFF) also yield 1, so a caller that steps through a buffer advances one
/// byte and can resynchronise on the next lead byte.
/// </returns>
public static int GetUtf8Alignment(byte character)
{
    if (character < 0x80)
    {
        return 1; // 0xxxxxxx: single-byte (ASCII) code point
    }
    if ((character & 0xE0) == 0xC0)
    {
        return 2; // 110xxxxx: lead byte of a two-byte sequence
    }
    if ((character & 0xF0) == 0xE0)
    {
        return 3; // 1110xxxx: lead byte of a three-byte sequence
    }
    if ((character & 0xF8) == 0xF0)
    {
        return 4; // 11110xxx: lead byte of a four-byte sequence
    }

    // Not a lead byte at all. Previously these were misreported as 2, 3 or 4
    // because the second-highest bit was never examined.
    return 1;
}
/// <summary>
/// Measures a NUL-terminated UTF-8 string in bytes, not counting the terminator.
/// </summary>
/// <param name="s">Pointer to the first byte of the string; may be null.</param>
/// <returns>
/// The number of bytes before the first 0x00 byte. Returns 0 when <paramref name="s"/>
/// is null, when unreadable memory is reached before a terminator is found, or when
/// the length does not fit in a non-negative int.
/// </returns>
public static int GetUtf8BufferCount(byte* s)
{
    if (s == null)
    {
        return 0;
    }

    // Every byte of a multi-byte UTF-8 sequence has its high bit set, so the value
    // 0x00 can only ever be the terminator. Scanning byte by byte therefore gives the
    // same answer as decoding whole sequences, while (a) never dereferencing a byte
    // that has not been probed first and (b) never jumping over the terminator when
    // the text ends in a truncated multi-byte sequence.
    byte* cursor = s;
    while (!IsBadReadPtr(cursor, 1))
    {
        if (*cursor == 0)
        {
            int len = unchecked((int)(cursor - s));
            return len < 0 ? 0 : len;
        }
        cursor++;
    }

    // Ran into unreadable memory before finding a terminator.
    return 0;
}
/// <summary>
/// Array convenience overload: pins the array and delegates to the pointer-based
/// IsUTF8Buffer check over all of its bytes.
/// </summary>
/// <param name="buffer">Bytes to inspect; may be null.</param>
/// <returns>
/// false when <paramref name="buffer"/> is null; otherwise whatever the pointer
/// overload reports for the whole array.
/// </returns>
public static bool IsUTF8Buffer(byte[] buffer)
{
    if (buffer != null)
    {
        // Pin the array so the garbage collector cannot move it while it is scanned.
        fixed (byte* first = buffer)
        {
            return IsUTF8Buffer(first, buffer.Length);
        }
    }
    return false;
}
/// <summary>
/// Checks whether a block of bytes is structurally well-formed UTF-8: every lead
/// byte must be followed by the number of continuation bytes (10xxxxxx) that it
/// announces, and the block must not end in the middle of a sequence.
/// </summary>
/// <param name="buffer">Pointer to the first byte to inspect.</param>
/// <param name="count">Number of bytes to inspect.</param>
/// <returns>
/// false when <paramref name="buffer"/> is null or <paramref name="count"/> is not
/// positive, or when a malformed or truncated sequence is found; true otherwise
/// (pure ASCII input is therefore reported as true).
/// </returns>
/// <remarks>
/// This is a structural check only. Lead bytes announcing up to six bytes are
/// accepted, and decoded values are not inspected, so overlong forms and
/// out-of-range code points are not rejected.
/// </remarks>
public static bool IsUTF8Buffer(byte* buffer, int count)
{
    if (buffer == null || count <= 0)
    {
        return false;
    }

    int pending = 0; // continuation bytes still owed by the current sequence
    int index = 0;
    while (index < count)
    {
        byte current = buffer[index++];

        if (pending > 0)
        {
            // Inside a sequence: only 10xxxxxx is acceptable.
            if ((current & 0xC0) != 0x80)
            {
                return false;
            }
            pending--;
            continue;
        }

        if (current < 0x80)
        {
            continue; // plain ASCII byte
        }

        // Count the run of 1 bits at the top of the lead byte; that run length is
        // the total length of the sequence in bytes.
        int leadingOnes = 0;
        for (int mask = 0x80; (current & mask) != 0; mask >>= 1)
        {
            leadingOnes++;
        }

        // A run of one is a stray continuation byte; more than six is not a lead byte.
        if (leadingOnes < 2 || leadingOnes > 6)
        {
            return false;
        }
        pending = leadingOnes - 1;
    }

    // The block must not stop part-way through a sequence.
    return pending == 0;
}
C#:获取 UTF-8 字符串的字节长度,以及判断二进制块是否为 UTF-8 编码的算法实现
最新推荐文章于 2024-07-09 11:49:28 发布