一种大数据外部排序(内存无法加载所有排序元素)、去除重复元素、快速找到随机被删除元素的BitMap小算法,核心思想即通过将一个数作为下标(index)来索引一个bit表示一个数是否存在,排序时的时间复杂度为O(N),需要的额外空间的复杂度O(N/8),支持整个int范围(正负数都支持)的算法示例如下:
char BitMask[] = {0x80 , 0x40 , 0x20 , 0x10 , 0x8 , 0x4 , 0x2 , 0x1};
int WriteNumberBitToByte(char *ByteArra , unsigned int ByteArraSize , int Number)
{
//printf("%d,%d,%d\n",(ByteArraSize * 4) - 1,-(ByteArraSize*4),Number);
if (((int)(ByteArraSize * 4) - 1) < Number || Number<-(int)(ByteArraSize*4) )
{
return 0; //failed,number out of bytearra.
}
int BaseArraBitPos = ByteArraSize *4; //ByteArraSize *8 /2
BaseArraBitPos+=Number;
printf("BaseArraBitPos=%d,Number=%d\n",BaseArraBitPos,Number);
ByteArra[BaseArraBitPos/8] |= Mask[BaseArraBitPos%8];
return 1; //success
}
int IsNumberBitInByte(char *ByteArra , unsigned int ByteArraSize , int Number)
{
if (((int)(ByteArraSize * 4) - 1) < Number || Number<-(int)(ByteArraSize*4) )
{
return 0; //failed,number out of bytearra.
}
int BaseArraBitPos = ByteArraSize *4; //ByteArraSize *8 /2
BaseArraBitPos+=Number;
if (ByteArra[BaseArraBitPos/8] & BitMask[BaseArraBitPos%8]) {
return 1;
}
return 0; //number not found.
}
void PrintOrderedBitMap(char *BitMap,unsigned int BitMapCount)
{
int MinmumNumber = -(BitMapCount*8/2);
int MaximumValue = (BitMapCount*8/2)-1;
for (int i = MinmumNumber; i <= MaximumValue; ++i)
{
if (IsNumberBitInByte(BitMap,BitMapCount,i))
{
printf("%d,", i);
}
}
printf("\n");
}
int main()
{
int Arra[] = {3,-4,2,0,-1,-8,7,-12,10};
int MaximumValue =Arra[0],MinmumValue=Arra[0];
for (int i = 0; i < sizeof(Arra)/sizeof(Arra[0]); ++i)
{
if(MaximumValue<Arra[i]) {
MaximumValue = Arra[i];
}
if (MinmumValue>Arra[i])
{
MinmumValue = Arra[i];
}
}
MaximumValue=MaximumValue<0?-MaximumValue:MaximumValue;
MinmumValue=MinmumValue<0?-MinmumValue:MinmumValue;
MaximumValue=MaximumValue>MinmumValue?MaximumValue:MinmumValue;
printf("MaximumValue=%d\n",MaximumValue);
//unsigned int BitMapCount = (MaximumValue*2+7)/8;
unsigned int BitMapCount = (MaximumValue+3)/4;
BitMapCount = BitMapCount>0?BitMapCount:1;
char *BitMap = (char*)malloc(BitMapCount);
for (int i = 0; i < sizeof(Arra)/sizeof(Arra[0]); ++i)
{
WriteNumberBitToByte(BitMap,BitMapCount,Arra[i]);
}
PrintOrderedBitMap(BitMap,BitMapCount);
}
仅支持unsigned int范围的算法示例如下:
char BitMask[] = {0x80 , 0x40 , 0x20 , 0x10 , 0x8 , 0x4 , 0x2 , 0x1};
int WriteNumberBitToByte(char *ByteArra , unsigned int ByteArraSize , unsigned int Number)
{
if (((ByteArraSize * 8) - 1) < Number )
{
return 0; //failed,number out of bytearra.
}
int BytePos = Number / 8;
int BitPos = Number % 8;
ByteArra[BytePos] |= BitMask[BitPos];
return 1; //success
}
int IsNumberBitInByte(char *ByteArra , unsigned int ByteArraSize , unsigned int Number)
{
if ((ByteArraSize * 8 - 1) < Number )
{
return 0; //failed,number out of bytearra.
}
int BytePos = Number / 8;
int BitPos = Number % 8;
if (ByteArra[BytePos] & BitMask[BitPos]) {
return 1;
}
return 0; //number not found.
}
上面的算法都是用一个bit来表示一个数,即只有2种可能,要么有,要么无,可以扩展到一个字节表示一个数,这样就可以统计出现255次范围内的重复元素,原理以此类推。
另外用bit来表示一个int数,节约了31倍的内存空间,即int(4*8),bit(8/1),所以数据量越来使用这种方式的优势越明显,前提是场景适用这种方式。