索引查找表测试代码 zb++ 2007-6-9 知识共享 责任自负
索引查找表CMatchTable2
测试用类CTemp,包含两个long,均作为关键字
100万数据量(0,0)-(999999,999999)
索引查找表所需的内存:
节点集合数(nodes) 8003930 (每个是一个vector,大小为3或4个指针的大小,测试机为32字节)
元素数(CNode) 9003928 (2个int 1个char 2个bool ,一般为12字节)
而原始数据为16字节(64位机),索引表的大小大约为原始数组的23倍
100万数据逐个查找一遍
索引查找表 2.28 秒
vector lower_bound 0.34 秒
代码中thelog可替换为cout,endi、ende替换为endl
CMatchTable2实现为存储变长的字节串,有两组Insert和isMatch,一组对应字节串,一组专门对整数作了接口简化,CMatchTable2构造函数可以指定使用更短的整数
key_size仅用于整数到字节串的转换,因此两组接口可以交叉调用,但实际使用中这样做并不合理
clear未完全实现
vector<CNode >统一放在nodes中(用一个索引位置来关联)而不是直接放在CNode里面,这是出于特殊目的的考虑;一般情况下把vsubindex换为vector<CNode > sub,并去掉通过vsubindex获得sub的代码即可
以下为代码:
// Byte-wise trie that maps variable-length byte strings to int payloads.
//
// Every node's child vector lives in one shared table (`nodes`) and is
// referenced by index rather than being stored inside the node. Slot 0 of
// that table is a permanently empty vector meaning "no children".
//
// Two interface families exist: one taking raw byte strings, and a shortcut
// taking a `long` whose low-order `key_size` bytes form the key.
class CMatchTable2
{
public:
    // A single trie node. Nodes never own their children directly; they only
    // hold the index of their child vector inside the shared table.
    class CNode
    {
    private:
        bool isUsed;            // true once some key passes through or ends at this node
        bool isIncludeFinish;   // true when a key ends exactly at this node
        unsigned char base;     // byte value that corresponds to slot 0 of the child vector
        unsigned int vsubindex; // index of the child vector in the shared table; 0 means "none"
        int exdata;             // payload; meaningful only when isIncludeFinish is true

        // Walks/creates the path for `key` (iteratively, so the stack depth
        // does not depend on the key length) and stores `data` at its end.
        // Precondition: size >= 0 and `key` has at least `size` readable bytes.
        void _Insert(char const * key,long size,int data,vector<vector<CNode > > & nodes)
        {
            typedef vector<CNode >::size_type slot_t;
            CNode * cur=this;
            for(;;++key,--size)
            {
                cur->isUsed=true;
                if(size<=0)
                {
                    cur->isIncludeFinish=true;
                    cur->exdata=data;
                    return;
                }
                if(0==cur->vsubindex)
                {
                    // First child of this node: claim a fresh vector in the shared table.
                    nodes.resize(nodes.size()+1);
                    cur->vsubindex=(unsigned int)(nodes.size()-1);
                }
                vector<CNode > & sub=nodes[cur->vsubindex];
                long const c=(unsigned char)key[0];
                if(sub.empty())
                {
                    sub.reserve(1);
                    sub.resize(1);
                    cur->base=(unsigned char)c;
                }
                else if(c<cur->base)
                {
                    // Grow downwards: prepend empty slots so that slot 0 becomes byte c.
                    slot_t const extra=(slot_t)(cur->base-c);
                    sub.reserve(sub.size()+extra);
                    sub.insert(sub.begin(),extra,CNode());
                    cur->base=(unsigned char)c;
                }
                else if((slot_t)(c-cur->base)>=sub.size())
                {
                    // Grow upwards: append empty slots up to and including byte c.
                    // Reserving the exact final size keeps the capacity tight.
                    slot_t const wanted=(slot_t)(c-cur->base)+1;
                    sub.reserve(wanted);
                    sub.insert(sub.end(),wanted-sub.size(),CNode());
                }
                cur=&sub[(slot_t)(c-cur->base)];
            }
        }

        // Follows `key` from this node. With isFull the whole key must end on a
        // finished node; otherwise the first finished node met on the way wins.
        // `matchlen` is incremented for every byte that was followed.
        // Precondition: size >= 0 and `key` has at least `size` readable bytes.
        bool _isMatch(char const * key,int size,bool isFull,int & matchlen,int & data,vector<vector<CNode > > const & nodes)const
        {
            typedef vector<CNode >::size_type slot_t;
            CNode const * cur=this;
            for(;;++key,--size)
            {
                if(size<=0)
                {
                    data=cur->exdata;
                    return cur->isIncludeFinish;
                }
                if(!isFull && cur->isIncludeFinish)
                {
                    data=cur->exdata;
                    return true;
                }
                vector<CNode > const & sub=nodes[cur->vsubindex];
                long const c=(unsigned char)key[0];
                if(c<cur->base)return false;
                slot_t const slot=(slot_t)(c-cur->base);
                if(slot>=sub.size() || !sub[slot].isUsed)return false;
                ++matchlen;
                cur=&sub[slot];
            }
        }

    public:
        CNode(){clear();}

        // True when a key ends exactly at this node.
        bool isItemFinish()const{return isUsed && isIncludeFinish;}

        // Resets this node only; child vectors in the shared table are untouched.
        void clear()
        {
            vsubindex=0;
            isUsed=false;
            isIncludeFinish=false;
            exdata=0;
            base=0;
        }

        // Inserts `key` (size bytes) with payload `data` below this node.
        // A negative size, or a null key with size > 0, is ignored.
        // NOTE: when the outer table copies its elements on reallocation
        // (pre-C++11), the caller must have reserved room for up to `size`
        // new child vectors beforehand, as CMatchTable2::_insert does.
        void Insert(char const * key,int size,int data,vector<vector<CNode > > & nodes)
        {
            if(size<0 || (NULL==key && size>0))return;
            _Insert(key,size,data,nodes);
        }

        // Dumps this node's fields on one line (base is printed as a number).
        void Print()const
        {
            thelog<<isUsed<<" "<<isIncludeFinish<<" "<<exdata<<" "<<(int)base<<" "<<vsubindex<<endi;
        }

        // Recursively dumps the subtree, indenting one space per level.
        void Print(vector<vector<CNode > > const & nodes,int level=0)const
        {
            string head;
            if(level>0)head.assign(level,' ');
            if(!isUsed)
            {
                thelog<<head<<"no used"<<endi;
                return;
            }
            if(isIncludeFinish)thelog<<head<<" : "<<exdata<<endi;
            vector<CNode > const & sub=nodes[vsubindex];
            for(vector<CNode >::size_type i=0;i<sub.size();++i)
            {
                if(!sub[i].isUsed)continue;
                thelog<<head<<base+(int)i<<endi;
                sub[i].Print(nodes,level+1);
            }
        }

        // Looks `key` up below this node. Both out-pointers are optional and are
        // zeroed before the search. Returns false for a negative size or for a
        // null key with size > 0.
        bool isMatch(char const * key,int size,bool isFull,int * pmatchlen,int * pdata,vector<vector<CNode > > const & nodes)const
        {
            int unusedLen=0;
            int unusedData=0;
            int & matchlen=(NULL!=pmatchlen)?*pmatchlen:unusedLen;
            int & data=(NULL!=pdata)?*pdata:unusedData;
            matchlen=0;
            data=0;
            if(size<0 || (NULL==key && size>0))return false;
            return _isMatch(key,size,isFull,matchlen,data,nodes);
        }
    };

    CNode root;
    vector<vector<CNode > > nodes; // shared child-vector table; slot 0 stays empty and means "no children"
    int key_size;                  // number of low-order bytes of a long used by the integer interface

    // Reserves room for the child vectors this key may create, then inserts it.
    // Reserving up front keeps node addresses stable during the insertion.
    // Capacity grows geometrically so that repeated insertions do not
    // reallocate the whole table every time.
    void _insert(char const * key,int size,int data)
    {
        typedef vector<vector<CNode > >::size_type count_t;
        if(size<0 || (NULL==key && size>0))return;
        count_t const needed=nodes.size()+(count_t)size;
        if(nodes.capacity()<needed)
        {
            count_t target=nodes.capacity()*2;
            if(target<needed)target=needed;
            nodes.reserve(target);
        }
        root.Insert(key,size,data,nodes);
    }

public:
    // minnodes: initial capacity of the child-vector table (at least 1).
    // keysize: how many low-order bytes of a long the integer interface uses.
    CMatchTable2(int minnodes=1,int keysize=sizeof(long)):key_size(keysize)
    {
        if(minnodes<1)minnodes=1;
        nodes.reserve(minnodes);
        nodes.resize(1);
    }

    // Removes every key: resets the root and drops all child vectors except
    // the empty sentinel in slot 0.
    void clear()
    {
        root.clear();
        nodes.clear();
        nodes.resize(1);
    }

    // Inserts a byte-string key of `size` bytes with payload `data`.
    void Insert(void const * key,int size,int data)
    {
        _insert((char const *)key,size,data);
    }

    // Logs table statistics: vector count, node count, and finished-key count.
    void Print()const
    {
        thelog<<"节点数 "<<nodes.size()<<" 分配数 "<<nodes.capacity()<<endi;
        long count=0;
        long count2=0;
        long count3=root.isItemFinish()?1:0; // a key may end at the root (the empty key)
        for(vector<vector<CNode > >::size_type i=0;i<nodes.size();++i)
        {
            vector<CNode > const & sub=nodes[i];
            count+=sub.size();
            count2+=sub.capacity();
            for(vector<CNode >::size_type j=0;j<sub.size();++j)
            {
                if(sub[j].isItemFinish())++count3;
            }
        }
        thelog<<"元素数 "<<count<<" 分配数 "<<count2<<endi;
        thelog<<"项目数 "<<count3<<endi;
    }

    // Byte-string lookup; see CNode::isMatch for the meaning of the arguments.
    bool isMatch(void const * key,int size,bool isFull,int * pmatchlen=NULL,int * pdata=NULL)const
    {
        return root.isMatch((char const *)key,size,isFull,pmatchlen,pdata,nodes);
    }

    // Integer shortcut: inserts the low-order key_size bytes of `key`.
    void Insert(long key,int data)
    {
        int const len=_intKeyLen();
        _insert(_intKeyBytes(key,len),len,data);
    }

    // Integer shortcut: full match on the low-order key_size bytes of `key`.
    bool isMatch(long key,int * pdata=NULL)const
    {
        int const len=_intKeyLen();
        return root.isMatch(_intKeyBytes(key,len),len,true,NULL,pdata,nodes);
    }

private:
    // key_size limited to what a long can provide; out-of-range values fall
    // back to sizeof(long) so the integer interface never reads outside `key`.
    int _intKeyLen()const
    {
        if(key_size<0 || key_size>(int)sizeof(long))return (int)sizeof(long);
        return key_size;
    }

    // Address of the `len` low-order bytes of `key`, chosen by probing the
    // host byte order at run time.
    static char const * _intKeyBytes(long const & key,int len)
    {
        static int const probe=1;
        char const * bytes=(char const *)&key;
        if(1==((char const *)&probe)[0])return bytes;
        return bytes+sizeof(long)-len;
    }
};
// Test record made of two longs; both fields together form the key.
class CTemp
{
public:
    long a;
    long b;

    // Lexicographic order: compare a first, fall back to b on a tie.
    bool operator < (CTemp const & tmp)const
    {
        if(a!=tmp.a)return a<tmp.a;
        return b<tmp.b;
    }
};
void test()
{
CMatchTable2 matchtable(1000*10000);
CTemp tmp;
vector<CTemp > vdatas;
long i;
long maxi=100*10000;
thelog<<"start..."<<endi;
for(i=0;i<maxi;++i)
{
tmp.a=i;
tmp.b=i;
matchtable.Insert(&tmp,sizeof(CTemp),i);
cout<<"/r"<<i;
}
thelog<<"/r"<<"finish"<<endi;
thelog<<"start..."<<endi;
vdatas.reserve(maxi);
for(i=0;i<maxi;++i)
{
tmp.a=i;
tmp.b=i;
vdatas.push_back(tmp);
}
thelog<<"finish"<<endi;
matchtable.Print();
int data;
if(matchtable.isMatch(255,&data))cout<<__LINE__<<" "<<data<<endl;
if(matchtable.isMatch(256,&data))cout<<__LINE__<<" "<<data<<endl;
if(matchtable.isMatch(257,&data))cout<<__LINE__<<" "<<data<<endl;
thelog<<endi;
for(i=0;i<maxi;++i)
{
tmp.a=i;
tmp.b=i;
if(matchtable.isMatch(&tmp,sizeof(CTemp),true,NULL,&data))
{
}
else
{
thelog<<"not found "<<i<<endi;
}
}
thelog<<endi;
thelog<<endi;
for(i=0;i<maxi;++i)
{
tmp.a=i;
tmp.b=i;
if(lower_bound(vdatas.begin(),vdatas.end(),tmp)!=vdatas.end())
{
}
else
{
thelog<<"not found "<<i<<endi;
}
}
thelog<<endi;
cout<<sizeof(CMatchTable2)<<" "<<sizeof(CMatchTable2::CNode)<<" "<<sizeof(vector<CMatchTable2::CNode >)<<endl;
}