从 retail.dat 文件中读取大量的数据,其中每一行代表一条交易记录。通过对每条交易记录建立HashTree来进行存储和快速访问,进而实现Apriori算法挖掘所有的频繁项集。结果输出包括所有的频繁项集和每个频繁项集对应的支持度。源码如下:
#include<algorithm>
#include<cstdio>
#include<cstdlib>
#include<cstring>
#include<deque>
#include<fstream>
#include<iostream>
#include<sstream>
#include<string>
#include<vector>
using namespace std;
// Minimum support expressed as a fraction of all transactions; main() multiplies
// it by the number of transactions read to obtain the absolute count threshold.
const float minsup=0.02;
// Number of child slots in every HashNode. Child slots are selected with
// key % p[level], so this must be at least as large as the biggest entry of p.
const int SIZE = 13;
// Modulus used at each tree level (indexed by level) to pick a child slot.
const int p[6] = {2,3,5,7,11,13};
// One node of the hash tree: stores at most one key/value pair and owns up to
// SIZE child slots. Every member has a default initializer so that no field is
// ever left indeterminate, whichever constructor runs.
class HashNode
{
public:
    HashNode();
    HashNode(int key, int value);
    int m_key = 0;              // key stored in this node; meaningful only while occupied is true
    int m_value = 0;            // data object associated with m_key
    bool occupied = false;      // true when the node currently holds a valid key
    HashNode *child[SIZE] = {}; // child slots; a slot stays null until a node is created for it
};
// Builds an empty node: no valid key, zeroed key/value fields, all child slots null.
HashNode::HashNode()
    : m_key(0), m_value(0), occupied(false)
{
    // Null each slot by assignment: memset expects an int fill byte, not a
    // null-pointer constant, and pointer storage should not be byte-filled.
    for (int slot = 0; slot < SIZE; ++slot)
    {
        child[slot] = nullptr;
    }
}
// Builds a node pre-filled with the given key and value; all child slots are null.
// @param key    key to store in m_key
// @param value  data to store in m_value
// NOTE(review): the node is still created with occupied == false, exactly as
// before, so Find/Insert treat the stored key as not yet valid. Confirm whether
// a caller is expected to flip the flag or whether it should start as true.
HashNode::HashNode(int key, int value)
    : m_key(key), m_value(value), occupied(false)
{
    // Null each slot by assignment: memset expects an int fill byte, not a
    // null-pointer constant, and pointer storage should not be byte-filled.
    for (int slot = 0; slot < SIZE; ++slot)
    {
        child[slot] = nullptr;
    }
}
class HashTree
{
public:
HashTree();
void InsertNode(int key, int value);
bool FindNode(int key, int &value);
void DeleteNode(int key);
private:
HashNode *root;
void Insert(HashNode *hashNode, int level, int key, int value); //插入结点
bool Find(HashNode *hashNode, int level, int key, int &value); //查找
void Delete(HashNode *hashNode, int level,int key); //删除结点
};
// Creates a tree consisting of a single default-constructed root node.
HashTree::HashTree()
    : root(new HashNode())
{
}
// Public insertion entry point: stores (key, value), descending from the root.
void HashTree::InsertNode(int key, int value)
{
    const int rootLevel = 0; // the root is level 0 of the tree
    Insert(root, rootLevel, key, value);
}
// Stores (key, value) in the first unoccupied node on the key's path.
// Starting at hashNode, the walk follows child slot key mod p[level] while the
// current node is occupied, creating missing nodes on the way. An existing
// entry with the same key is NOT overwritten; the new pair lands deeper.
// @param hashNode  node to start from (must not be null)
// @param level     depth of hashNode; selects the modulus from p
// @param key       key to store
// @param value     data to store with the key
void HashTree::Insert(HashNode *hashNode, int level, int key, int value)//insert a node
{
    const int levelCount = static_cast<int>(sizeof(p) / sizeof(p[0]));
    HashNode *node = hashNode;
    while (node->occupied)
    {
        // Past the last table entry keep using the final modulus instead of
        // reading beyond the end of p (repeated or colliding keys get that deep).
        const int modulus = p[level < levelCount ? level : levelCount - 1];
        int index = key % modulus;
        if (index < 0)
        {
            index += modulus; // % keeps the sign of a negative key; fold into [0, modulus)
        }
        if (node->child[index] == nullptr)
        {
            node->child[index] = new HashNode();
        }
        node = node->child[index];
        ++level;
    }
    node->m_key = key;
    node->m_value = value;
    node->occupied = true;
}
// Public lookup entry point: searches for key from the root.
// Returns true and writes the stored data to value when the key is found;
// otherwise returns false.
bool HashTree::FindNode(int key, int &value)
{
    const int rootLevel = 0; // the root is level 0 of the tree
    const bool found = Find(root, rootLevel, key, value);
    return found;
}
// Searches the key's path for an occupied node holding key.
// Unoccupied nodes on the path are skipped rather than ending the search,
// because a deleted entry may have live entries below it.
// @param hashNode  node to start from
// @param level     depth of hashNode; selects the modulus from p
// @param key       key to look for
// @param value     receives the stored data on success; untouched otherwise
// @return true when the key was found
bool HashTree::Find(HashNode *hashNode, int level, int key, int &value)//lookup
{
    const int levelCount = static_cast<int>(sizeof(p) / sizeof(p[0]));
    for (HashNode *node = hashNode; node != nullptr; ++level)
    {
        if (node->occupied && node->m_key == key)
        {
            value = node->m_value;
            return true;
        }
        // Past the last table entry keep using the final modulus instead of
        // reading beyond the end of p.
        const int modulus = p[level < levelCount ? level : levelCount - 1];
        int index = key % modulus;
        if (index < 0)
        {
            index += modulus; // % keeps the sign of a negative key; fold into [0, modulus)
        }
        node = node->child[index];
    }
    return false;
}
// Public deletion entry point: removes key, searching from the root.
void HashTree::DeleteNode(int key)
{
    const int rootLevel = 0; // the root is level 0 of the tree
    Delete(root, rootLevel, key);
}
// Marks the first occupied node on the key's path that holds key as
// unoccupied. The node itself stays in the tree so entries stored below it
// remain reachable. Does nothing when the key is not present.
// @param hashNode  node to start from
// @param level     depth of hashNode; selects the modulus from p
// @param key       key to remove
void HashTree::Delete(HashNode *hashNode, int level, int key)//delete a node
{
    const int levelCount = static_cast<int>(sizeof(p) / sizeof(p[0]));
    for (HashNode *node = hashNode; node != nullptr; ++level)
    {
        if (node->occupied && node->m_key == key)
        {
            node->occupied = false;
            return;
        }
        // Past the last table entry keep using the final modulus instead of
        // reading beyond the end of p.
        const int modulus = p[level < levelCount ? level : levelCount - 1];
        int index = key % modulus;
        if (index < 0)
        {
            index += modulus; // % keeps the sign of a negative key; fold into [0, modulus)
        }
        node = node->child[index];
    }
}
// Apriori join step: builds the candidate (k+1)-itemsets from the frequent
// k-itemsets in Lv.
// Two itemsets Lv[i] and Lv[j] (i < j) are joined when they have the same
// length and agree on their first k-1 items; the candidate is Lv[i] followed
// by the last item of Lv[j]. For k == 1 the shared prefix is empty, so every
// pair of items is joined.
// Empty itemsets and pairs of different length are skipped instead of being
// dereferenced out of range.
// @param Lv  frequent k-itemsets, each sorted ascending
// @return    candidate (k+1)-itemsets in the order the pairs are visited
vector<vector<int>> Connect(const vector<vector<int>> &Lv)
{
    vector<vector<int>> Ck;
    for (size_t i = 0; i < Lv.size(); ++i)
    {
        const vector<int> &left = Lv[i];
        if (left.empty())
        {
            continue; // nothing to extend
        }
        const size_t prefixLen = left.size() - 1;
        for (size_t j = i + 1; j < Lv.size(); ++j)
        {
            const vector<int> &right = Lv[j];
            if (right.size() != left.size())
            {
                continue; // only itemsets of equal length can be joined
            }
            // Direct prefix comparison; no temporary intersection vector needed.
            if (!std::equal(left.begin(), left.begin() + prefixLen, right.begin()))
            {
                continue;
            }
            vector<int> candidate(left);
            candidate.push_back(right.back());
            Ck.push_back(candidate);
        }
    }
    return Ck;
}
int main()
{
HashTree Hash[90000];
HashTree HashC1;
vector<int> VC1;
ifstream file;
file.open("retail.dat");
if(!file)
cout<<"error"<<endl;
string line;
int TreeNum=0,value=0;
while(getline(file,line)) //按行读取,遇到换行符结束
{
int temp,flag;
stringstream ss(line);
while(ss>>temp) //每次读取改行的一个数字
{
Hash[TreeNum].InsertNode(temp,1);//每条交易记录对应一个HashTree
if(TreeNum==0)
{
HashC1.InsertNode(temp,1);
VC1.push_back(temp);
}
if(TreeNum>0)
{
if(HashC1.FindNode(temp,value))
{
HashC1.DeleteNode(temp);
HashC1.InsertNode(temp,value+1);
}
else
{
HashC1.InsertNode(temp,1);
VC1.push_back(temp);
}
}
}
TreeNum++;
}
float flag;
//printf("请输入最小支持度:");
//cin>>minsup;
flag=minsup*TreeNum;
vector<int> VL1;
vector<int> s1;
for(int i=0; i<VC1.size(); i++)
{
HashC1.FindNode(VC1[i],value);
if(value>=flag)
{
VL1.push_back(VC1[i]);//用向量保存L1
value=0;
}
}
sort(VL1.begin(),VL1.end());//从小到大进行排序
int um=0;
for(int n:VL1)
{
HashC1.FindNode(n,um);
s1.push_back(um);
}
vector<vector<vector<int>>> result;//三维向量,保存结果的所有频繁项集
vector<vector<int>> second; //二维临时向量
vector<int> first; //一维临时向量
int temp=0;
for(int n:VL1)
{
first.push_back(n);
second.push_back(first);
first.clear();
}
result.push_back(second);
vector<vector<int>> Ck;
vector<vector<int>> Lk;
int k=0;
int Count;
int n,length;
vector<vector<int>> Sup;
Sup.push_back(s1);
vector<int> s;
while(result[k].size()>1)
{
Ck=Connect(result[k]);//二维向量
for(vector<int> temp:Ck)
{
Count=0;
length=temp.size();
for(int num=0; num<TreeNum; num++)
{
for(n=0; n<length; n++)
{
if(!Hash[num].FindNode(temp[n],value))
break;
}
if(n>=length)
Count++;
}
if(Count>=flag)
{
Lk.push_back(temp);//保存满足条件的Lk频繁项集
s.push_back(Count);
}
}
Sup.push_back(s);
result.push_back(Lk);
k++;
s.clear();
Lk.clear();
Ck.clear();
}
//打印挖掘到的所有频繁项集
int num=0;
for(int i=0; i<result.size(); i++)
{
if(result[i].size()>=1)
{
printf("频繁%d项集:\n",i+1);
for(int j=0; j<result[i].size(); j++)
{
int z;
num++;
printf("[{");
for(z=0; z<result[i][j].size(); z++)
{
if(z==result[i][j].size()-1)
printf("%d",result[i][j][z]);
else
printf("%d,",result[i][j][z]);
}
printf("}");
printf(":%d] ",Sup[i][j]);
}
printf("\n");
}
}
printf("一共有%d个频繁项",num);
}