Apriori算法之HashTree实现 C++

theMuseCatcher

已于 2024-04-08 09:33:06 修改

阅读量1k

点赞数 5

文章标签： HashTree Apriori C++

于 2018-12-31 21:20:55 首次发布

本文链接：https://blog.csdn.net/Dandrose/article/details/85081121

版权

本程序从 retail.dat 文件中读取交易数据，其中每一行代表一条交易记录，行内以空白分隔的整数为该交易包含的项。程序为每条交易记录建立一棵 HashTree 以便存储和快速查找，并在此基础上实现 Apriori 算法，挖掘所有满足最小支持度（代码中 minsup = 0.02，即 2%）的频繁项集。输出结果包括所有的频繁项集以及每个频繁项集对应的支持度计数（包含该项集的交易条数）。源码如下：

#include<algorithm>
#include<cstddef>
#include<cstdio>
#include<cstdlib>
#include<cstring>
#include<fstream>
#include<iostream>
#include<sstream>
#include<string>
#include<vector>
using namespace std;
// Minimum support expressed as a fraction of the number of transactions
// (0.02 == 2%).
const float minsup = 0.02f;
// Number of child slots in every node. It has to be at least as large as the
// biggest modulus in p so that every residue is a valid slot index.
const int SIZE = 13;
// Modulus applied to the key at each tree level: a node at depth d routes a
// key to child slot (key mod p[d]).
const int p[6] = {2, 3, 5, 7, 11, 13};

// One node of the hash tree. A node stores at most one (key, value) pair and
// owns up to SIZE children.
class HashNode
{
public:
    HashNode();
    HashNode(int key, int value);
    int m_key;    // key stored in this node; meaningful only while occupied
    int m_value;      // value associated with m_key
    bool occupied;      // true when m_key/m_value hold a live entry
    HashNode *child[SIZE];     // child slots, nullptr when absent
};

// Creates an empty node: no entry, no children.
HashNode::HashNode()
    : m_key(0), m_value(0), occupied(false), child()
{
}

// Creates a node pre-loaded with key/value. Note that the node is still
// marked unoccupied, so the tree treats it as empty until an insertion
// claims it.
HashNode::HashNode(int key, int value)
    : m_key(key), m_value(value), occupied(false), child()
{
}

// Hash tree mapping int keys to int values.
//
// A key is stored in the first free node on its path, where the path is
// defined by taking the key modulo p[level] at every level. Deleting a key
// only clears the node's occupied flag (the node stays in place so that keys
// stored below it remain reachable).
//
// The tree owns its nodes and frees them on destruction; it is therefore
// movable but not copyable.
class HashTree
{
public:
    HashTree();
    ~HashTree();
    HashTree(const HashTree &) = delete;
    HashTree &operator=(const HashTree &) = delete;
    HashTree(HashTree &&other) noexcept;
    HashTree &operator=(HashTree &&other) noexcept;

    // Stores value under key; an existing entry for key is overwritten.
    void InsertNode(int key, int value);
    // Looks key up. On success writes the stored value to `value` and returns
    // true; otherwise returns false and leaves `value` untouched.
    bool FindNode(int key, int &value) const;
    // Removes key if present; does nothing otherwise.
    void DeleteNode(int key);
private:
    HashNode *root;
    // Child slot that `key` is routed to from a node at depth `level`.
    static int ChildIndex(int level, int key);
    // Frees hashNode and everything below it (nullptr is accepted).
    static void Destroy(HashNode *hashNode);
    void Insert(HashNode *hashNode, int level, int key, int value);
    bool Find(HashNode *hashNode, int level, int key, int &value) const;
    void Delete(HashNode *hashNode, int level, int key);
};

HashTree::HashTree()
{
    root = new HashNode();
}

HashTree::~HashTree()
{
    Destroy(root);
}

HashTree::HashTree(HashTree &&other) noexcept
    : root(other.root)
{
    other.root = nullptr;
}

HashTree &HashTree::operator=(HashTree &&other) noexcept
{
    if (this != &other)
    {
        Destroy(root);
        root = other.root;
        other.root = nullptr;
    }
    return *this;
}

int HashTree::ChildIndex(int level, int key)
{
    const int levels = static_cast<int>(sizeof(p) / sizeof(p[0]));
    // Levels past the end of p keep using the last modulus instead of
    // reading beyond the table; colliding keys then simply form a chain.
    const int modulus = p[level < levels ? level : levels - 1];
    int index = key % modulus;
    if (index < 0)
    {
        // In C++ the remainder of a negative key is negative; shift it back
        // into [0, modulus) so it is a valid slot.
        index += modulus;
    }
    return index;
}

void HashTree::Destroy(HashNode *hashNode)
{
    if (hashNode == nullptr)
    {
        return;
    }
    // Recursion depth equals the depth of the tree.
    for (int i = 0; i < SIZE; ++i)
    {
        Destroy(hashNode->child[i]);
    }
    delete hashNode;
}

void HashTree::InsertNode(int key, int value)
{
    if (root == nullptr)
    {
        // Only a moved-from tree has no root; make it usable again.
        root = new HashNode();
    }
    Insert(root, 0, key, value);
}

// Inserts (key, value) below hashNode, which sits at depth `level`.
// The whole path is searched first so that an existing entry for key is
// updated in place; otherwise a cleared node higher up could receive a second
// copy and the old one would outlive a later Delete.
void HashTree::Insert(HashNode *hashNode, int level, int key, int value)
{
    HashNode *firstFree = nullptr;   // shallowest unoccupied node on the path
    HashNode *node = hashNode;
    for (;;)
    {
        if (node->occupied)
        {
            if (node->m_key == key)
            {
                node->m_value = value;
                return;
            }
        }
        else if (firstFree == nullptr)
        {
            firstFree = node;
        }
        const int index = ChildIndex(level, key);
        if (node->child[index] == nullptr)
        {
            // End of the path: the key is not in the tree.
            if (firstFree == nullptr)
            {
                node->child[index] = new HashNode();
                firstFree = node->child[index];
            }
            break;
        }
        node = node->child[index];
        ++level;
    }
    firstFree->m_key = key;
    firstFree->m_value = value;
    firstFree->occupied = true;
}

bool HashTree::FindNode(int key, int &value) const
{
    return Find(root, 0, key, value);
}

// Walks the path of `key` starting at hashNode (depth `level`) and reports
// the first occupied node holding that key.
bool HashTree::Find(HashNode *hashNode, int level, int key, int &value) const
{
    for (HashNode *node = hashNode; node != nullptr; ++level)
    {
        if (node->occupied && node->m_key == key)
        {
            value = node->m_value;
            return true;
        }
        node = node->child[ChildIndex(level, key)];
    }
    return false;
}

void HashTree::DeleteNode(int key)
{
    Delete(root, 0, key);
}

// Clears the occupied flag of the node holding `key`, if any. The node itself
// is kept because other keys may be stored further down the same path.
void HashTree::Delete(HashNode *hashNode, int level, int key)
{
    for (HashNode *node = hashNode; node != nullptr; ++level)
    {
        if (node->occupied && node->m_key == key)
        {
            node->occupied = false;
            return;
        }
        node = node->child[ChildIndex(level, key)];
    }
}
// Apriori join step: builds the candidate (k+1)-itemsets from the frequent
// k-itemsets in Lv.
//
// Two itemsets Lv[i] and Lv[j] (i < j) are joined when they have the same
// length k and agree on their first k-1 items; the candidate is Lv[i]
// followed by the last item of Lv[j]. For k == 1 the shared prefix is empty,
// so every pair of single items is joined. Candidates are returned in the
// order the pairs (i, j) are visited.
//
// Each itemset is expected to be sorted ascending. Empty itemsets and pairs
// of different length cannot be joined and are skipped.
std::vector<std::vector<int>> Connect(std::vector<std::vector<int>> Lv)
{
    std::vector<std::vector<int>> Ck;
    for (std::size_t i = 0; i < Lv.size(); ++i)
    {
        const std::vector<int> &left = Lv[i];
        if (left.empty())
        {
            continue;
        }
        for (std::size_t j = i + 1; j < Lv.size(); ++j)
        {
            const std::vector<int> &right = Lv[j];
            if (right.size() != left.size())
            {
                continue;
            }
            // Compare the first k-1 items directly; no temporary needed.
            if (!std::equal(left.begin(), left.end() - 1, right.begin()))
            {
                continue;
            }
            std::vector<int> candidate(left);
            candidate.push_back(right.back());
            Ck.push_back(candidate);
        }
    }
    return Ck;
}
int main()
{
    HashTree Hash[90000];
    HashTree HashC1;
    vector<int> VC1;
    ifstream file;
    file.open("retail.dat");
    if(!file)
        cout<<"error"<<endl;
    string line;
    int TreeNum=0,value=0;
    while(getline(file,line))   //按行读取,遇到换行符结束
    {
        int temp,flag;
        stringstream ss(line);
        while(ss>>temp)    //每次读取改行的一个数字
        {
            Hash[TreeNum].InsertNode(temp,1);//每条交易记录对应一个HashTree
            if(TreeNum==0)
            {
                HashC1.InsertNode(temp,1);
                VC1.push_back(temp);
            }
            if(TreeNum>0)
            {
                if(HashC1.FindNode(temp,value))
                {
                    HashC1.DeleteNode(temp);
                    HashC1.InsertNode(temp,value+1);
                }
                else
                {
                    HashC1.InsertNode(temp,1);
                    VC1.push_back(temp);
                }
            }
        }
        TreeNum++;
    }
    float flag;
    //printf("请输入最小支持度：");
    //cin>>minsup;
    flag=minsup*TreeNum;
    vector<int> VL1;
    vector<int> s1;
    for(int i=0; i<VC1.size(); i++)
    {
        HashC1.FindNode(VC1[i],value);
        if(value>=flag)
        {
            VL1.push_back(VC1[i]);//用向量保存L1
            value=0;
        }
    }
    sort(VL1.begin(),VL1.end());//从小到大进行排序
    int um=0;
    for(int n:VL1)
    {
        HashC1.FindNode(n,um);
        s1.push_back(um);
    }
    vector<vector<vector<int>>> result;//三维向量，保存结果的所有频繁项集
    vector<vector<int>> second; //二维临时向量
    vector<int> first; //一维临时向量
    int temp=0;
    for(int n:VL1)
    {
        first.push_back(n);
        second.push_back(first);
        first.clear();
    }
    result.push_back(second);
    vector<vector<int>> Ck;
    vector<vector<int>> Lk;
    int k=0;
    int Count;
    int n,length;
    vector<vector<int>> Sup;
    Sup.push_back(s1);
    vector<int> s;
    while(result[k].size()>1)
    {
        Ck=Connect(result[k]);//二维向量
        for(vector<int> temp:Ck)
        {
            Count=0;
            length=temp.size();
            for(int num=0; num<TreeNum; num++)
            {
                for(n=0; n<length; n++)
                {
                    if(!Hash[num].FindNode(temp[n],value))
                        break;
                }
                if(n>=length)
                    Count++;
            }
            if(Count>=flag)
            {
                Lk.push_back(temp);//保存满足条件的Lk频繁项集
                s.push_back(Count);
            }
        }
        Sup.push_back(s);
        result.push_back(Lk);
        k++;
        s.clear();
        Lk.clear();
        Ck.clear();
    }
    //打印挖掘到的所有频繁项集
    int num=0;
    for(int i=0; i<result.size(); i++)
    {
        if(result[i].size()>=1)
        {
            printf("频繁%d项集：\n",i+1);
            for(int j=0; j<result[i].size(); j++)
            {
                int z;
                num++;
                printf("[{");
                for(z=0; z<result[i][j].size(); z++)
                {
                    if(z==result[i][j].size()-1)
                        printf("%d",result[i][j][z]);
                    else
                        printf("%d,",result[i][j][z]);
                }
                printf("}");
                printf(":%d] ",Sup[i][j]);
            }
            printf("\n");
        }
    }
    printf("一共有%d个频繁项",num);
}