Apriori 算法的 C++ 代码实现

备注:1、若需要置信度判定,请继续完善getLk0()函数

2、此代码的优势是能处理上万条数量级的数据,数据量越大优势越明显;若数据量太小则不合适,建议直接写暴力解法,反而可以省去大量数据整理的时间。

#include <iostream>
#include <algorithm>
#include <fstream>
#include <cstring>
#include <sstream>
#include <vector>
#include <map>
#include <set>
using namespace std;

// Apriori frequent-itemset miner working on a whitespace-separated
// transaction file (one transaction per line).
class Apriori{
private:
    string FileName;                    // path of the transaction file opened by buildData()
    float minSup;                       // minimum support; values below 1 are rescaled to a count in getL1()
    map< long, set<string> > Database;  // line index -> distinct items found on that line
    map< string,set<int> > mp;          // item -> indices of all lines containing it
    // Erases the elements of set2 from set1 (set1 is modified in place).
    void sub(set<string> &set1, set<string> &set2);
    // Returns the union of set1 and set2.
    set<string> add(set<string> &set1, set<string> &set2);
public:
    // Stores the input file name and the minimum support; no I/O happens here.
    Apriori(string FileName, float MinSup)
        : FileName(FileName), minSup(MinSup) {}
    // Reads the input file and fills Database and mp.
    bool buildData();
    // Candidate 1-itemsets: item -> number of transactions containing it.
    map< string, int> getC1();
    // Frequent 1-itemsets: singleton itemset -> support count.
    map< set<string>, int > getL1();
    // Collects the keys (itemsets) of Lk into a set.
    set< set<string>> keySet(map<set<string>,int> &Lk);
    // Joins the m-itemsets of the given collection into candidate (m+1)-itemsets.
    set< set<string> > aprioriGen(int m, set< set<string> > &);
    // Frequent k-itemsets, support computed by intersecting the per-item line sets in mp.
    map< set<string>, int > getLk(int k, set<set<string>>);
    // Frequent k-itemsets, support computed by scanning every transaction in Database.
    map< set<string>, int > getLk0(int k, set<set<string>>);
    // Prints every itemset to stdout and returns how many were printed.
    int printsetSet(set< set<string> > &);
};

// Number of transaction lines processed by Apriori::buildData(); Apriori::getL1()
// multiplies a fractional minimum support by this value to obtain a count.
int line_num;

// Prints every itemset of setSet to stdout, one itemset per line with each
// item followed by a single space, and returns the number of itemsets printed.
int Apriori::printsetSet(set<set<string>> &setSet) {
    int printed = 0;
    for(const set<string> &itemset : setSet){
        for(const string &item : itemset)
            cout << item << " ";
        cout << endl;
        ++printed;
    }
    return printed;
}

// Loads the transaction file into memory.
//
// The first line of the file is read and discarded (it is reserved for a
// leading file marker). Every following line is one transaction: its
// whitespace-separated tokens are stored in Database[line index], and the
// line index is recorded in mp[token]. The global line_num ends up equal to
// the number of transaction lines processed.
//
// Returns true when the file was opened and read, false when it could not be
// opened. (The previous version returned EXIT_FAILURE / EXIT_SUCCESS from this
// bool function, which yielded true on failure and false on success.)
bool Apriori::buildData() {
    ifstream inFile(FileName.c_str());
    if(!inFile){
        cout<<"文件"<<FileName<<"打开错误,请检查"<<endl;
        return false;
    }
    string textline;
    getline(inFile,textline);   // skip the first line of the file

    line_num = 0;
    while(getline(inFile,textline)){
        istringstream line(textline);
        string word;
        while(line >> word){
            // NOTE(review): tokens of length 1 are discarded here; confirm this
            // is intended, since single-character item names are never counted.
            if(word.size()>1){
                Database[line_num].insert(word);
                mp[word].insert(line_num);
            }
        }
        ++line_num;
    }
    cout<<"项目总数: "<<Database.size()<<endl;
    return true;
}

// Builds the candidate 1-itemsets: maps every distinct item to the number of
// transactions in Database that contain it.
map<string, int> Apriori::getC1() {
    map<string,int> counts;
    for(const auto &row : Database)
        for(const string &item : row.second)
            ++counts[item];     // a new key starts at 0, so the first hit yields 1
    return counts;
}

// Builds the frequent 1-itemsets: singleton itemset -> support count, keeping
// only items whose count reaches minSup.
// Side effect: a minSup below 1 is treated as a ratio and replaced by
// minSup * line_num truncated towards zero.
map<set<string>, int> Apriori::getL1() {
    if(minSup<1){
        minSup *= line_num;
        minSup = static_cast<float>(static_cast<int>(minSup));
    }
    const map<string, int> candidates = getC1();
    map<set<string>, int> frequent;
    for(const auto &entry : candidates){
        if(entry.second >= minSup){
            // wrap the item in a one-element set so it can key the result map
            set<string> single;
            single.insert(entry.first);
            frequent[single] = entry.second;
        }
    }
    return frequent;
}

// Returns the set of all keys (itemsets) stored in Lk, dropping the counts.
set< set<string> > Apriori::keySet(map<set<string>,int> &Lk) {
    set< set<string> > keys;
    for(const auto &entry : Lk)
        keys.insert(entry.first);
    return keys;
}

// Set difference in place: removes every element of set2 from set1.
// Stops as soon as set1 becomes empty.
void Apriori::sub(set<string> &set1, set<string> &set2) {
    for(const string &item : set2){
        set1.erase(item);
        if(set1.empty())
            return;
    }
}

// Set union: returns a new set holding the elements of set1 and set2.
set<string> Apriori::add(set<string> &set1, set<string> &set2) {
    set<string> merged(set1);
    merged.insert(set2.begin(), set2.end());
    return merged;
}

// Returns true when exactly one element of a is absent from b, i.e. when the
// set difference a - b has size 1. Neither argument is modified.
bool fun1(set<string> &a,set<string> &b){
    size_t onlyInA = 0;
    for(const string &item : a)
        if(b.count(item) == 0)
            ++onlyInA;
    return onlyInA == 1;
}

// Generates candidate (m+1)-itemsets from the itemsets in Lk0.
// Every unordered pair of itemsets whose first member has exactly one element
// missing from the second (see fun1) is merged; the union is kept when it has
// m+1 elements and is not itself a member of Lk0.
// Prints a completion message to stdout before returning.
set< set<string> > Apriori::aprioriGen(int m, set<set<string>> &Lk0) {
    set< set<string> > candidates;
    for(auto first = Lk0.begin(); first != Lk0.end(); ++first){
        set<string> left = *first;
        auto second = first;
        for(++second; second != Lk0.end(); ++second){
            set<string> right = *second;
            if(!fun1(left, right))
                continue;
            set<string> merged = add(left, right);
            if(merged.size() == m+1 && Lk0.count(merged) == 0)
                candidates.insert(merged);
        }
    }
    cout<<"完成"<<endl;
    return candidates;
}

// Returns the intersection of two integer sets. Neither argument is modified.
set<int> intersec(set<int> &a,set<int> &b){
    set<int> common;
    set<int>::iterator pa = a.begin();
    set<int>::iterator pb = b.begin();
    // both sets iterate in ascending order, so advance whichever side is smaller
    while(pa != a.end() && pb != b.end()){
        if(*pa < *pb){
            ++pa;
        }else if(*pb < *pa){
            ++pb;
        }else{
            common.insert(common.end(), *pa);
            ++pa;
            ++pb;
        }
    }
    return common;
}

//根据频繁k-1项集键集,获取频繁k项集
map< set<string>, int > Apriori::getLk(int k, set<set<string> > Lk0) {
    map< set<string>,int> Lk,Ck;
    set< set<string> > CkSet = aprioriGen(k-1, Lk0);    //Ck的set-string,再集合成set
    for(set< set<string> >::iterator i=CkSet.begin(); i!=CkSet.end(); i++){ //每一个set<string>
        string head = *(*i).begin();
        set<int> temp = mp[head];
        for(set<string>::iterator j = ++(*i).begin(); j!=(*i).end(); j++){
            set<int> t = mp[*j];    //j是set<string>里的每一个string,找到对应的int
            temp=intersec(temp,t);
        }
        if(temp.size()>=minSup)
            Lk[*i]=(int)temp.size();
    }
    return Lk;
}

// Computes the frequent k-itemsets from the frequent (k-1)-itemsets Lk0 by
// scanning the whole database.
// Candidates come from aprioriGen(k-1, Lk0). Each transaction in Database adds
// one to the count of every candidate it fully contains; candidates whose
// count reaches minSup are returned with that count. Candidates contained in
// no transaction never enter the result.
map< set<string>, int > Apriori::getLk0(int k, set< set<string> > Lk0) {
    set< set<string> > candidates = aprioriGen(k-1, Lk0);
    map< set<string>,int> support;
    for(const auto &row : Database){
        const set<string> &items = row.second;
        for(const set<string> &cand : candidates){
            // the candidate is contained when none of its items is missing from the transaction
            if(includes(items.begin(), items.end(), cand.begin(), cand.end()))
                ++support[cand];
        }
    }
    map< set<string>,int> frequent;
    for(const auto &entry : support)
        if(entry.second >= minSup)
            frequent[entry.first] = entry.second;
    return frequent;
}

// Driver: reads the minimum support from stdin, mines test.txt level by level
// and writes every frequent itemset to stdout and to res.txt.
// Returns 1 when the minimum support is not a positive number, 0 otherwise.
int main() {
    float min_sup;
    cout<<"请输入最小支持数/度:";
    // A failed read leaves 0 in min_sup, and a threshold of 0 would make every
    // itemset "frequent", so reject anything that is not a positive number.
    if(!(cin>>min_sup) || !(min_sup > 0)){
        cerr<<"最小支持数/度输入无效,必须为正数"<<endl;
        return 1;
    }
    Apriori apriori("test.txt", min_sup);

    // buildData() reports an unreadable file itself; the database then stays
    // empty and the loops below simply produce no itemsets.
    apriori.buildData();
    map<set<string>,int> L1 = apriori.getL1();
    set<set<string>> Set = apriori.keySet(L1); // keys of the frequent 1-itemsets
    map<int,set<set<string>> > L;              // level k -> frequent k-itemsets
    L.insert(make_pair(1,Set));

    // Grow the level until no frequent itemset is found.
    for(int k=2; ;k++){
        cout<<"k="<<k<<" ";
        // getLk0(k,Set) is the alternative that counts support by scanning the database
        map<set<string>,int> setLk = apriori.getLk(k,Set);
        if(setLk.empty())
            break;
        Set = apriori.keySet(setLk);
        L.insert(make_pair(k,Set));
    }

    ofstream fcout("res.txt");
    if(!fcout)
        cerr<<"无法打开输出文件 res.txt,结果仅输出到屏幕"<<endl;

    int cnt=0;
    for(map<int, set<set<string>>>::iterator it = L.begin(); it!=L.end(); ++it){
        cout<<"频繁"<<it->first <<"项集: "<<endl;
        fcout<<"频繁"<<it->first <<"项集: "<<endl;
        int n = apriori.printsetSet(it->second);    // console copy
        for(const set<string> &itemset : it->second){ // file copy, same layout
            for(const string &item : itemset)
                fcout<< item <<" ";
            fcout<<endl;
        }
        cout<<"n="<<n<<endl;
        fcout<<"n="<<n<<endl;
        cnt += n;
    }
    cout<<"总数="<<cnt<<endl;
    fcout<<"总数="<<cnt<<endl;
    return 0;
}

测试输入 test.txt 的内容(首行预留为空行,因为 buildData() 会先读取并丢弃文件的第一行,用于消除文件开头的标识)。注意:buildData() 只保留长度大于 1 的词,下面示例中的单字符项会被全部忽略,实际数据中每个项的名称长度需大于 1:


1 2 3 5
3 4 5
2 5 6 8
3 4 
2 8 3 2 1 9

评论 7
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值