// 备注:
// 1、若需要进行置信度判定,请继续完善 getLk0() 函数。
// 2、此代码的优势在于能够处理上万条数量级的数据,数据量越大优势越明显;若数据量太小则不适合使用,建议直接采用暴力枚举,反而可以省去大量数据整理的时间。
#include <iostream>
#include <algorithm>
#include <fstream>
#include <cstring>
#include <sstream>
#include <vector>
#include <map>
#include <set>
using namespace std;
// Frequent-itemset miner: reads whitespace-separated transactions from a text
// file and derives frequent itemsets level by level (Apriori).
class Apriori {
public:
    // Remembers the input path and the minimum support. getL1() interprets a
    // support below 1 as a fraction of the number of lines read.
    Apriori(std::string FileName, float MinSup)
        : FileName(FileName), minSup(MinSup) {}

    // Reads the input file into Database and mp.
    bool buildData();
    // Candidate 1-itemsets: item -> number of transactions containing it.
    std::map<std::string, int> getC1();
    // Frequent 1-itemsets: single-item set -> its count.
    std::map<std::set<std::string>, int> getL1();
    // Collects the keys (itemsets) of a frequent-itemset map.
    std::set<std::set<std::string>> keySet(std::map<std::set<std::string>, int> &Lk);
    // Builds candidate (m+1)-itemsets from the m-itemsets passed in.
    std::set<std::set<std::string>> aprioriGen(int m, std::set<std::set<std::string>> &);
    // Frequent k-itemsets, support computed from the per-item line index (mp).
    std::map<std::set<std::string>, int> getLk(int k, std::set<std::set<std::string>>);
    // Frequent k-itemsets, support computed by scanning Database.
    std::map<std::set<std::string>, int> getLk0(int k, std::set<std::set<std::string>>);
    // Prints the itemsets to standard output and returns how many there are.
    int printsetSet(std::set<std::set<std::string>> &);

private:
    std::string FileName;                            // path of the transaction file
    float minSup;                                    // minimum support threshold
    std::map<long, std::set<std::string>> Database;  // line index -> items on that line
    std::map<std::string, std::set<int>> mp;         // item -> indices of lines containing it

    // Set difference in place: removes the elements of set2 from set1.
    void sub(std::set<std::string> &set1, std::set<std::string> &set2);
    // Set union: returns set1 combined with set2.
    std::set<std::string> add(std::set<std::string> &set1, std::set<std::string> &set2);
};
// Set by Apriori::buildData() to the number of data lines read from the input
// file; Apriori::getL1() uses it to scale a fractional minimum support.
int line_num;
// Prints every itemset in setSet to standard output, one itemset per line,
// with each item followed by a single space.
// Returns the number of itemsets printed.
int Apriori::printsetSet(std::set<std::set<std::string>> &setSet) {
    int printed = 0;
    for (const std::set<std::string> &itemset : setSet) {
        for (const std::string &item : itemset) {
            std::cout << item << " ";
        }
        std::cout << std::endl;
        ++printed;
    }
    return printed;
}
// Loads the transaction file named by FileName into the in-memory database.
//
// The first line of the file is discarded. Every following line is one
// transaction: it is split on whitespace and each kept token is recorded in
// Database (line index -> items) and in mp (item -> line indices). A line
// that yields no kept token creates no Database entry but still counts
// towards line_num.
//
// Returns true when the file was read, false when it could not be opened
// (an error message is printed in that case and no state is modified).
bool Apriori::buildData() {
    std::ifstream inFile(FileName.c_str());
    if (!inFile) {
        std::cout << "文件" << FileName << "打开错误,请检查" << std::endl;
        return false;
    }

    std::string textline;
    std::getline(inFile, textline);  // skip the leading line of the file

    // Lines are processed as they are read, so the whole file never has to be
    // buffered in memory.
    int row = 0;
    while (std::getline(inFile, textline)) {
        std::istringstream fields(textline);
        std::string word;
        while (fields >> word) {
            // Only tokens longer than one character are kept.
            // NOTE(review): this also drops one-character item labels such as
            // "1" or "a" -- confirm that is intended for the input data.
            if (word.size() > 1) {
                Database[row].insert(word);
                mp[word].insert(row);
            }
        }
        ++row;
    }
    line_num = row;  // number of data lines, used later to scale a fractional support

    std::cout << "项目总数: " << Database.size() << std::endl;
    return true;
}
// Builds the candidate 1-itemsets: maps every item found in Database to the
// number of transactions that contain it.
std::map<std::string, int> Apriori::getC1() {
    std::map<std::string, int> occurrences;
    for (const auto &transaction : Database) {
        for (const std::string &item : transaction.second) {
            // operator[] starts a new item at 0, so the first sighting yields 1.
            ++occurrences[item];
        }
    }
    return occurrences;
}
// Builds the frequent 1-itemsets: every item whose transaction count reaches
// the minimum support, keyed by a one-element set and mapped to that count.
//
// A minSup below 1 is treated as a fraction of line_num and converted, once,
// into an absolute count that is stored back into minSup. The count is
// rounded UP (a support of 0.5 over 5 lines requires 3 occurrences, not 2)
// and is never allowed to drop below 1, so that itemsets with zero support
// can never qualify as frequent.
std::map<std::set<std::string>, int> Apriori::getL1() {
    if (minSup < 1) {
        const float scaled = minSup * line_num;
        int threshold = 1;
        if (scaled > 1.0f) {
            threshold = static_cast<int>(scaled);
            // Small tolerance so float noise on an exact product (e.g. 0.3 * 10)
            // does not bump the threshold by one.
            if (scaled - static_cast<float>(threshold) > 1e-4f) {
                ++threshold;
            }
        }
        minSup = static_cast<float>(threshold);
    }

    std::map<std::set<std::string>, int> L1;
    const std::map<std::string, int> C1 = getC1();
    for (const auto &entry : C1) {
        if (entry.second >= minSup) {
            // Wrap the item in a set so that L1 has the same key type as Lk.
            std::set<std::string> key;
            key.insert(entry.first);
            L1[key] = entry.second;
        }
    }
    return L1;
}
// Returns the set of all itemsets that appear as keys of Lk (the counts are
// dropped).
std::set<std::set<std::string>> Apriori::keySet(std::map<std::set<std::string>, int> &Lk) {
    std::set<std::set<std::string>> itemsets;
    for (const auto &entry : Lk) {
        itemsets.insert(entry.first);
    }
    return itemsets;
}
// Set difference in place: removes every element of set2 from set1.
// Stops as soon as set1 has become empty.
void Apriori::sub(std::set<std::string> &set1, std::set<std::string> &set2) {
    std::set<std::string>::const_iterator cursor = set2.begin();
    while (cursor != set2.end()) {
        set1.erase(*cursor);
        if (set1.empty()) {
            return;  // nothing left to remove
        }
        ++cursor;
    }
}
// Set union: returns a new set holding every element of set1 and set2.
// Neither argument is modified.
std::set<std::string> Apriori::add(std::set<std::string> &set1, std::set<std::string> &set2) {
    std::set<std::string> merged = set1;
    merged.insert(set2.begin(), set2.end());
    return merged;
}
// Returns true when exactly one element of `a` is missing from `b`,
// i.e. when the set difference a - b has size 1. Neither set is modified.
bool fun1(std::set<std::string> &a, std::set<std::string> &b) {
    int onlyInA = 0;
    for (const std::string &item : a) {
        if (b.count(item) == 0) {
            ++onlyInA;
        }
    }
    return onlyInA == 1;
}
// Generates the candidate itemsets Ck from the itemsets in Lk0.
//
// Every unordered pair of itemsets in Lk0 is examined once. When the first
// has exactly one element the second lacks (fun1), the two are united; the
// union becomes a candidate if it has m+1 elements and is not itself a member
// of Lk0. Prints a completion marker to standard output before returning.
std::set<std::set<std::string>> Apriori::aprioriGen(int m, std::set<std::set<std::string>> &Lk0) {
    std::set<std::set<std::string>> Ck;
    for (auto first = Lk0.begin(); first != Lk0.end(); ++first) {
        // Copies are required: fun1() and add() take non-const references,
        // while elements of a std::set are const.
        std::set<std::string> left = *first;
        auto second = first;
        for (++second; second != Lk0.end(); ++second) {
            std::set<std::string> right = *second;
            if (!fun1(left, right)) {
                continue;
            }
            std::set<std::string> candidate = add(left, right);
            if (static_cast<int>(candidate.size()) == m + 1 && Lk0.count(candidate) == 0) {
                Ck.insert(candidate);
            }
        }
    }
    std::cout << "完成" << std::endl;
    return Ck;
}
// Returns the intersection of two sets of integers. Neither input is modified.
std::set<int> intersec(std::set<int> &a, std::set<int> &b) {
    std::set<int> common;
    std::set<int>::const_iterator left = a.begin();
    std::set<int>::const_iterator right = b.begin();
    // Both sets iterate in ascending order, so a single merge pass finds the
    // shared elements.
    while (left != a.end() && right != b.end()) {
        if (*left < *right) {
            ++left;
        } else if (*right < *left) {
            ++right;
        } else {
            common.insert(common.end(), *left);
            ++left;
            ++right;
        }
    }
    return common;
}
//根据频繁k-1项集键集,获取频繁k项集
map< set<string>, int > Apriori::getLk(int k, set<set<string> > Lk0) {
map< set<string>,int> Lk,Ck;
set< set<string> > CkSet = aprioriGen(k-1, Lk0); //Ck的set-string,再集合成set
for(set< set<string> >::iterator i=CkSet.begin(); i!=CkSet.end(); i++){ //每一个set<string>
string head = *(*i).begin();
set<int> temp = mp[head];
for(set<string>::iterator j = ++(*i).begin(); j!=(*i).end(); j++){
set<int> t = mp[*j]; //j是set<string>里的每一个string,找到对应的int
temp=intersec(temp,t);
}
if(temp.size()>=minSup)
Lk[*i]=(int)temp.size();
}
return Lk;
}
// Computes the frequent k-itemsets from the frequent (k-1)-itemsets in Lk0 by
// scanning the transaction database.
//
// Candidates come from aprioriGen(k-1, Lk0). Each transaction in Database
// adds one to the count of every candidate it fully contains. Candidates
// whose count reaches minSup are returned together with that count;
// candidates contained in no transaction never appear in the result.
std::map<std::set<std::string>, int> Apriori::getLk0(int k, std::set<std::set<std::string>> Lk0) {
    const std::set<std::set<std::string>> candidates = aprioriGen(k - 1, Lk0);

    std::map<std::set<std::string>, int> support;
    for (const auto &transaction : Database) {
        const std::set<std::string> &items = transaction.second;
        for (const std::set<std::string> &candidate : candidates) {
            // Both sets are sorted with the same ordering, so std::includes
            // tests "candidate is a subset of items" without copying either.
            if (std::includes(items.begin(), items.end(), candidate.begin(), candidate.end())) {
                ++support[candidate];
            }
        }
    }

    std::map<std::set<std::string>, int> Lk;
    for (const auto &entry : support) {
        if (entry.second >= minSup) {
            Lk[entry.first] = entry.second;
        }
    }
    return Lk;
}
// Entry point: asks for the minimum support, mines "test.txt" level by level
// until a level yields no frequent itemsets, then prints every level to
// standard output and writes the same report to "res.txt".
// Returns 0 on completion and 1 when the minimum support cannot be read.
int main() {
    float min_sup = 0;
    std::cout << "请输入最小支持数/度:";
    // Without this check a failed read could leave min_sup unset or zero and
    // the mining would run with a meaningless threshold.
    if (!(std::cin >> min_sup)) {
        std::cerr << "输入无效,请输入一个数字" << std::endl;
        return 1;
    }

    Apriori apriori("test.txt", min_sup);
    // buildData() prints its own message when the file cannot be opened; the
    // run then continues on an empty database and reports empty results.
    apriori.buildData();

    std::map<std::set<std::string>, int> L1 = apriori.getL1();
    std::set<std::set<std::string>> Set = apriori.keySet(L1);  // itemsets of the current level

    std::map<int, std::set<std::set<std::string>>> L;  // level k -> frequent k-itemsets
    L.insert(std::make_pair(1, Set));
    for (int k = 2;; ++k) {
        std::cout << "k=" << k << " ";
        // apriori.getLk0(k, Set) is the database-scan alternative to getLk().
        std::map<std::set<std::string>, int> setLk = apriori.getLk(k, Set);
        if (setLk.empty()) {
            break;
        }
        Set = apriori.keySet(setLk);
        L.insert(std::make_pair(k, Set));
    }

    std::ofstream fcout("res.txt");
    if (!fcout) {
        // Best effort: the console report is still produced.
        std::cerr << "无法打开输出文件res.txt,结果仅输出到屏幕" << std::endl;
    }

    int cnt = 0;
    for (auto &level : L) {
        std::cout << "频繁" << level.first << "项集: " << std::endl;
        fcout << "频繁" << level.first << "项集: " << std::endl;

        // Console copy (also yields the number of itemsets on this level).
        const int n = apriori.printsetSet(level.second);
        // File copy, same layout as the console output.
        for (const std::set<std::string> &itemset : level.second) {
            for (const std::string &item : itemset) {
                fcout << item << " ";
            }
            fcout << std::endl;
        }

        std::cout << "n=" << n << std::endl;
        fcout << "n=" << n << std::endl;
        cnt += n;
    }
    std::cout << "总数=" << cnt << std::endl;
    fcout << "总数=" << cnt << std::endl;
    return 0;
}
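/*
测试输入 test.txt 的内容(buildData() 会跳过文件的第一行,因此文件开头需预留一行空行,防止文件标识被当作数据读入;以下数据从第二行开始)。
注意:buildData() 只保留长度大于 1 的词,下面这些单字符的项会被忽略;若要用本示例测试,请改用多字符的项名或调整该长度判断。
1 2 3 5
3 4 5
2 5 6 8
3 4
2 8 3 2 1 9
*/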