Apriori算法是R.Agrawal和R.Srikant于1994年提出的为布尔关联规则挖掘频繁项集的原创性算法。正如我们将看到的,算法的名字基于这样的事实:算法使用频繁项集性质的先验知识。Apriori使用一种称作逐层搜索的迭代方法,k项集用于探索(k+1)项集。首先,通过扫描数据库,累积每个项的计数,并收集满足最小支持度的项,找出频繁1项集的集合。该集合记作L1。然后,L1用于找频繁2项集的集合L2,L2用于找L3,如此下去,直到不能再找到频繁k项集。找每个Lk需要一次数据库全扫描。
图1
代码
#include "stdafx.h"
#include<iostream>
#include<fstream>
#include<vector>
#include<string>
#include<map>
using namespace std;
typedef struct Item// a frequent item made of a single word
{
string sItem;// the word itself
int iSupport;// support count recorded for the word
}ITEM;
typedef vector<string> VEC_STR;// list of strings (input lines, or the words of one itemset)
typedef vector<VEC_STR> VEC_VEC_STR;// list of itemsets that carry no support count yet
typedef struct MultiItem// a higher-level itemset (two or more words)
{
VEC_STR vsItem;// the words that make up the itemset
int iSupport;// support count recorded for the itemset
}MULTIITEM;
typedef vector<ITEM> VEC_ITEM;// collection of single-word frequent items
typedef vector<MULTIITEM> VEC_MULTIITEM;// collection of higher-level itemsets
typedef map<string, int> MAP_STR_INT;// maps each word to its occurrence count
// Forward declarations; the definitions follow main().
// Input and level-1 counting.
void readFile(ifstream &, const string &, VEC_STR &);
void countWord(VEC_STR *, MAP_STR_INT &, const char separator='\\');
void generateLevel1Set(MAP_STR_INT *, VEC_ITEM &);
// Candidate generation for level 2 and above.
void generateLevel2(VEC_ITEM *, VEC_MULTIITEM &);
void cycGenerator(VEC_MULTIITEM *, VEC_STR &, ofstream &);
void generateHighLevelSet(VEC_MULTIITEM *, VEC_MULTIITEM &, VEC_STR &);
void generateInitialHigh(VEC_MULTIITEM *, VEC_VEC_STR &);
void pruning(VEC_VEC_STR *, VEC_MULTIITEM *, VEC_MULTIITEM &);
bool find(VEC_MULTIITEM *, VEC_STR *);
// Support counting and filtering.
void countSupport(VEC_STR *, VEC_MULTIITEM &);
void generateFrequentSet(VEC_MULTIITEM *, VEC_MULTIITEM &);
// Output; both overloads write to std::cout unless another stream is given.
void printFrequentSet(VEC_ITEM *, ostream &os=cout);
void printFrequentSet(VEC_MULTIITEM *, ostream &os=cout);
const int MINSUPPORT = 2;// minimum support count an item or itemset needs to be kept as frequent
/**
 * Program entry point.
 *
 * Reads the transactions from "input.txt" (one per line), mines the frequent
 * itemsets level by level and writes them to "out.txt".
 *
 * @return 0 on success, 1 when the input file or the output file cannot be
 *         opened.
 */
int main()
{
    // Read the transactions from the source file.
    ifstream infile;
    VEC_STR vs_word;
    readFile(infile, "input.txt", vs_word);
    // readFile has already reported a failed open; without data there is
    // nothing to mine, so stop instead of running the pipeline on empty sets.
    const bool inputOpened = infile.is_open();
    infile.close();
    if (!inputOpened)
        return 1;

    // Count how often every word occurs.
    MAP_STR_INT word_count;
    countWord(&vs_word, word_count);

    // Frequent 1-itemsets.
    VEC_ITEM level1Set;
    generateLevel1Set(&word_count, level1Set);

    // Frequent 2-itemsets. A pair needs at least two frequent single items.
    VEC_MULTIITEM level2, level2Set;
    if (level1Set.size() > 1)
    {
        generateLevel2(&level1Set, level2);
        countSupport(&vs_word, level2);
        generateFrequentSet(&level2, level2Set);
    }

    // Write the frequent 1-itemsets to the output file.
    ofstream outfile;
    outfile.open("out.txt");
    if (!outfile)
    {
        cout<<"不能打开文件!"<<endl;
        return 1;
    }
    printFrequentSet(&level1Set, outfile);

    // Levels 2, 3, ... are produced and written by cycGenerator, which stops
    // by itself once a level comes back empty.
    cycGenerator(&level2Set, vs_word, outfile);
    cout<<"OK!"<<endl;
    return 0;
}
/**
 * Load a text file into a vector, one element per line.
 *
 * The stream is closed and its state flags are reset before it is re-opened
 * on `filename`. When the open fails a message is printed to standard output
 * and no lines are appended. The stream is left open for the caller.
 *
 * @param infile   stream used for reading
 * @param filename path of the file to read
 * @param vs_word  receives the lines, appended in file order
 */
void readFile(ifstream &infile, const string &filename, VEC_STR &vs_word)
{
    // Bring the stream into a known state before re-using it.
    infile.close();
    infile.clear();
    infile.open(filename.c_str());

    if (infile.fail())
        cout << "Unable to open this file!" << endl;

    for (string line; getline(infile, line); )
        vs_word.push_back(line);
}
/**
 * Compute the support count of every word.
 *
 * Each element of *vs_word is one transaction whose words are delimited by
 * `separator`. A word's support is the number of transactions that contain
 * it, so a word repeated inside one line is counted once for that line.
 * Empty tokens (blank lines, doubled or trailing separators) are not words
 * and are skipped.
 *
 * @param vs_word    transactions to scan
 * @param word_count receives word -> support; existing entries are incremented
 * @param separator  character that delimits the words of a transaction
 */
void countWord(std::vector<std::string> *vs_word, std::map<std::string, int> &word_count, const char separator)
{
    for (std::vector<std::string>::size_type i = 0; i < vs_word->size(); ++i)
    {
        const std::string &line = (*vs_word)[i];
        std::map<std::string, int> seen;      // words already counted for this line
        std::string::size_type begin = 0;     // start of the current token

        for (;;)
        {
            const std::string::size_type end = line.find(separator, begin);
            const std::string::size_type length =
                (end == std::string::npos) ? std::string::npos : end - begin;
            const std::string token = line.substr(begin, length);

            // Count a word only the first time it shows up in this line.
            if (!token.empty() && seen[token]++ == 0)
                ++word_count[token];

            if (end == std::string::npos)
                break;
            begin = end + 1;
        }
    }
}
/**
 * Build the collection of frequent 1-itemsets.
 *
 * Every word whose count reaches MINSUPPORT is appended to `level1Set`
 * together with that count, in the iteration order of the map.
 *
 * @param pWord_Count word -> count table
 * @param level1Set   receives the frequent single-word items
 */
void generateLevel1Set(MAP_STR_INT *pWord_Count, VEC_ITEM &level1Set)
{
    for (MAP_STR_INT::const_iterator entry = pWord_Count->begin();
         entry != pWord_Count->end(); ++entry)
    {
        if (entry->second < MINSUPPORT)
            continue;   // not frequent enough

        ITEM frequent;
        frequent.sItem = entry->first;
        frequent.iSupport = entry->second;
        level1Set.push_back(frequent);
    }
}
/**
 * Build the candidate 2-itemsets from the frequent 1-itemsets.
 *
 * Every unordered pair of distinct entries (i < j) of *pLevel1Set becomes one
 * candidate with a support of 0; the support is filled in later. With fewer
 * than two frequent items there are no pairs and nothing is appended.
 *
 * @param pLevel1Set    frequent single-word items
 * @param initialLevel2 receives the candidate pairs
 */
void generateLevel2(VEC_ITEM *pLevel1Set, VEC_MULTIITEM &initialLevel2)
{
    const VEC_ITEM::size_type itemCount = pLevel1Set->size();

    // "i + 1 < itemCount" instead of "i < itemCount - 1": the latter wraps
    // around for an empty input because the size is unsigned.
    for (VEC_ITEM::size_type i = 0; i + 1 < itemCount; ++i)
    {
        for (VEC_ITEM::size_type j = i + 1; j < itemCount; ++j)
        {
            MULTIITEM candidate;
            candidate.vsItem.push_back((*pLevel1Set)[i].sItem);
            candidate.vsItem.push_back((*pLevel1Set)[j].sItem);
            candidate.iSupport = 0;
            initialLevel2.push_back(candidate);
        }
    }
}
/**
 * Generate the frequent itemsets level by level and write each level out.
 *
 * The given level is printed first. After that, as long as the current level
 * is not empty, the next level is derived from it and printed as well, so the
 * final (empty) level is printed too.
 *
 * @param pLowLevelSet starting level of frequent itemsets
 * @param vs_word      transactions used for support counting
 * @param os           output file stream
 */
void cycGenerator(VEC_MULTIITEM *pLowLevelSet, VEC_STR &vs_word, ofstream &os)
{
    printFrequentSet(pLowLevelSet, os);

    VEC_MULTIITEM currentLevel;   // owns every level after the caller's one
    VEC_MULTIITEM nextLevel;      // scratch buffer for the level being built

    for (VEC_MULTIITEM *level = pLowLevelSet; !level->empty(); level = &currentLevel)
    {
        nextLevel.clear();
        generateHighLevelSet(level, nextLevel, vs_word);
        currentLevel.swap(nextLevel);
        printFrequentSet(&currentLevel, os);
    }
}
/**
 * Derive the next level of frequent itemsets from the current one.
 *
 * Runs the four stages in order: candidate generation, pruning, support
 * counting, and filtering by minimum support.
 *
 * @param pLowLevelSet frequent itemsets of the current level
 * @param highLevelSet receives the frequent itemsets of the next level
 * @param vs_word      transactions used for support counting
 */
void generateHighLevelSet(VEC_MULTIITEM *pLowLevelSet, VEC_MULTIITEM &highLevelSet, VEC_STR &vs_word)
{
    // 1. Combine itemsets of the current level into raw candidates.
    VEC_VEC_STR rawCandidates;
    generateInitialHigh(pLowLevelSet, rawCandidates);

    // 2. Prune the raw candidates against the current level.
    VEC_MULTIITEM candidates;
    pruning(&rawCandidates, pLowLevelSet, candidates);

    // 3. Count the support of the survivors, then keep the frequent ones.
    countSupport(&vs_word, candidates);
    generateFrequentSet(&candidates, highLevelSet);
}
/**
 * Join step: build the raw candidates of the next level.
 *
 * Two itemsets of *pLowLevelSet (i < j) are joined when they have the same
 * length and agree on every element except the last. The candidate is the
 * first itemset followed by the last element of the second one.
 *
 * An empty input set, empty itemsets and pairs of different length produce
 * no candidates.
 *
 * @param pLowLevelSet frequent itemsets of the current level
 * @param highLevelSet receives the joined candidates
 */
void generateInitialHigh(VEC_MULTIITEM *pLowLevelSet, VEC_VEC_STR &highLevelSet)
{
    const VEC_MULTIITEM::size_type setCount = pLowLevelSet->size();

    // "i + 1 < setCount" stays correct for an empty set, where the unsigned
    // "setCount - 1" would wrap around.
    for (VEC_MULTIITEM::size_type i = 0; i + 1 < setCount; ++i)
    {
        const VEC_STR &left = (*pLowLevelSet)[i].vsItem;
        if (left.empty())
            continue;
        const VEC_STR::size_type last = left.size() - 1;

        for (VEC_MULTIITEM::size_type j = i + 1; j < setCount; ++j)
        {
            const VEC_STR &right = (*pLowLevelSet)[j].vsItem;
            if (right.size() != left.size())
                continue;

            // Length of the common prefix, looking at all but the last element.
            VEC_STR::size_type k = 0;
            while (k < last && left[k] == right[k])
                ++k;
            if (k != last)
                continue;

            VEC_STR joined(left);
            joined.push_back(right[last]);
            highLevelSet.push_back(joined);
        }
    }
}
/**
 * Prune step: drop candidates that have an infrequent subset.
 *
 * For every candidate, each subset obtained by leaving out exactly one
 * element is looked up in *pLowLevelSet with find(). The candidate is kept,
 * with a support of 0, only when all of these subsets are found.
 *
 * @param pInitialSet  raw candidates from the join step
 * @param pLowLevelSet frequent itemsets of the previous level
 * @param prunedSet    receives the surviving candidates
 */
void pruning(VEC_VEC_STR *pInitialSet, VEC_MULTIITEM *pLowLevelSet, VEC_MULTIITEM &prunedSet)
{
    for (VEC_VEC_STR::size_type i = 0; i < pInitialSet->size(); ++i)
    {
        const VEC_STR &candidate = (*pInitialSet)[i];
        const VEC_STR::size_type itemCount = candidate.size();

        bool allSubsetsFrequent = true;
        for (VEC_STR::size_type skip = 0; skip < itemCount && allSubsetsFrequent; ++skip)
        {
            // Start from an empty subset every time; re-using one buffer
            // without clearing it would keep testing only the first subset.
            VEC_STR subset;
            for (VEC_STR::size_type k = 0; k < itemCount; ++k)
                if (k != skip)
                    subset.push_back(candidate[k]);

            allSubsetsFrequent = find(pLowLevelSet, &subset);
        }

        if (allSubsetsFrequent)
        {
            MULTIITEM kept;
            kept.vsItem = candidate;
            kept.iSupport = 0;
            prunedSet.push_back(kept);
        }
    }
}
/**
 * Look up a subset of a candidate among the lower-level frequent itemsets.
 *
 * A stored itemset matches when each of its elements equals the element at
 * the same position of *pSubSet. Only as many elements as the stored itemset
 * has are compared. A stored itemset that is longer than *pSubSet can never
 * match and is skipped, so *pSubSet is never read past its end.
 *
 * @param pLowSet lower-level frequent itemsets to search
 * @param pSubSet itemset to look for
 * @return true when some stored itemset matches, false otherwise
 */
bool find(VEC_MULTIITEM *pLowSet, VEC_STR *pSubSet)
{
    for (VEC_MULTIITEM::size_type i = 0; i < pLowSet->size(); ++i)
    {
        const VEC_STR &stored = (*pLowSet)[i].vsItem;
        if (pSubSet->size() < stored.size())
            continue;

        VEC_STR::size_type j = 0;
        while (j < stored.size() && stored[j] == (*pSubSet)[j])
            ++j;
        if (j == stored.size())
            return true;
    }
    return false;
}
/**计算生成的初始频繁项集中各项的支持度
*/
void countSupport(VEC_STR *pVs_Word, VEC_MULTIITEM &initialSet)
{
int flag;
for(unsigned int i=0; i<pVs_Word->size(); ++i)
for(unsigned int j=0; j<initialSet.size(); ++j)
{
flag =1;
for(unsigned int k=0; k<initialSet[j].vsItem.size(); ++k)
{
if((*pVs_Word)[i].find(initialSet[j].vsItem[k], 0) == -1)
{
flag = 0;
break;
}
}
if(flag ==1)
++initialSet[j].iSupport;
}
}
/**
 * Keep only the candidates that are frequent.
 *
 * Every entry of *pInitialSet whose support reaches MINSUPPORT is appended
 * to `frequentSet`, in the original order.
 *
 * @param pInitialSet candidates with their support already counted
 * @param frequentSet receives the frequent itemsets
 */
void generateFrequentSet(VEC_MULTIITEM *pInitialSet, VEC_MULTIITEM &frequentSet)
{
    VEC_MULTIITEM::size_type index = 0;
    while (index < pInitialSet->size())
    {
        const MULTIITEM &candidate = (*pInitialSet)[index];
        ++index;
        if (candidate.iSupport < MINSUPPORT)
            continue;
        frequentSet.push_back(candidate);
    }
}
/**
 * Print the frequent 1-itemsets.
 *
 * Writes one "<word>:<TAB><support>" line per item, followed by one empty
 * line. No header row is written.
 *
 * @param pLevel1Set items to print
 * @param os         destination stream
 */
void printFrequentSet(VEC_ITEM *pLevel1Set, ostream &os)
{
    VEC_ITEM::const_iterator item = pLevel1Set->begin();
    const VEC_ITEM::const_iterator stop = pLevel1Set->end();
    while (item != stop)
    {
        os << item->sItem << ":\t" << item->iSupport << endl;
        ++item;
    }
    os << endl;
}
/**
 * Print higher-level frequent itemsets.
 *
 * Writes one line per itemset: its words joined by "&", then ":<TAB>" and
 * the support. One empty line follows the last itemset. An itemset without
 * any words prints as ":<TAB><support>".
 *
 * @param pFrequentSet itemsets to print
 * @param os           destination stream
 */
void printFrequentSet(VEC_MULTIITEM *pFrequentSet, ostream &os)
{
    for (VEC_MULTIITEM::size_type i = 0; i < pFrequentSet->size(); ++i)
    {
        const MULTIITEM &entry = (*pFrequentSet)[i];

        // Put the "&" in front of every word but the first; this needs no
        // "size() - 1", which would wrap around for an empty itemset.
        for (VEC_STR::size_type j = 0; j < entry.vsItem.size(); ++j)
        {
            if (j != 0)
                os << "&";
            os << entry.vsItem[j];
        }
        os << ":\t" << entry.iSupport << endl;
    }
    os << endl;
}
Apriori算法